In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import TweedieRegressor
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locations of the training and test CSV files, relative to the working directory.
train_file = './data/train.csv'
test_file = './data/test.csv'
# Module-level frames; the exploratory cells further down read trainDF/testDF directly.
trainDF = pd.read_csv(train_file)
testDF = pd.read_csv(test_file)
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [3]:
def doExperiment(trainInput, trainOutput, predictors, alg=LinearRegression()):
    """Return the mean 10-fold cross-validated R^2 score of `alg`.

    Only the `predictors` columns of `trainInput` are used as features and
    `trainOutput` is the target. Folds are evaluated with `n_jobs=-1`.
    """
    features = trainInput.loc[:, predictors]
    foldScores = model_selection.cross_val_score(
        alg,
        features,
        trainOutput,
        cv=10,
        scoring='r2',
        n_jobs=-1,
    )
    return foldScores.mean()

In [4]:
def doKaggleTest(trainInput, testInput, trainOutput, testIDs, predictors):
    """Fit a LinearRegression on all training data and write a Kaggle submission.

    The model is trained on the `predictors` columns of `trainInput`, used to
    predict the same columns of `testInput`, and the result is saved to
    'data/testResults.csv' with the columns "Id" and "SalePrice".
    """
    # Train on the complete training set, then predict the test set.
    model = LinearRegression()
    model.fit(trainInput.loc[:, predictors], trainOutput)
    predictedPrices = model.predict(testInput.loc[:, predictors])

    # Keep only the two columns Kaggle expects in a submission.
    submission = pd.DataFrame({"Id": testIDs, "SalePrice": predictedPrices})

    # This CSV file can be uploaded to Kaggle as-is.
    submission.to_csv('data/testResults.csv', index=False)
    # Now, this .csv file can be uploaded to Kaggle

In [5]:
# path = '/Users/moon/Downloads/test.txt'
# ls = []
# with open(path, 'r') as f:
#     for line in f:
#         data = line.split(' ')
#         if data[0] != 'def':
#             continue
#         a = line.replace('def ', '')
#         call1 = a.replace('targetDF', 'trainDF').replace('sourceDF', 'originalTrainDF').replace(':', '')
#         call2 = a.replace('targetDF', 'testDF').replace('sourceDF', 'originalTrainDF').replace(':', '')
#         ls.append(call1)
#         ls.append(call2)
#         ls.append('\n')
# s = '\n'.join(ls)
# print(s, sep='')

In [6]:
def transformData(trainDF, testDF):
    """Preprocess both frames in place and return the pieces needed for modelling.

    Both `trainDF` and `testDF` are mutated: every column is preprocessed using
    statistics taken from an untouched copy of the training data, and
    `deletePoints` then removes rows from the training frame only.

    Returns:
        (trainInput, testInput, trainOutput, testIDs, predictors) where the
        inputs hold only the predictor columns, `trainOutput` is 'SalePrice'
        and `testIDs` is the test frame's 'Id' column.
    """
    # 'YearBuilt' is removed by preDropColumns, so selecting it raised a
    # KeyError. preYearBuilt stores its replacement as 'YearsOld', which is a
    # linear function of 'YearBuilt'.
    predictors = ['1stFlrSF', '2ndFlrSF', 'YearsOld']

    # Snapshot taken before any mutation so that train and test are both
    # transformed with the same (training) statistics.
    originalTrainDF = trainDF.copy()
    preprocessAllColumns(trainDF, originalTrainDF)
    preprocessAllColumns(testDF, originalTrainDF)
    deletePoints(trainDF)

    trainInput = trainDF.loc[:, predictors]
    testInput = testDF.loc[:, predictors]

    trainOutput = trainDF.loc[:, 'SalePrice']
    testIDs = testDF.loc[:, 'Id']

    return trainInput, testInput, trainOutput, testIDs, predictors

In [7]:
def preprocessAllColumns(targetDF, sourceDF):
    """Run every per-attribute preprocessing step on `targetDF`, in place.

    Each step receives `targetDF` (the frame to modify) and `sourceDF` (the
    frame that statistics such as medians are taken from). After all steps
    have run, `preDropColumns` removes the columns listed there.
    """
    # The order below matches the original call sequence.
    steps = (
        preMSSubClass,
        preMSZoning,
        preLotFrontage,
        preStreet,
        preLotShape,
        preLandContour,
        preUtilities,
        preLotConfit,
        preLandSlope,
        preNeighborhood,
        preConditions,
        preBldgType,
        preHouseStyle,
        preRoofStyle,
        preYearBuilt,
        preRoofMatl,
        preExteriors,
        preMasVnrType,
        preMasVnrArea,
        preExterQual,
        preExterCond,
        preFoundation,
        preBsmtQual,
        preBsmtCond,
        preBsmtExposure,
        preBsmtFinType1,
        preBsmtFinType2,
        preBsmtFinSFs,
        preHeatingQC,
        preCentralAir,
        preElectrical,
        preKitchenQual,
        preFunctional,
        preFireplaceQu,
        preGarageType,
        preGarageYrBlt,
        preGarageFinish,
        preGarageQual,
        preGarageCond,
        prePavedDrive,
        prePoolQC,
        preFence,
        preSaleType,
        preSaleCondition,
    )
    for step in steps:
        step(targetDF, sourceDF)

    preDropColumns(targetDF)

In [8]:
def preMSSubClass(targetDF, sourceDF):
    """Add one indicator column per 'MSSubClass' value to `targetDF`, in place.

    The original 'MSSubClass' column is kept. `sourceDF` is not used.
    """
    dummyDF = pd.get_dummies(targetDF, columns=['MSSubClass'])
    # Preserve get_dummies' column order. A set difference would order the new
    # columns by string hash, which changes from one interpreter run to the next.
    newColumns = [column for column in dummyDF.columns if column not in targetDF.columns]
    targetDF.loc[:, newColumns] = dummyDF.loc[:, newColumns]

    
def preMSZoning(targetDF, sourceDF):
    """Add one indicator column per 'MSZoning' value to `targetDF`, in place.

    The original 'MSZoning' column is kept here. `sourceDF` is not used.
    """
    dummyDF = pd.get_dummies(targetDF, columns=['MSZoning'])
    # Preserve get_dummies' column order. A set difference would order the new
    # columns by string hash, which changes from one interpreter run to the next.
    newColumns = [column for column in dummyDF.columns if column not in targetDF.columns]
    targetDF.loc[:, newColumns] = dummyDF.loc[:, newColumns]

    
def preLotFrontage(targetDF, sourceDF):
    """Fill missing 'LotFrontage' values in `targetDF` with the median from `sourceDF`."""
    medianFrontage = sourceDF.loc[:, 'LotFrontage'].median()
    filled = targetDF.loc[:, 'LotFrontage'].fillna(medianFrontage)
    targetDF.loc[:, 'LotFrontage'] = filled
    
    
def preStreet(targetDF, sourceDF):
    """Add 0/1 indicator columns 'Street_Grvl' and 'Street_Pave' to `targetDF`.

    Any value other than the matching surface name (NaN included) yields 0.
    `sourceDF` is not used.
    """
    for surface in ('Grvl', 'Pave'):
        flags = targetDF.loc[:, 'Street'].map(lambda v, wanted=surface: 1 if v == wanted else 0)
        targetDF.loc[:, 'Street_' + surface] = flags
    

def preLotShape(targetDF, sourceDF):
    """Ordinal-encode 'LotShape' in place: Reg=3, IR1=2, IR2=1, IR3=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0}
    encoded = targetDF.loc[:, 'LotShape'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'LotShape'] = encoded
    assert targetDF.loc[:, 'LotShape'].map(np.isreal).all()
    
    
def preLandContour(targetDF, sourceDF):
    """Ordinal-encode 'LandContour' in place: Lvl=3, Bnk=2, HLS=1, Low=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {'Lvl': 3, 'Bnk': 2, 'HLS': 1, 'Low': 0}
    encoded = targetDF.loc[:, 'LandContour'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'LandContour'] = encoded
    assert targetDF.loc[:, 'LandContour'].map(np.isreal).all()
    
    
def preUtilities(targetDF, sourceDF):
    """Ordinal-encode 'Utilities' in place: AllPub=3, NoSewr=2, NoSeWa=1, ELO=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0}
    encoded = targetDF.loc[:, 'Utilities'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'Utilities'] = encoded
    assert targetDF.loc[:, 'Utilities'].map(np.isreal).all()
    
    
def preLotConfit(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preLandSlope(targetDF, sourceDF):
    """Ordinal-encode 'LandSlope' in place: Gtl=2, Mod=1, Sev=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {'Gtl': 2, 'Mod': 1, 'Sev': 0}
    encoded = targetDF.loc[:, 'LandSlope'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'LandSlope'] = encoded
    assert targetDF.loc[:, 'LandSlope'].map(np.isreal).all()
    

def preNeighborhood(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preConditions(targetDF, sourceDF):
    """Add a 0/1 'NoisyArea' column: 1 wherever 'Condition1' is anything but "Norm".

    `sourceDF` is not used.
    """
    condition = targetDF.loc[:, 'Condition1']
    targetDF.loc[:, 'NoisyArea'] = condition.map(lambda v: 0 if v == "Norm" else 1)
    

def preBldgType(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preHouseStyle(targetDF, sourceDF):
    """Encode 'HouseStyle' in place as a numeric score.

    1Story=1, 1.5Unf=1.5, 1.5Fin=2, SFoyer=2, SLvl=2, 2Story=3, 2.5Unf=3.5,
    2.5Fin=4. Values not in the table (NaN included) are left as they are.
    `sourceDF` is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {
        '1Story': 1,
        '1.5Fin': 2,
        '1.5Unf': 1.5,
        '2Story': 3,
        '2.5Fin': 4,
        '2.5Unf': 3.5,
        'SFoyer': 2,
        'SLvl': 2,
    }
    encoded = targetDF.loc[:, 'HouseStyle'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'HouseStyle'] = encoded
    assert targetDF.loc[:, 'HouseStyle'].map(np.isreal).all()

    
def preYearBuilt(targetDF, sourceDF):
    """Add 'YearsOld': the largest 'YearBuilt' in `sourceDF` minus each row's 'YearBuilt'."""
    yearsOld = diffFromMax(targetDF, sourceDF, 'YearBuilt')
    targetDF.loc[:, 'YearsOld'] = yearsOld
    
    
def preRoofStyle(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preRoofMatl(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preExteriors(targetDF, sourceDF):
    """Add one indicator column per 'Exterior1st' value to `targetDF`, in place.

    The previous implementation rebound the local name `targetDF` to the
    result of `pd.get_dummies` (and passed 'Exterior1st' as the `prefix`
    argument rather than as `columns`), so the caller's frame was never
    modified. This version follows the pattern used by preMSZoning.
    The original 'Exterior1st' column is kept. `sourceDF` is not used.
    """
    dummyDF = pd.get_dummies(targetDF, columns=['Exterior1st'])
    # Keep get_dummies' column order so the resulting layout is reproducible.
    newColumns = [column for column in dummyDF.columns if column not in targetDF.columns]
    targetDF.loc[:, newColumns] = dummyDF.loc[:, newColumns]


def preMasVnrType(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preMasVnrArea(targetDF, sourceDF):
    """Fill missing 'MasVnrArea' values in `targetDF` with the median from `sourceDF`."""
    medianArea = sourceDF.loc[:, 'MasVnrArea'].median()
    filled = targetDF.loc[:, 'MasVnrArea'].fillna(medianArea)
    targetDF.loc[:, 'MasVnrArea'] = filled


def preExterQual(targetDF, sourceDF):
    """Ordinal-encode 'ExterQual' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    encoded = targetDF.loc[:, 'ExterQual'].map(lambda v: grades.get(v, v))
    targetDF.loc[:, 'ExterQual'] = encoded
    assert targetDF.loc[:, 'ExterQual'].map(np.isreal).all()

    
def preExterCond(targetDF, sourceDF):
    """Ordinal-encode 'ExterCond' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    encoded = targetDF.loc[:, 'ExterCond'].map(lambda v: grades.get(v, v))
    targetDF.loc[:, 'ExterCond'] = encoded
    assert targetDF.loc[:, 'ExterCond'].map(np.isreal).all()

    
def preFoundation(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preBsmtQual(targetDF, sourceDF):
    """Ordinal-encode 'BsmtQual' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Missing values become 0 (the same score as 'Po'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'BsmtQual'
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()


def preBsmtCond(targetDF, sourceDF):
    """Ordinal-encode 'BsmtCond' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Missing values become 0 (the same score as 'Po'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'BsmtCond'
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()
    

def preBsmtExposure(targetDF, sourceDF):
    """Ordinal-encode 'BsmtExposure' in place: Gd=3, Av=2, Mn=1, No=0.

    Missing values become 0 (the same score as 'No'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'BsmtExposure'
    levels = {'Gd': 3, 'Av': 2, 'Mn': 1, 'No': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: levels.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()
    

def preBsmtFinType1(targetDF, sourceDF):
    """Ordinal-encode 'BsmtFinType1' in place: GLQ=5, ALQ=4, BLQ=3, Rec=2, LwQ=1, Unf=0.

    Missing values become 0 (the same score as 'Unf'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'BsmtFinType1'
    ratings = {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: ratings.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()


def preBsmtFinType2(targetDF, sourceDF):
    """Ordinal-encode 'BsmtFinType2' in place: GLQ=5, ALQ=4, BLQ=3, Rec=2, LwQ=1, Unf=0.

    Missing values become 0 (the same score as 'Unf'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'BsmtFinType2'
    ratings = {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: ratings.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()


def preBsmtFinSFs(targetDF, sourceDF):
    """Add 'BsmtFinSFSum', the row-wise sum of 'BsmtFinSF1' and 'BsmtFinSF2'.

    `sourceDF` is not used.
    """
    firstArea = targetDF.loc[:, 'BsmtFinSF1']
    secondArea = targetDF.loc[:, 'BsmtFinSF2']
    targetDF.loc[:, 'BsmtFinSFSum'] = firstArea.add(secondArea)
    

def preHeatingQC(targetDF, sourceDF):
    """Ordinal-encode 'HeatingQC' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    encoded = targetDF.loc[:, 'HeatingQC'].map(lambda v: grades.get(v, v))
    targetDF.loc[:, 'HeatingQC'] = encoded
    assert targetDF.loc[:, 'HeatingQC'].map(np.isreal).all()


def preCentralAir(targetDF, sourceDF):
    """Encode 'CentralAir' in place as Y=1, N=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    flags = {'Y': 1, 'N': 0}
    encoded = targetDF.loc[:, 'CentralAir'].map(lambda v: flags.get(v, v))
    targetDF.loc[:, 'CentralAir'] = encoded
    assert targetDF.loc[:, 'CentralAir'].map(np.isreal).all()
    

def preElectrical(targetDF, sourceDF):
    """Ordinal-encode 'Electrical' in place: SBrkr=3, FuseA=2, FuseF=1, FuseP=0, Mix=0.

    Values not in the table (NaN included) are left as they are, and no check
    is made afterwards. `sourceDF` is not used.
    """
    scores = {'SBrkr': 3, 'FuseA': 2, 'FuseF': 1, 'FuseP': 0, 'Mix': 0}
    encoded = targetDF.loc[:, 'Electrical'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'Electrical'] = encoded

    
def preKitchenQual(targetDF, sourceDF):
    """Ordinal-encode 'KitchenQual' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    encoded = targetDF.loc[:, 'KitchenQual'].map(lambda v: grades.get(v, v))
    targetDF.loc[:, 'KitchenQual'] = encoded
    assert targetDF.loc[:, 'KitchenQual'].map(np.isreal).all()
    
    
def preFunctional(targetDF, sourceDF):
    """Encode 'Functional' in place: Typ=0, Min1=1, Min2=2, Mod=3, Maj1=4, Maj2=5, Sev=6, Sal=7.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    scores = {
        'Sal': 7,
        'Sev': 6,
        'Maj2': 5,
        'Maj1': 4,
        'Mod': 3,
        'Min2': 2,
        'Min1': 1,
        'Typ': 0,
    }
    encoded = targetDF.loc[:, 'Functional'].map(lambda v: scores.get(v, v))
    targetDF.loc[:, 'Functional'] = encoded
    assert targetDF.loc[:, 'Functional'].map(np.isreal).all()

def preFireplaceQu(targetDF, sourceDF):
    """Ordinal-encode 'FireplaceQu' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Missing values become 0 (the same score as 'Po'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'FireplaceQu'
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()


def preGarageYrBlt(targetDF, sourceDF):
    """Fill missing 'GarageYrBlt' values in `targetDF` with the median from `sourceDF`."""
    medianYear = sourceDF.loc[:, 'GarageYrBlt'].median()
    filled = targetDF.loc[:, 'GarageYrBlt'].fillna(medianYear)
    targetDF.loc[:, 'GarageYrBlt'] = filled
    
    
def preGarageType(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preGarageFinish(targetDF, sourceDF):
    """Ordinal-encode 'GarageFinish' in place: Fin=2, RFn=1, Unf=0.

    Missing values become 0 (the same score as 'Unf'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'GarageFinish'
    levels = {'Fin': 2, 'RFn': 1, 'Unf': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: levels.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()
    
    
def preGarageQual(targetDF, sourceDF):
    """Ordinal-encode 'GarageQual' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Missing values become 0 (the same score as 'Po'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'GarageQual'
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()
    

def preGarageCond(targetDF, sourceDF):
    """Ordinal-encode 'GarageCond' in place: Ex=4, Gd=3, TA=2, Fa=1, Po=0.

    Missing values become 0 (the same score as 'Po'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains.
    """
    column = 'GarageCond'
    grades = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()
    

def prePavedDrive(targetDF, sourceDF):
    """Ordinal-encode 'PavedDrive' in place: Y=2, P=1, N=0.

    Values not in the table (NaN included) are left as they are. `sourceDF`
    is not used. Raises AssertionError if a non-numeric value remains.
    """
    levels = {'Y': 2, 'P': 1, 'N': 0}
    encoded = targetDF.loc[:, 'PavedDrive'].map(lambda v: levels.get(v, v))
    targetDF.loc[:, 'PavedDrive'] = encoded
    assert targetDF.loc[:, 'PavedDrive'].map(np.isreal).all()
    
    
def prePoolQC(targetDF, sourceDF):
    """Ordinal-encode 'PoolQC' in place: Ex=3, Gd=2, TA=1, Fa=0.

    Missing values become 0 (the same score as 'Fa'). Other values not in the
    table are left as they are. `sourceDF` is not used. Raises AssertionError
    if a non-numeric value remains in 'PoolQC'.
    """
    column = 'PoolQC'
    grades = {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: grades.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    # The check previously inspected 'GarageCond' (copy/paste slip), so a bad
    # 'PoolQC' value went unnoticed; validate the column encoded here instead.
    assert targetDF.loc[:, column].map(np.isreal).all()


def preFence(targetDF, sourceDF):
    """Ordinal-encode 'Fence' in place: GdPrv=3, MnPrv=2, GdWo=1, MnWw=0.

    Missing values become 0 (the same score as 'MnWw'). Other values not in
    the table are left as they are. `sourceDF` is not used. Raises
    AssertionError if a non-numeric value remains.
    """
    column = 'Fence'
    levels = {'GdPrv': 3, 'MnPrv': 2, 'GdWo': 1, 'MnWw': 0}
    targetDF.loc[:, column] = targetDF.loc[:, column].map(lambda v: levels.get(v, v))
    targetDF.loc[:, column] = targetDF.loc[:, column].fillna(0)
    assert targetDF.loc[:, column].map(np.isreal).all()


def preSaleType(targetDF, sourceDF):
    """No-op placeholder step in the preprocessing pipeline; both frames are left untouched."""
    return None


def preSaleCondition(targetDF, sourceDF):
    """Add one indicator column per 'SaleCondition' value to `targetDF`, in place.

    The original 'SaleCondition' column is kept here. `sourceDF` is not used.
    """
    dummyDF = pd.get_dummies(targetDF, columns=['SaleCondition'])
    # Preserve get_dummies' column order. A set difference would order the new
    # columns by string hash, which changes from one interpreter run to the next.
    newColumns = [column for column in dummyDF.columns if column not in targetDF.columns]
    targetDF.loc[:, newColumns] = dummyDF.loc[:, newColumns]
    

def preDropColumns(targetDF):
    """Drop, in place, columns that were replaced by derived ones or are mostly missing.

    Raises KeyError if any of the listed columns is absent (for example when
    the function is applied twice to the same frame).
    """
    supersededCols = ['MSZoning', 'YearBuilt', 'SaleCondition']
    sparseCols = ['Alley', 'PoolQC', 'MiscFeature', 'FireplaceQu', 'LotFrontage']
    targetDF.drop(labels=supersededCols + sparseCols, axis='columns', inplace=True)

In [9]:
def deletePoints(df):
    """Remove unwanted observations from `df` in place; returns None.

    Two kinds of rows are dropped:
      1. rows whose 'Electrical' value is missing;
      2. of the remaining rows, the two with the largest 'GrLivArea'.
    """
    # Delete observations with Electrical missing
    missingElectrical = df.index[df.loc[:, 'Electrical'].isna()]
    df.drop(missingElectrical, inplace=True)
    # Delete the two largest observations in terms of GrLivArea.
    # (The result of an in-place drop is None, so it must not be assigned back.)
    GrLivArea_outliers = df.sort_values(by='GrLivArea', ascending=False)[:2].index
    df.drop(GrLivArea_outliers, inplace=True)

In [10]:
def standardize(targetDF, sourceDF, cols=None):
    """Return z-scores of `targetDF` using the mean and standard deviation of `sourceDF`.

    Args:
        targetDF: frame whose values are standardized (not modified).
        sourceDF: frame the mean and (sample) standard deviation are taken from.
        cols: optional column label or collection of labels (list, pandas
            Index, numpy array, ...). When None or empty, all columns are used.

    Returns:
        A new object holding (target - mean) / std for the selected columns.
    """
    # `if cols:` raised "truth value is ambiguous" for an Index or ndarray, so
    # test for None/emptiness explicitly. An empty selection still means "all
    # columns", exactly as an empty list did before.
    useSubset = cols is not None and len(cols) > 0
    if useSubset:
        sourcePart = sourceDF.loc[:, cols]
        targetPart = targetDF.loc[:, cols]
    else:
        sourcePart = sourceDF.loc[:, :]
        targetPart = targetDF.loc[:, :]
    mean = sourcePart.mean()
    std = sourcePart.std()
    return (targetPart - mean) / std

In [11]:
def diffFromMax(targetDF, sourceDF, cols):
    """Return how far each `targetDF` value in `cols` lies below the maximum found in `sourceDF`."""
    ceiling = sourceDF.loc[:, cols].max()
    offset = targetDF.loc[:, cols] - ceiling
    return -offset

In [12]:
def demonstrateHelpers(trainDF):
    """Print an overview of `trainDF` using the helper functions in this cell.

    Printed, in order: attributes with missing values, numeric attributes,
    non-numeric attributes, and the distinct values of each non-numeric attribute.
    """
    missingAttrs = getAttrsWithMissingValues(trainDF)
    print("Attributes with missing values:", missingAttrs, sep='\n')

    numericAttrs = getNumericAttrs(trainDF)
    print("Numeric attributes:", numericAttrs, sep='\n')

    nonnumericAttrs = getNonNumericAttrs(trainDF)
    print("Non-numeric attributes:", nonnumericAttrs, sep='\n')

    nonnumericValues = getAttrToValuesDictionary(trainDF.loc[:, nonnumericAttrs])
    print("Values, for each non-numeric attribute:", nonnumericValues, sep='\n')

# ===============================================================================
def getAttrToValuesDictionary(df):
    """Return a dictionary mapping each attribute of `df` to the array of its distinct values."""
    return {attr: df.loc[:, attr].unique() for attr in df.columns.values}

# ===============================================================================
def getAttrsWithMissingValues(df):
    """Return the attributes (column labels) of `df` that contain missing values."""
    # df.count counts the non-missing entries per column, so subtracting it from
    # the number of rows gives the number of missing values per attribute.
    missingPerAttr = df.shape[0] - df.count(axis=0)
    hasMissing = missingPerAttr != 0
    return missingPerAttr[hasMissing].index

# =============================================================================

def getNumericAttrs(df):
    """Return the numeric attributes (column labels) of `df`."""
    return __getNumericHelper(df, True)


def getNonNumericAttrs(df):
    """Return the non-numeric attributes (column labels) of `df`."""
    return __getNumericHelper(df, False)


def __getNumericHelper(df, findNumeric):
    """Return the columns whose "every value is real" flag equals `findNumeric`.

    A column counts as numeric when np.isreal is True for each of its values
    (NaN is real, so missing values do not make a column non-numeric).
    """
    # DataFrame.applymap is deprecated in recent pandas releases; mapping each
    # column with Series.map yields the same True/False frame on every version.
    isNumeric = df.apply(lambda column: column.map(np.isreal))

    # all: for each column, whether every element is True
    isNumeric = isNumeric.all()
    # Keep the labels whose flag equals <findNumeric> (True or False)
    attrs = isNumeric.loc[isNumeric == findNumeric].index
    return attrs


In [13]:
def main():
    """Load the data, preprocess it and print the cross-validation score of the default model."""
    # Read the original data files
    rawTrain = pd.read_csv("data/train.csv")
    rawTest = pd.read_csv("data/test.csv")

    # Optional overview of the raw data:
    #     demonstrateHelpers(rawTrain)

    prepared = transformData(rawTrain, rawTest)
    trainInput, testInput, trainOutput, testIDs, predictors = prepared
    print("CV Average Score:", doExperiment(trainInput, trainOutput, predictors))
    # Optional Kaggle submission file:
    #     doKaggleTest(trainInput, testInput, trainOutput, testIDs, predictors)

In [None]:
# Run the pipeline: load, preprocess, and print the cross-validation score.
main()

In [None]:
def forwardSelection():
    """Greedy forward feature selection scored by cross-validated R^2.

    Starting from ['OverallQual'], each round tries every remaining numeric
    attribute, adds the one giving the best score seen so far, and prints the
    predictor list with that score. At most 15 rounds are run; selection stops
    early when no remaining attribute improves the score.
    """
    trainDF = pd.read_csv("data/train.csv")
    originalTrainDf = trainDF.copy()
    preprocessAllColumns(trainDF, originalTrainDf)

    possibleAttrs = list(getNumericAttrs(trainDF))
    possibleAttrs.remove('Id')
    possibleAttrs.remove('SalePrice')
    trainDF.loc[:, possibleAttrs] = standardize(trainDF, trainDF, possibleAttrs)

    predictors = ['OverallQual']
    possibleAttrs.remove('OverallQual')

    # Other models were tried here as well (Ridge, GradientBoostingRegressor).
    alg = TweedieRegressor(power=1, alpha=0.5, link='log')
    trainOutput = trainDF.loc[:, 'SalePrice']

    maxScore = -1
    for _ in range(15):
        # Reset the winner every round. Previously a round without improvement
        # re-appended the last winner and then failed with ValueError when
        # removing it from possibleAttrs a second time.
        bestAttr = None
        for attr in possibleAttrs:
            score = doExperiment(trainDF, trainOutput, predictors + [attr], alg)
            if score > maxScore:
                maxScore = score
                bestAttr = attr
        if bestAttr is None:
            print(f'No remaining attribute improves on {maxScore}; stopping.')
            break
        predictors.append(bestAttr)
        possibleAttrs.remove(bestAttr)
        print(f'{predictors}: {maxScore}')

In [None]:
# Run the forward feature selection; each round prints the chosen predictors and their score.
forwardSelection()

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def GBR():
    """Fit a gradient-boosting regressor on a hand-picked feature set and plot diagnostics.

    Steps: preprocess and standardize the training data, print the
    cross-validated R^2, fit on a 90/10 train/test split, print the test MSE,
    then plot the per-iteration train/test error, the impurity-based feature
    importances and the permutation importances on the test split.
    """
    trainDF = pd.read_csv("data/train.csv")
    originalTrainDf = trainDF.copy()
    preprocessAllColumns(trainDF, originalTrainDf)

    candidateAttrs = ['OverallQual', 'GrLivArea', '2ndFlrSF', 'BsmtFinSF1', 'YearsOld', 'LotArea', 'TotalBsmtSF', 'BsmtQual', 'OverallCond', 'GarageCars', 'FireplaceQu', 'KitchenQual', 'ScreenPorch']
    # preDropColumns removes some attributes (e.g. 'FireplaceQu'); selecting a
    # label that no longer exists raises KeyError, so keep only what is present.
    possibleAttrs = [attr for attr in candidateAttrs if attr in trainDF.columns]
    skippedAttrs = [attr for attr in candidateAttrs if attr not in trainDF.columns]
    if skippedAttrs:
        print(f"Skipping attributes removed by preprocessing: {skippedAttrs}")

    trainDF.loc[:, possibleAttrs] = standardize(trainDF, trainDF, possibleAttrs)
    trainInput = trainDF.loc[:, possibleAttrs]
    trainOutput = trainDF.loc[:, 'SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(trainInput, trainOutput, test_size=0.1, random_state=17)

    # 'loss' is left at its default (least squares): the spelling 'ls' is only
    # accepted by older scikit-learn releases, the default works on all of them.
    params = {'n_estimators': 500,
              'max_depth': 4,
              'min_samples_split': 5,
              'learning_rate': 0.01}
    alg = GradientBoostingRegressor(**params)

    score = doExperiment(trainInput, trainOutput, possibleAttrs, alg)
    print(f"Score is: {score}")

    alg.fit(X_train, y_train)
    mse = mean_squared_error(y_test, alg.predict(X_test))
    print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

    # Test error after each boosting stage. For least-squares loss this is the
    # mean squared error, which is what the removed `alg.loss_` callable returned.
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(alg.staged_predict(X_test)):
        test_score[i] = mean_squared_error(y_test, y_pred)

    iterations = np.arange(params['n_estimators']) + 1
    fig = plt.figure(figsize=(6, 6))
    plt.subplot(1, 1, 1)
    plt.title('Deviance')
    plt.plot(iterations, alg.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(iterations, test_score, 'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')
    fig.tight_layout()
    plt.show()

    # Left panel: impurity-based (MDI) importances, smallest at the bottom.
    feature_importance = alg.feature_importances_
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    fig = plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, np.array(possibleAttrs)[sorted_idx])
    plt.title('Feature Importance (MDI)')

    # Right panel: permutation importances measured on the held-out split.
    result = permutation_importance(alg, X_test, y_test, n_repeats=10,
                                    random_state=42, n_jobs=2)
    sorted_idx = result.importances_mean.argsort()
    plt.subplot(1, 2, 2)
    plt.boxplot(result.importances[sorted_idx].T,
                vert=False, labels=np.array(possibleAttrs)[sorted_idx])
    plt.title("Permutation Importance (test set)")
    fig.tight_layout()
    plt.show()

# GBR()

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def ensemble_selection():
    """Greedy forward feature selection using a gradient-boosting regressor.

    Starting from ['OverallQual'], each round tries every remaining numeric
    attribute, adds the one giving the best cross-validated R^2 seen so far,
    and prints the predictor list with that score. At most 15 rounds are run;
    selection stops early when no remaining attribute improves the score.
    """
    trainDF = pd.read_csv("data/train.csv")
    originalTrainDf = trainDF.copy()
    preprocessAllColumns(trainDF, originalTrainDf)

    possibleAttrs = list(getNumericAttrs(trainDF))
    possibleAttrs.remove('Id')
    possibleAttrs.remove('SalePrice')
    print(f'Candidate Attributes: {possibleAttrs}')
    # (A leftover breakpoint() used to pause here; removed so the cell runs unattended.)
    trainDF.loc[:, possibleAttrs] = standardize(trainDF, trainDF, possibleAttrs)

    # 'loss' is left at its default (least squares): the spelling 'ls' is only
    # accepted by older scikit-learn releases, the default works on all of them.
    params = {'n_estimators': 200,
              'max_depth': 4,
              'min_samples_split': 5,
              'learning_rate': 0.01}
    alg = GradientBoostingRegressor(**params)

    predictors = ['OverallQual']
    possibleAttrs.remove('OverallQual')
    trainOutput = trainDF.loc[:, 'SalePrice']

    maxScore = -1
    for _ in range(15):
        # Reset the winner every round. Previously a round without improvement
        # re-appended the last winner and then failed with ValueError when
        # removing it from possibleAttrs a second time.
        bestAttr = None
        for attr in possibleAttrs:
            score = doExperiment(trainDF, trainOutput, predictors + [attr], alg)
            if score > maxScore:
                maxScore = score
                bestAttr = attr
        if bestAttr is None:
            print(f'No remaining attribute improves on {maxScore}; stopping.')
            break
        predictors.append(bestAttr)
        possibleAttrs.remove(bestAttr)
        print(f'{predictors}: {maxScore}')


        # ensemble_selection()

In [None]:
# Show the first rows of the target column.
trainDF.loc[:, 'SalePrice'].head()

In [None]:
def plotHistOfSalePrice(df):
    """Print summary statistics of 'SalePrice' and plot its distribution with a fitted normal curve."""
    salePrices = df.loc[:, 'SalePrice']
    print(salePrices.describe())
    # salePrices.hist(bins=50)
    # fit=stats.norm overlays a normal distribution fitted to the data.
    sns.distplot(salePrices, fit=stats.norm)
# Plot the distribution for the module-level training frame.
plotHistOfSalePrice(trainDF)

In [None]:
# Box plot of SalePrice for each OverallQual value
var = 'OverallQual'
# Two-column frame holding just the target and the attribute being inspected.
data = pd.concat([trainDF['SalePrice'], trainDF[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
# NOTE: sns.boxplot returns the Axes it drew on, so `fig` is an Axes here.
fig = sns.boxplot(x=var, y="SalePrice", data=data)
# Fix the y-range; the trailing ';' suppresses the cell's text output.
fig.axis(ymin=0, ymax=800000);

In [None]:
def checkCorr(df, k=10):
    """Plot a heatmap of the correlations among the k attributes most correlated with 'SalePrice'.

    Attributes are ranked by absolute correlation with 'SalePrice' (which is
    itself one of the k); the heatmap cells show the signed coefficients.
    Only numeric (and boolean) columns take part; object columns that hold
    nothing but numbers are converted first.
    """
    # BEGIN: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
    # EXPLANATION: Visualization of correlation between SalePrice and k highly correlated elements
    # DataFrame.corr() raises on string columns in pandas >= 2.0 (older
    # versions silently skipped them), so select the numeric columns explicitly.
    numericDF = df.infer_objects().select_dtypes(include=['number', 'bool'])  # ADDED
    corrmat = numericDF.corr().abs()  # ADDED (.abs())
    color = plt.get_cmap('RdPu')  # ADDED

    f, ax = plt.subplots(figsize=(12, 9))
    cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
    # dtype=float keeps np.corrcoef working when boolean columns are selected.
    cm = np.corrcoef(numericDF[cols].to_numpy(dtype=float).T)
    sns.set(font_scale=1.0)
    hm = sns.heatmap(cm, cmap=color, cbar=True, annot=True, square=True,
                     fmt='.2f', annot_kws={'size': 10}, vmin=0.2, vmax=0.8,
                     yticklabels=cols.values, xticklabels=cols.values)
    plt.show()
    # END: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python

In [None]:
# Correlations in the raw training data.
checkCorr(trainDF, k=11)

# Preprocess a copy so that the module-level trainDF stays untouched, then
# look at the correlations again with the encoded/derived columns included.
pre_df = trainDF.copy()
preprocessAllColumns(pre_df, trainDF)
checkCorr(pre_df, k=16)

# std_df = standardize(pre_df, pre_df)
# checkCorr(std_df, k=11)

In [None]:
def pairplots(df):
    """Show a seaborn pair plot (scatter-plot matrix) of all columns in `df`."""
    sns.set()
    sns.pairplot(df, height = 2.5)
    plt.show()

In [None]:
# Pair plot of SalePrice with a hand-picked set of attributes from the preprocessed frame.
cols = ["SalePrice", "OverallQual", "GrLivArea", "ExterQual", "KitchenQual", "GarageCars", "BsmtQual", "1stFlrSF", "FullBath", "GarageFinish", "YearsOld"]
pairplots(pre_df.loc[:, cols])

In [None]:
def checkMissingValues(df):
    """Print, for the 20 columns with the most missing values, their count and fraction."""
    # BEGIN: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
    # EXPLANATION: Calculate the number and percentage of missing values and display them
    nullMask = df.isnull()
    total = nullMask.sum().sort_values(ascending=False)
    percent = (nullMask.sum() / nullMask.count()).sort_values(ascending=False)
    missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
    print(missing_data.head(20))
    # END: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
    

In [None]:
# Missing-value summary for the raw training data, the raw test data and the preprocessed copy.
checkMissingValues(trainDF)
checkMissingValues(testDF)
checkMissingValues(pre_df)

In [None]:
def checkNormality(df, col):
    """Plot the histogram of `df[col]` with a fitted normal curve, then its normal probability plot."""
    # BEGIN: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
    # EXPLANATION: Displays histogram and normal probability plot
    # `norm` alone is not defined in this notebook (only `stats` is imported
    # from scipy), so refer to it as stats.norm as plotHistOfSalePrice does.
    sns.distplot(df[col], fit=stats.norm)
    fig = plt.figure()
    res = stats.probplot(df[col], plot=plt)
    # END: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python

In [None]:
def logTransformation(df):
    """Add 'LogSalePrice' and 'LogGrLivArea' (natural logarithms) to `df` in place."""
    # BEGIN: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
    # EXPLANATION: Log transformation
    for column in ('SalePrice', 'GrLivArea'):
        df['Log' + column] = np.log(df[column])
    # END: from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python

In [None]:
# Normality check on the untransformed living area.
# checkNormality(trainDF, 'SalePrice')
checkNormality(trainDF, 'GrLivArea')

In [None]:
# Adds the log columns to the module-level trainDF (in place), then repeats the normality check.
logTransformation(trainDF)
# checkNormality(trainDF, 'LogSalePrice')
checkNormality(trainDF, 'LogGrLivArea')

In [None]:
# Scatter plot of the two log-transformed columns (requires the logTransformation cell to have run).
plt.scatter(trainDF['LogGrLivArea'], trainDF['LogSalePrice'])

In [None]:
def calcCorr(df, col1, col2):
    """Return the correlation between columns `col1` and `col2` of `df`."""
    first = df.loc[:, col1]
    second = df.loc[:, col2]
    return first.corr(second)

In [None]:
# Correlation between the two log-transformed columns (requires the logTransformation cell to have run).
calcCorr(trainDF, 'LogSalePrice', 'LogGrLivArea')