In [75]:
import pandas as pd

train = pd.read_csv('data/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Preprocess Data

### Find Columns with Missing Values

In [76]:
counts = train.count()
counts[counts < train.shape[0]]

LotFrontage     1201
Alley             91
MasVnrType      1452
MasVnrArea      1452
BsmtQual        1423
BsmtCond        1423
BsmtExposure    1422
BsmtFinType1    1423
BsmtFinType2    1422
Electrical      1459
FireplaceQu      770
GarageType      1379
GarageYrBlt     1379
GarageFinish    1379
GarageQual      1379
GarageCond      1379
PoolQC             7
Fence            281
MiscFeature       54
dtype: int64

In [77]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [78]:
from parse_description import get_column_mappings, ordinal_columns, nominal_columns, numerical_columns 
from sklearn.preprocessing import OneHotEncoder



In [79]:
from parse_description import get_column_mappings, ordinal_columns as o_columns, nominal_columns as nom_columns, numerical_columns as num_columns 
from sklearn.preprocessing import OneHotEncoder

def preprocess(data: pd.DataFrame, one_hot_encoder: OneHotEncoder = None) -> pd.DataFrame:
    new_data = data.copy()

    # Fix datatypes of columns
    numerical_to_string_columns = { "MSSubClass"}
    for column in numerical_to_string_columns:
        new_data[column] = new_data[column].astype(str)

    # Drop all columns that have too many missing values
    new_data.drop(["Alley", "PoolQC", "Fence", "MiscFeature"], axis=1, inplace=True)
    nominal_columns = nom_columns.difference({"Alley"})
    ordinal_columns = o_columns.difference({"Fence", "PoolQC", "MiscFeature"})
    numerical_columns = num_columns

    # Encode ordinal columns
    column_mappings = get_column_mappings()
    for column in ordinal_columns:
        if column not in new_data: continue
        new_data[column] = new_data[column].map(column_mappings[column])

    # Replace values with missing data with the most common value
    ## Nominal Values
    nominal_value_map = {column : new_data[column].mode()[0] for column in nominal_columns}
    new_data = new_data.fillna(nominal_value_map)

    ## Ordinal Values
    ordinal_value_map = {column:new_data[column].median() for column in ordinal_columns}
    new_data.fillna(ordinal_value_map, inplace=True)
    
    ## Interval/Ratio Values
    numerical_value_map = {column:new_data[column].mean() for column in numerical_columns}
    new_data.fillna(numerical_value_map, inplace=True)

    
    # Encode nominal values
    one_hot_encoding_threshold = 6
    columns_to_one_hot_encode = [column for column in nominal_columns if len(column_mappings[column]) <= one_hot_encoding_threshold]
    encoder = one_hot_encoder if one_hot_encoder != None else OneHotEncoder(handle_unknown = 'ignore')
    encoder.fit(train[columns_to_one_hot_encode])
    encoded_columns = pd.DataFrame(encoder.transform(new_data[columns_to_one_hot_encode]).toarray(), columns = encoder.get_feature_names())
    new_data = pd.concat([new_data, encoded_columns], axis=1)
    new_data.drop(columns_to_one_hot_encode, axis=1, inplace=True)
    
    columns_to_label_encode = nominal_columns.difference(columns_to_one_hot_encode)
    for column in columns_to_label_encode:
        new_data[column] = new_data[column].map(column_mappings[column])

    return new_data, encoder
preprocessed_train, encoder = preprocess(train)
preprocessed_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 116 entries, Id to x10_nan
dtypes: float64(63), int64(53)
memory usage: 1.3 MB




In [80]:

def split_x_y(data: pd.DataFrame, target):
    X = data.drop(target, axis=1)
    Y = data[target]
    return X, Y

target_column = "SalePrice"
split_x_y(preprocessed_train, target_column)

(        Id  MSSubClass  MSZoning  LotFrontage  LotArea  LotShape  Utilities  \
 0        1          10         2         65.0     8450         3          3   
 1        2          15         2         80.0     9600         3          3   
 2        3          10         2         68.0    11250         2          3   
 3        4           9         2         60.0     9550         2          3   
 4        5          10         2         84.0    14260         2          3   
 ...    ...         ...       ...          ...      ...       ...        ...   
 1455  1456          10         2         62.0     7917         3          3   
 1456  1457          15         2         85.0    13175         3          3   
 1457  1458           9         2         66.0     9042         3          3   
 1458  1459          15         2         68.0     9717         3          3   
 1459  1460          15         2         75.0     9937         3          3   
 
       LandSlope  Neighborhood  Condit

In [81]:
from sklearn.model_selection import train_test_split

def process_data(data: pd.DataFrame):
    data, encoder = preprocess(data)
    X, Y = split_x_y(data, target_column)
    X_train, X_validate, y_train, y_validate = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train, X_validate, y_train, y_validate

X_train, X_validate, y_train, y_validate = process_data(train)



In [82]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 254 to 1126
Columns: 115 entries, Id to x10_nan
dtypes: float64(63), int64(52)
memory usage: 1.0 MB


In [83]:
# import cross validator
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Random Forest Regression": RandomForestRegressor(),
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{model_name} score: {cross_val_score(model, X_validate, y_validate).mean()}")

Linear Regression score: 0.6814599059287207
Support Vector Regression score: -0.08435446842721021
Decision Tree Regression score: 0.5761226945002009
Random Forest Regression score: 0.8327610734122356


In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [3,4,5,6,7],
    'random_state' : [18]
}
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_validate, y_validate)

0.8902537125782057

## Create Solution

In [85]:
X_test = pd.read_csv('data/test.csv')
X_test_preprocessed, encoder = preprocess(X_test, one_hot_encoder = encoder)
X_test_preprocessed



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,Utilities,LandSlope,Neighborhood,Condition1,...,x8_Low,x8_Lvl,x9_N,x9_P,x9_Y,x10_BrkCmn,x10_BrkFace,x10_None,x10_Stone,x10_nan
0,1461,15,3,80.0,11622,3,3.0,2,12,7,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1462,15,2,81.0,14267,2,3.0,2,12,6,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1463,10,2,74.0,13830,2,3.0,2,16,6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1464,10,2,78.0,9978,2,3.0,2,16,6,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1465,4,2,43.0,5005,2,3.0,2,2,6,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,2,0,21.0,1936,3,3.0,2,14,6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1455,2916,2,0,21.0,1894,3,3.0,2,14,6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1456,2917,15,2,160.0,20000,3,3.0,2,13,6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1457,2918,6,2,62.0,10441,3,3.0,2,13,6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [86]:
Y_output = model.predict(X_test_preprocessed)
Y_output

array([127395.84, 159630.  , 175516.53, ..., 151848.37, 115291.  ,
       219234.45])

In [87]:
data = pd.DataFrame({"Id": X_test["Id"], "SalePrice": Y_output})
data

Unnamed: 0,Id,SalePrice
0,1461,127395.84
1,1462,159630.00
2,1463,175516.53
3,1464,186078.50
4,1465,205351.09
...,...,...
1454,2915,88463.00
1455,2916,92182.00
1456,2917,151848.37
1457,2918,115291.00


In [89]:
data.to_csv("submisssion.csv", index=False)