# House Prices Kaggle


### Data Exploration

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("../dataset/train.csv")
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [7]:
x = df.isna().sum()
x[x != 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [31]:
# Configurations
import warnings
warnings.filterwarnings('ignore')
from sklearn import set_config
set_config(display="diagram")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
# Preprocessing and pipeline
from sklearn.preprocessing import *
from sklearn.pipeline import *
from sklearn.compose import *
from sklearn.model_selection import *
from sklearn.decomposition import KernelPCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from dense_transformer import DenseTransformer
# Dimension reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor

In [54]:
X = df.drop("SalePrice",axis=1)
y = df["SalePrice"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=0)

In [50]:
categoricals = make_column_selector(dtype_include=object)
numericals = make_column_selector(dtype_exclude=object)

oneHotEncoder = OneHotEncoder(handle_unknown="ignore")

numeric_transformer = Pipeline(steps=[
    ("Imputing", IterativeImputer()),
    ("Scaling", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("OrdinalEncoding", OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=999)),
    ("Imputing", IterativeImputer()),
    ("OneHotEncoding", oneHotEncoder)
])


preprocessing = ColumnTransformer(transformers=[
    ("numerics", numeric_transformer, numericals),
    ("categoricals", categorical_transformer, categoricals)
])

# Models definition
glm = LogisticRegression()
mlp = MLPRegressor(hidden_layer_sizes=350,alpha=0.1,n_iter_no_change=20,max_iter=1000)
trees = DecisionTreeRegressor()
knn = KNeighborsRegressor(10)
svc = SVR()
kmeans = KMeans()
forest = RandomForestClassifier()
lda = LinearDiscriminantAnalysis()
pca = PCA(n_components=10)

list_predictors = {'knn' : knn,'Logistic regression' : glm,
                    'Decsion tree' :  trees,'neural network' : mlp,'svc' : svc , "K-means" : kmeans , "Random forest" : forest}

pipes = {}

for model in list_predictors:
    model_pipe = Pipeline(steps=[
    ("model", list_predictors[model])
    ])

    pipes[model] = Pipeline(steps=[
    ("Preprocessing", preprocessing),
    ("Converting to array" , DenseTransformer()),
    ("Dimentionality reduction" , pca),
    ("Model fitting", model_pipe)
    ])

In [55]:
pipes['knn']

In [56]:
fitted = pipes['knn'].fit(X_train,y_train)
fitted.score(X_test,y_test)

0.7839235017156513

In [61]:
results = {}
for pipe in pipes.keys():
    curr_pipe = pipes[pipe]
    curr_pipe.fit(X_train,y_train)
    results[pipe] =  cross_val_score(curr_pipe,X,y).mean()

In [62]:
results

{'knn': 0.7895506036721784,
 'Logistic regression': 0.017123287671232876,
 'Decsion tree': 0.7295100354659971,
 'neural network': -2.745872391919794,
 'svc': -0.050207688045494914,
 'K-means': -3714.449463188258,
 'Random forest': 0.018493150684931507}

In [60]:
kaggle_test = pd.read_csv("../dataset/test.csv")

result = fitted.predict(kaggle_test)
y_pred = pd.Series(result)
submission = pd.concat([kaggle_test.Id, y_pred], axis=1)
submission.columns = ["Id","SalePrice"]
submission.to_csv("result.csv", header=True, index=False)
