## 0. Import Libraries

In [28]:
# import pathlib

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# processing
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Model evaluation
from sklearn.model_selection import cross_val_score

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# machine learning
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier

## 1. Import and Read Data 

In [2]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /Users/lamsis/Desktop/ML Projects/House Prices
100%|████████████████████████████████████████| 199k/199k [00:00<00:00, 1.11MB/s]
100%|████████████████████████████████████████| 199k/199k [00:00<00:00, 1.10MB/s]


In [3]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [29]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_ids = test["Id"]

In [30]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [31]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [32]:
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


## 2. Exploratory Data Analysis (EDA)

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [34]:
# Identify missing data with more than 40% missing

print('total data : ' + str(len(train)))

for col in train.columns:
    if (round(train[col].isnull().sum()/len(train)*100, 2) > 40):
        none = str(round(train[col].isnull().sum()/len(train)*100, 2)) + '%'
        print(col,':', none)

total data : 1460
Alley : 93.77%
FireplaceQu : 47.26%
PoolQC : 99.52%
Fence : 80.75%
MiscFeature : 96.3%


## 3. Data Processing

In [35]:
# Remove Alley, FireplaceQu, PoolQC, Fence, MiscFeature

def clean(train):
    
    train = train.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
    cat_var = train.select_dtypes(include = ['object']).columns
    num_var = train._get_numeric_data().columns
    
    # Fill in missing data
    for num_col in num_var:
        train[num_col] = train[num_col].fillna(train[num_col].mean())
            
    for cat_col in cat_var:
        train[cat_col] = train[cat_col].fillna(train[cat_col].mode()[0])

    return train

In [36]:
train = clean(train)
test = clean(test)

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [37]:
# Confirmation missing data are filled

print('total data : ' + str(len(train)))

for col in train.columns:
    none = str(round(train[col].isnull().sum()/len(train)*100, 2)) + '%'
    print(col,':', none)

total data : 1460
Id : 0.0%
MSSubClass : 0.0%
MSZoning : 0.0%
LotFrontage : 0.0%
LotArea : 0.0%
Street : 0.0%
LotShape : 0.0%
LandContour : 0.0%
Utilities : 0.0%
LotConfig : 0.0%
LandSlope : 0.0%
Neighborhood : 0.0%
Condition1 : 0.0%
Condition2 : 0.0%
BldgType : 0.0%
HouseStyle : 0.0%
OverallQual : 0.0%
OverallCond : 0.0%
YearBuilt : 0.0%
YearRemodAdd : 0.0%
RoofStyle : 0.0%
RoofMatl : 0.0%
Exterior1st : 0.0%
Exterior2nd : 0.0%
MasVnrType : 0.0%
MasVnrArea : 0.0%
ExterQual : 0.0%
ExterCond : 0.0%
Foundation : 0.0%
BsmtQual : 0.0%
BsmtCond : 0.0%
BsmtExposure : 0.0%
BsmtFinType1 : 0.0%
BsmtFinSF1 : 0.0%
BsmtFinType2 : 0.0%
BsmtFinSF2 : 0.0%
BsmtUnfSF : 0.0%
TotalBsmtSF : 0.0%
Heating : 0.0%
HeatingQC : 0.0%
CentralAir : 0.0%
Electrical : 0.0%
1stFlrSF : 0.0%
2ndFlrSF : 0.0%
LowQualFinSF : 0.0%
GrLivArea : 0.0%
BsmtFullBath : 0.0%
BsmtHalfBath : 0.0%
FullBath : 0.0%
HalfBath : 0.0%
BedroomAbvGr : 0.0%
KitchenAbvGr : 0.0%
KitchenQual : 0.0%
TotRmsAbvGrd : 0.0%
Functional : 0.0%
Fireplaces

In [38]:
from sklearn.preprocessing import LabelEncoder

def encode(train):
    le = LabelEncoder()
    cat_var = train.select_dtypes(include = ['object']).columns
    train[cat_var] = train[cat_var].apply(lambda col: le.fit_transform(col))
    return train

In [39]:
train = encode(train)
test = encode(test)

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,...,0,0,0,0,0,12,2008,8,4,250000


In [40]:
pd.set_option("display.max_rows", 1000)
pd.set_option("display.expand_frame_repr", True)
pd.set_option('display.width', 1000)

train.dtypes

Id                 int64
MSSubClass         int64
MSZoning           int64
LotFrontage      float64
LotArea            int64
Street             int64
LotShape           int64
LandContour        int64
Utilities          int64
LotConfig          int64
LandSlope          int64
Neighborhood       int64
Condition1         int64
Condition2         int64
BldgType           int64
HouseStyle         int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle          int64
RoofMatl           int64
Exterior1st        int64
Exterior2nd        int64
MasVnrType         int64
MasVnrArea       float64
ExterQual          int64
ExterCond          int64
Foundation         int64
BsmtQual           int64
BsmtCond           int64
BsmtExposure       int64
BsmtFinType1       int64
BsmtFinSF1         int64
BsmtFinType2       int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating            int64
HeatingQC          int64


In [41]:
# Drop passenger ID column from training set

train = train.drop('Id', axis = 1)
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


## 4. Modeling 

In [42]:
X_train = train.drop('SalePrice', axis = 1)
Y_train = train['SalePrice']
X_test = test.drop('Id', axis = 1).copy()

print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (1460, 74)
Y_train shape:  (1460,)
X_test shape:  (1459, 74)


In [45]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

2.12

## 5. Submission

In [47]:
# Create submission dataframe

submit = pd.DataFrame({'Id': test['Id'], 'SalePrice': Y_pred})
submit.head()

Unnamed: 0,Id,SalePrice
0,1461,140000
1,1462,190000
2,1463,140000
3,1464,140000
4,1465,140000


In [48]:
submit.shape

(1459, 2)

In [50]:
sample = pd.read_csv("sample_submission.csv")
sample.shape

(1459, 2)

In [51]:
submit.to_csv("submission.csv", index=False)

In [52]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Preliminary Model"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 11.3kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques