In [123]:
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.svm import SVC

from xgboost import XGBClassifier

In [6]:
print(sklearn.__version__)

0.23.1


In [2]:
df = pd.read_csv("../Data/Assignment3-TrainingData.csv")

In [8]:
df.columns

Index(['row ID', 'BATHRM', 'HF_BATHRM', 'HEAT', 'HEAT_D', 'AC', 'NUM_UNITS',
       'ROOMS', 'BEDRM', 'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE',
       'PRICE', 'SALE_NUM', 'GBA', 'BLDG_NUM', 'STYLE', 'STYLE_D', 'STRUCT',
       'STRUCT_D', 'GRADE', 'GRADE_D', 'CNDTN', 'CNDTN_D', 'EXTWALL',
       'EXTWALL_D', 'ROOF', 'ROOF_D', 'INTWALL', 'INTWALL_D', 'KITCHENS',
       'FIREPLACES', 'USECODE', 'LANDAREA', 'GIS_LAST_MOD_DTTM', 'QUALIFIED'],
      dtype='object')

Firstly we want to create a base version

This won't have any pre-processing done to it. The way the Kaggle system works means we can use this to get a (vague) idea of whether we improved upon our model through data engineering.

In [50]:
feats = ['BATHRM', 'HF_BATHRM', 'HEAT', 'HEAT_D', 'AC', 'NUM_UNITS',
       'ROOMS', 'BEDRM', 'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE',
       'PRICE', 'SALE_NUM', 'GBA', 'STYLE', 'STYLE_D', 'STRUCT',
       'STRUCT_D', 'GRADE', 'GRADE_D', 'CNDTN', 'CNDTN_D', 'EXTWALL',
       'EXTWALL_D', 'ROOF', 'ROOF_D', 'INTWALL', 'INTWALL_D', 'KITCHENS',
       'FIREPLACES', 'USECODE', 'LANDAREA']
#We have removed ROW_ID, BLDG_NUM, GIS_LAST_MOD_DTTM, QUALIFIED

x = df[feats]
y = df['QUALIFIED']

In [57]:
#Label Encoding for the String columns
le = preprocessing.LabelEncoder()

for column_name in x.columns:
    if x[column_name].dtype == object:
        x[column_name].fillna('MISSING_DATA', inplace=True)
        x[column_name] = le.fit_transform(x[column_name])
    else:
        x[column_name].fillna(-1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [58]:
x.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,HEAT_D,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,...,EXTWALL,EXTWALL_D,ROOF,ROOF_D,INTWALL,INTWALL_D,KITCHENS,FIREPLACES,USECODE,LANDAREA
0,3.0,1.0,7.0,13,3,2.0,11.0,5.0,1898.0,2007.0,...,14.0,6,2.0,0,6.0,3,2.0,4.0,24,1680
1,3.0,1.0,13.0,7,3,2.0,9.0,5.0,1910.0,2009.0,...,14.0,6,2.0,0,6.0,3,2.0,4.0,24,1680
2,2.0,1.0,7.0,13,3,1.0,11.0,3.0,1913.0,2012.0,...,14.0,6,13.0,10,6.0,3,1.0,0.0,13,2032
3,3.0,2.0,13.0,7,3,1.0,10.0,5.0,1913.0,-1.0,...,14.0,6,2.0,0,6.0,3,1.0,4.0,11,2196
4,3.0,1.0,7.0,13,3,2.0,7.0,3.0,1908.0,2008.0,...,14.0,6,2.0,0,6.0,3,2.0,1.0,24,1424


In [56]:
x.describe()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,HEAT_D,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,...,EXTWALL,EXTWALL_D,ROOF,ROOF_D,INTWALL,INTWALL_D,KITCHENS,FIREPLACES,USECODE,LANDAREA
count,74987.0,74986.0,74987.0,75007.0,75007.0,74987.0,74975.0,74983.0,74997.0,34534.0,...,74987.0,75007.0,74987.0,75007.0,74987.0,75007.0,74986.0,74986.0,75007.0,75007.0
mean,2.03566,0.608314,7.645952,8.028357,2.619955,1.195967,7.357506,3.372991,1930.247357,2000.967974,...,13.355608,8.299025,3.992239,4.778461,6.144772,3.706334,1.217521,0.619782,13.121695,3419.138054
std,1.067649,0.61716,5.038308,3.176552,0.489231,0.593972,2.353929,1.169221,90.077725,18.273343,...,3.938126,6.266998,3.35163,4.706385,1.96066,2.494306,0.624872,0.889498,3.977688,5519.538116
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,5.0,2.0,1.0,6.0,3.0,1914.0,1993.0,...,14.0,6.0,1.0,0.0,6.0,3.0,1.0,0.0,11.0,1600.0
50%,2.0,1.0,7.0,7.0,3.0,1.0,7.0,3.0,1930.0,2006.0,...,14.0,6.0,2.0,2.0,6.0,3.0,1.0,0.0,12.0,2370.0
75%,3.0,1.0,13.0,13.0,3.0,1.0,8.0,4.0,1947.0,2011.0,...,14.0,6.0,6.0,9.0,6.0,3.0,1.0,1.0,13.0,4200.0
max,24.0,11.0,13.0,14.0,3.0,6.0,101.0,54.0,2019.0,2019.0,...,24.0,25.0,15.0,16.0,11.0,12.0,44.0,13.0,81.0,691817.0


In [60]:
#Split into test and train datasets
seed = 42

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state = seed)

In [61]:
print(len(x))
print(len(x_train))
print(len(x_val))

75007
52504
22503


In [75]:
#Create the ensemble model using the Max Voting sklearn model.
model1 = LogisticRegression(multi_class='multinomial', random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2)], voting='hard')
model.fit(x_train,y_train)
print(model.score(x_val,y_val))

y_pred = model.predict(x_val)
roc_auc_score(y_val, y_pred)

0.8279340532373461


0.809986343899288

In [74]:
roc_auc_score(y_val, y_pred)

0.809986343899288

In [79]:
y_pred

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

Lets make some predictions to upload

In [80]:
pred_df = pd.read_csv("../Data/Assignment3-UnknownData.csv")

In [86]:
p = pred_df[feats]

#Label Encoding for the String columns
le = preprocessing.LabelEncoder()

for column_name in x.columns:
    if p[column_name].dtype == object:
        p[column_name].fillna('MISSING_DATA', inplace=True)
        p[column_name] = le.fit_transform(p[column_name])
    else:
        p[column_name].fillna(-1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p[column_name] = le.fit_transform(p[column_name])


In [87]:
p

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,HEAT_D,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,...,EXTWALL,EXTWALL_D,ROOF,ROOF_D,INTWALL,INTWALL_D,KITCHENS,FIREPLACES,USECODE,LANDAREA
0,4.0,0.0,7.0,13,3,2.0,8.0,4.0,1910.0,1988.0,...,14.0,5,6.0,8,6.0,3,2.0,5.0,24,1680
1,3.0,1.0,13.0,7,3,2.0,8.0,5.0,1900.0,2003.0,...,14.0,5,2.0,0,6.0,3,2.0,3.0,24,1680
2,1.0,0.0,7.0,13,3,2.0,5.0,2.0,1917.0,1988.0,...,14.0,5,6.0,8,6.0,3,2.0,0.0,24,1261
3,3.0,1.0,13.0,7,3,2.0,8.0,4.0,1906.0,2011.0,...,14.0,5,6.0,8,6.0,3,2.0,1.0,24,1627
4,1.0,1.0,13.0,7,3,1.0,6.0,2.0,1908.0,1979.0,...,14.0,5,2.0,0,6.0,3,1.0,0.0,11,1424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32142,3.0,0.0,1.0,5,2,3.0,14.0,6.0,1953.0,1999.0,...,14.0,5,2.0,0,6.0,3,3.0,0.0,23,7811
32143,2.0,0.0,1.0,5,2,2.0,10.0,4.0,1953.0,-1.0,...,14.0,5,2.0,0,6.0,3,2.0,0.0,23,4292
32144,3.0,0.0,1.0,5,2,3.0,12.0,6.0,1953.0,-1.0,...,14.0,5,2.0,0,3.0,12,3.0,0.0,23,4232
32145,2.0,0.0,1.0,5,2,2.0,10.0,4.0,1953.0,-1.0,...,14.0,5,1.0,2,2.0,0,2.0,0.0,23,5837


In [88]:
p_pred = model.predict(p)

In [111]:
#Create the ensemble model using the Max Voting sklearn model.
#First submission - Scored 0.85777
model1 = LogisticRegression(multi_class='multinomial', random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2)], voting='soft')
model.fit(x_train,y_train)
print(model.score(x_val,y_val))

y_pred = model.predict(x_val)
print(roc_auc_score(y_val, y_pred))

p_pred = model.predict(p)

upload_df = pd.DataFrame()
upload_df['row ID'] = pred_df['row ID']
upload_df['Predict-Qualified'] = p_pred
upload_df.to_csv(r'../Data/Ensemble_Prediction.csv', index = False)

0.8717948717948718
0.8689649279912601


In [117]:
#Create the ensemble model using the Max Voting sklearn model.
#Second & Third submission - Scored 0.89037
model1 = LogisticRegression(multi_class='multinomial', random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model3 = XGBClassifier(seed=seed)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('xgb', model3)], voting='soft')
model.fit(x_train,y_train)
print(model.score(x_val,y_val))

y_pred = model.predict(x_val)
print(roc_auc_score(y_val, y_pred))

p_pred = model.predict(p)

upload_df = pd.DataFrame()
upload_df['row ID'] = pred_df['row ID']
upload_df['Predict-Qualified'] = p_pred
upload_df.to_csv(r'../Data/Ensemble_Prediction_Second.csv', index = False)

0.8939252544105231
0.8951497639186483


In [126]:
#Create the ensemble model using the Max Voting sklearn model.
#Fourth Submission - Scored 0.90499
model1 = LogisticRegression(multi_class='auto', random_state=seed)
model2 = DecisionTreeClassifier(random_state=seed)
model3 = XGBClassifier(seed=seed)
model4 = RandomForestClassifier(random_state=seed, criterion='entropy')
model5 = GradientBoostingClassifier(random_state=seed)
model6 = RandomForestClassifier(random_state=seed, criterion='gini')

model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('xgb', model3), ('rfc1', model4), ('gbc', model5), ('rfc2', model6)], voting='soft')
model.fit(x_train,y_train)
print(model.score(x_val,y_val))

y_pred = model.predict(x_val)
print(roc_auc_score(y_val, y_pred))

p_pred = model.predict(p)

upload_df = pd.DataFrame()
upload_df['row ID'] = pred_df['row ID']
upload_df['Predict-Qualified'] = p_pred
upload_df.to_csv(r'../Data/Ensemble_Prediction_Fourth.csv', index = False)

0.9101453139581389
0.9154014591483185
