In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

In [2]:
df_test = pd.read_csv('data/test.csv')
df_train = pd.read_csv('data/train.csv')
df_sample_submission = pd.read_csv('data/sample_submission.csv')

In [3]:
def null_count(df):
    null_count = []
    for column in df:
        null_count.append(df[str(column)].isnull().sum())
    null_count = np.array(null_count).transpose()
    print(null_count)
    return

In [4]:
y = df_train.SalePrice
X = df_train.drop(['SalePrice', 'Id'], axis=1)

In [5]:
null_count(X)

[   0    0  259    0    0 1369    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    8    8    0    0
    0   37   37   38   37    0   38    0    0    0    0    0    0    1
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  690   81   81   81    0    0   81   81    0    0    0    0    0    0
    0 1453 1179 1406    0    0    0    0    0]


In [6]:
X.fillna({'LotFrontage': X.LotFrontage.median() \
          ,'Alley': 0, 'PoolQC': 0, 'Fence': 0, 'MiscFeature': 0 \
          , 'MasVnrType': 0, 'MasVnrArea': 0, 'BsmtQual': 0 \
          , 'BsmtCond': 0, 'BsmtExposure': 0, 'BsmtFinType1': 0 \
          , 'BsmtFinType2': 0, 'Electrical': 0, 'FireplaceQu': 0 \
          ,'GarageType': 0, 'GarageYrBlt': 0, 'GarageFinish': 0 \
          , 'GarageQual': 0, 'GarageCond': 0}, inplace=True)

In [7]:
X = pd.get_dummies(X)

In [19]:
params = {'max_depth': range(5,8)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
rf_clf = RandomForestClassifier(n_estimators=300, criterion='entropy', random_state=0)
gs_rf_clf = GridSearchCV(rf_clf, params, cv=5)

In [37]:
gs_rf_clf.fit(X_train, y_train)
best_clf = gs_rf_clf.best_estimator_




In [20]:
logreg = LogisticRegression(n_jobs=-1, random_state=0)
logreg.fit(X_train, y_train)

LogisticRegression(n_jobs=-1, random_state=0)

In [21]:
y_pred_logreg = logreg.predict(X_test)

In [22]:
mean_squared_error(y_test, y_pred_logreg, squared=False)

63863.72190315318

In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

#c_values = np.logspace(-2, 3, 500)
c_values = np.array(range(1,1000,200))

logit_searcher = LogisticRegressionCV(cv=5, random_state=0)

In [23]:
logit_searcher.fit(X_train, y_train)

NameError: name 'logit_searcher' is not defined

In [30]:
best_clf

RandomForestClassifier(criterion='entropy', max_depth=7, n_estimators=300,
                       random_state=0)

In [31]:
y_pred = best_clf.predict(X_test)

In [32]:
mean_squared_error(y_test, y_pred, squared=False)

36119.20209756001

In [44]:
null_count(df_test)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [8]:
df_test.drop(['Id'], axis=1, inplace=True)

In [9]:
df_test.fillna({'LotFrontage': df_test.LotFrontage.median() \
          ,'Alley': 0, 'PoolQC': 0, 'Fence': 0, 'MiscFeature': 0 \
          , 'MasVnrType': 0, 'MasVnrArea': 0, 'BsmtQual': 0 \
          , 'BsmtCond': 0, 'BsmtExposure': 0, 'BsmtFinType1': 0 \
          , 'BsmtFinType2': 0, 'Electrical': 0, 'FireplaceQu': 0 \
          ,'GarageType': 0, 'GarageYrBlt': 0, 'GarageFinish': 0 \
          , 'GarageQual': 0, 'GarageCond': 0}, inplace=True)

In [10]:
df_test.fillna(0, inplace=True)

In [11]:
df_test = pd.get_dummies(df_test)

In [12]:
df_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [13]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [14]:
col_test = df_test.columns
col_train = X.columns

In [15]:
add_to_X = []
for col in col_test:
    if col not in col_train:
        add_to_X.append(col)
add_to_X

['MSZoning_0',
 'Utilities_0',
 'Exterior1st_0',
 'Exterior2nd_0',
 'KitchenQual_0',
 'Functional_0',
 'SaleType_0']

In [16]:
X[add_to_X] = pd.DataFrame(0, index=np.arange(X.shape[0]), columns=add_to_X)

In [17]:
add_to_df_test = []
for col in col_train:
    if col not in col_test:
        add_to_df_test.append(col)
add_to_df_test

['Utilities_NoSeWa',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'HouseStyle_2.5Fin',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'Heating_Floor',
 'Heating_OthW',
 'Electrical_0',
 'Electrical_Mix',
 'GarageQual_Ex',
 'PoolQC_Fa',
 'MiscFeature_TenC']

In [18]:
df_test[add_to_df_test] = pd.DataFrame(0, index=np.arange(df_test.shape[0]), columns=add_to_df_test)

In [76]:
y_pred_test = best_clf.predict(df_test)

Feature names must be in the same order as they were in fit.



In [77]:
y_pred_test

array([144000, 165000, 140000, ..., 145000, 115000, 190000], dtype=int64)

In [50]:
df_sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [None]:
test_ans = pd.DataFrame({'Id': pd.read_csv('data/test.csv').Id, 'SalePrice': y_pred_test})

In [None]:
test_ans

In [None]:
test_ans.to_csv('test_ans.csv',index=False)

In [None]:
pd.read_csv('test_ans.csv').head()