In [1]:
# imports

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV,Lasso, LassoCV, LogisticRegression
from sklearn.metrics import mean_squared_error



# pandas settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# seaborn settings
sns.set_style("whitegrid")

In [2]:
ames_cleaned = ("datasets/final_ames.csv")

ames = pd.read_csv(ames_cleaned)

ames.drop(columns = ["Unnamed: 0"], inplace = True)

In [3]:
ames.isnull().sum().sort_values(ascending = False)

saleprice               0
heating_qc              0
garage_area             0
garage_finish           0
fireplaces              0
totrms_abvgrd           0
bedroom_abvgr           0
gr_liv_area             0
second_flr_sf           0
first_flr_sf            0
total_bsmt_sf           0
garage_type_NA          0
bsmt_unf_sf             0
bsmtfin_sf_1            0
bsmt_exposure           0
bsmt_qual               0
year_remod/add          0
year_built              0
overall_cond            0
overall_qual            0
ms_subclass_20          0
ms_subclass_30          0
ms_subclass_50          0
ms_subclass_60          0
bsmtfin_type_1_NA       0
bsmtfin_type_1_GLQ      0
foundation_PConc        0
exterior_2nd_MetalSd    0
exterior_2nd_CmentBd    0
exterior_1st_MetalSd    0
exterior_1st_CemntBd    0
exterior_1st_BrkFace    0
house_style_2Story      0
neighborhood_Somerst    0
neighborhood_NridgHt    0
neighborhood_NoRidge    0
neighborhood_GrnHill    0
neighborhood_Crawfor    0
ms_zoning_RL

In [4]:
features = list(ames.columns)
features.remove("saleprice")

In [5]:
X = ames[features]
y = ames[["saleprice"]]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 7)

In [7]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1170, 41)
(502, 41)
(1170, 1)
(502, 1)


# Scaling data

In [8]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Ridge alpha

In [9]:
r_alphas = np.logspace(0, 5, 200)
rid_cv = RidgeCV(alphas=r_alphas, cv = 5)
rid_cv = rid_cv.fit(X_train_scaled, y_train)

print (f'Best ridge alpha: {rid_cv.alpha_}')
print (f'Best ridge r2: {rid_cv.score(X_train_scaled, y_train)}')

Best ridge alpha: 19.116440753857027
Best ridge r2: 0.9084613484651742


# Lasso

In [10]:
y_rav = y_train.values.ravel()

l_alphas = np.arange(0.01, 0.20, 0.01)
las_cv = LassoCV(alphas=l_alphas, cv = 5, max_iter = 50000)
las_cv = las_cv.fit(X_train_scaled, y_rav)

print (f'Best Lasso alpha: {las_cv.alpha_}')
print (f'Best Lasso r2: {las_cv.score(X_train_scaled, y_rav)}')

Best Lasso alpha: 0.19
Best Lasso r2: 0.9087721632851198


# Fitting Models

In [11]:
# Instantiate
lr = LinearRegression()
rd = Ridge(alpha = rid_cv.alpha_)
ls = Lasso(alpha = las_cv.alpha_)

In [12]:
def validator(model, x, y, nfolds = 5):
    
    kf = KFold(nfolds, shuffle = True, random_state = 5)
    
    rmse = np.sqrt(-cross_val_score(model, x, y, cv = kf, scoring = 'neg_mean_squared_error'))
    r2 = cross_val_score(model, x, y, cv=kf)
    
    return f'mean RMSE: {round(rmse.mean(),5)},  mean CV r2: {round(r2.mean(),5)}'

In [13]:
dum = DummyRegressor()
validator(dum, X_train_scaled,y_train)

'mean RMSE: 54893.05087,  mean CV r2: -0.00534'

In [14]:
validator(lr, X_train_scaled, y_train)

'mean RMSE: 17460.69859,  mean CV r2: 0.89788'

In [15]:
validator(rd, X_train_scaled, y_train)

'mean RMSE: 17417.45479,  mean CV r2: 0.89837'

In [16]:
validator(ls, X_train_scaled, y_train)

'mean RMSE: 17469.34127,  mean CV r2: 0.89778'

# Testing

In [17]:
def rmse (y_t, y_p):
    return np.sqrt(mean_squared_error(y_t, y_p))

In [18]:
lr = LinearRegression()
rd = Ridge(alpha = rid_cv.alpha_)
ls = Lasso(alpha = las_cv.alpha_)
dum = DummyRegressor()

In [19]:

dum = dum.fit(X_train_scaled, y_train)

y_predict = dum.predict(X_test_scaled)

print (f' dum score:{(dum.score(X_test_scaled, y_test))}')
print (f' dum rmse: {(rmse(y_test, y_predict))}')

#-----

lr = lr.fit(X_train_scaled, y_train)

y_predict = lr.predict(X_test_scaled)

print (f' lr score: {(lr.score(X_test_scaled, y_test))}')
print (f' lr rmse: {(rmse(y_test, y_predict))}')


#------

rd = rd.fit(X_train_scaled, y_train)

y_predict = rd.predict(X_test_scaled)

print (f' rd score: {(rd.score(X_test_scaled, y_test))}')
print (f' rd rmse: {(rmse(y_test, y_predict))}')

#------

ls = ls.fit(X_train_scaled, y_train)

y_predict = ls.predict(X_test_scaled)

print (f' ls score: {(ls.score(X_test_scaled, y_test))}')
print (f' ls rmse: {(rmse(y_test, y_predict))}')

 dum score:-0.0009757460356012437
 dum rmse: 53119.18352916961
 lr score: 0.8818271804845101
 lr rmse: 18251.493689745574
 rd score: 0.8819603720471835
 rd rmse: 18241.205239591898
 ls score: 0.881830965932763
 ls rmse: 18251.201360940664


# Lasso feature selection

 lr score: 0.8818271804845101
 lr rmse: 18251.493689745574
 rd score: 0.8819603720471835
 rd rmse: 18241.205239591898
 ls score: 0.881830965932763
 ls rmse: 18251.201360940664


# Testing against kaggle

In [20]:
testcsv = ("datasets/test.csv")

test = pd.read_csv(testcsv)

In [21]:
# title cleanup

def edit_title (title):
    
    title = (title.replace(" ","_")).lower()
    
    return title

test.rename(columns = lambda i:edit_title(i), inplace = True)

# 3 column names start with numbers. Replacing numbers with strings.

test.rename(columns = {'1st_flr_sf':"first_flr_sf",  '2nd_flr_sf':'second_flr_sf',"3ssn_porch":"threessn_porch"}, inplace = True)

# Removing unwanted columns

no_data = ['id', 'pid','misc_feature', 'misc_val']

l_num = ["mas_vnr_area", "bsmtfin_sf_2","low_qual_fin_sf", "bsmt_half_bath", "half_bath", 
                          "kitchen_abvgr", "pool_area"]

l_nom = ["street","alley","land_contour","condition_1","condition_2","roof_matl","bsmtfin_type_2",
                         "heating","paved_drive","sale_type"]

l_ord = ["utilities", "land_slope","exter_cond","bsmt_cond","central_air", "electrical",
                          "functional", "garage_qual", "garage_cond", "pool_qc"]

low_cor = ["mo_sold","yr_sold"]

high_null = ['fireplace_qu','fence']

comb_list = no_data+l_num+l_nom+l_ord+low_cor+high_null

test.drop(columns = comb_list, inplace = True)

colinear = ['exter_qual', 'kitchen_qual','garage_yr_blt','garage_cars']

test.drop(columns = colinear, inplace = True)

# combining different types of porch sf to a single column

porch_sfs = ['open_porch_sf', 'enclosed_porch', 'threessn_porch','screen_porch']

test['porch_sf'] = test[porch_sfs].sum(axis = 1)

# dropping porch columns

test.drop(columns = porch_sfs, inplace = True)

# creating list of continuous data columns

num_cols = [i for i in test.columns if test[i].dtypes == int or test[i].dtypes == float]

# ms_subclass, while numerical in value, is nominal in nature. Removing it from the num_cols list.

# months and years should also be classified as ordinal rather than numerical data

list_nonnums = ["ms_subclass", "year_built", "year_remod/add"]

for i in list_nonnums:
    num_cols.remove(i)
    
# The remanining data had to be cross examined with the data dictionary to determine if it was nominal or ordinal.

# Listing out norminal data

cat_nom_cols = ['ms_subclass','ms_zoning',
 'lot_config',
 'neighborhood',
 'bldg_type',
 'house_style',
 'roof_style',
 'exterior_1st',
 'exterior_2nd',
 'mas_vnr_type',
 'foundation',
 'bsmtfin_type_1',
 'garage_type']

# Creating ordinal data list

cat_ord_cols = [i for i in test.columns if i not in num_cols and i not in cat_nom_cols]

# Changing all num_cols to floats.

for i in num_cols:
    test[i] = test[i].map(lambda x:float(x))

#for the "ms_subclass" column, data should be strings insted of floats

test["ms_subclass"] = test["ms_subclass"].map(lambda x:str(x))

# filling columns with "NA" or "0" as appropriate

test[cat_nom_cols] = test[cat_nom_cols].fillna("NA")
test[cat_ord_cols] = test[cat_ord_cols].fillna("NA")
test[num_cols] = test[num_cols].fillna(0)

# encoding

# lable encoding lot_shape

lot_shape_dict = {"NA":0, "Reg":1, "IR1": 2, "IR2": 3, "IR3":4}

# lable encoding exterqual, extercon, bsmtqual, bsmtcon, heatingqc, kitchenqual, fireplacequ, garagequal, garagecond
# poolqc

qualcon_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NA":0}

# lable encoding bsmtexposure

bsmtexp_dict = {"Gd":4, "Av":3, "Mn":2, "No":1, "NA":0}

# lable encoding garagefinish

garfin_dict = {"Fin":3, "RFn":2, "Unf":1, "NA":0}

# Combining dictionaries:

combine_dict = {**garfin_dict,**bsmtexp_dict,**qualcon_dict,**lot_shape_dict}

# applying dictionary to ordinal non numerical columns

for i in cat_ord_cols:
    if i not in list_nonnums:
        test[i] = test[i].map(combine_dict)
        
# converting nominal columns to dummies

dummy_noms = pd.get_dummies(test[cat_nom_cols], drop_first = True)

# dropping nominal columns

test.drop(columns = cat_nom_cols, inplace=True)

In [22]:
# merging 

final_test = test.merge(dummy_noms, left_index = True, right_index = True)

In [23]:
for i in features:
    if i not in final_test.columns:
        final_test[i] = 0

In [24]:
X_exam = final_test[features]

In [25]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y)
X_e_scaled = scaler.fit_transform(X_exam)

In [26]:
ls = ls.fit(X_scaled, y)

ls.score(X_scaled, y)

0.9034360689590297

In [27]:
predictions = ls.predict(X_e_scaled)

In [29]:
submission = pd.read_csv(testcsv)

submission['SalePrice'] = predictions
submission = submission[["Id", "SalePrice"]]
submission.to_csv('./datasets/kaggle_submission9.csv',index=False)

In [None]:
submission