In [298]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

In [299]:
train = pd.read_csv('../datasets/ames1_clean_dummy.csv', index_col = 0)

In [300]:
test = pd.read_csv('../datasets/ames1_clean_test_dummy.csv', index_col = 0)

In [301]:
train.head()

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_qual,...,paved_drive_Y,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,saleprice
0,109,60,69.0552,13517,6,8,1976,2005,289.0,4,...,1,0,0,0,0,0,0,0,1,130500
1,544,60,43.0,11492,7,5,1996,1997,132.0,4,...,1,0,0,0,0,0,0,0,1,220000
2,153,20,68.0,7922,5,7,1953,2007,0.0,3,...,1,0,0,0,0,0,0,0,1,109000
3,318,60,73.0,9802,5,5,2006,2007,0.0,3,...,1,0,0,0,0,0,0,0,1,174000
4,255,50,82.0,14235,6,8,1900,1993,0.0,3,...,0,0,0,0,0,0,0,0,1,138500


In [302]:
test.head()

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_qual,...,paved_drive_P,paved_drive_Y,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,2658,190,69.0,9142,6,8,1910,1950,0.0,3,...,0,1,0,0,0,0,0,0,0,1
1,2718,90,69.545961,9662,5,4,1977,1977,0.0,3,...,0,1,0,0,0,0,0,0,0,1
2,2414,60,58.0,17104,7,5,2006,2006,0.0,4,...,0,1,0,0,0,0,0,1,0,0
3,1989,30,60.0,8520,5,6,1923,2006,0.0,4,...,0,0,0,0,0,0,0,0,0,1
4,625,20,69.545961,9500,6,5,1963,1963,247.0,3,...,0,1,0,0,0,0,0,0,0,1


Create X and y

In [303]:
X = train.drop('saleprice', axis = 1)
y = train['saleprice']

In [304]:
X.shape

(2051, 130)

In [305]:
y.shape

(2051,)

In [306]:
test.shape

(878, 130)

In [307]:
poly = PolynomialFeatures(degree=2, interaction_only = False, include_bias = False)

X_overfit = poly.fit_transform(X)

In [308]:
poly.get_feature_names(X.columns)

['id',
 'ms_subclass',
 'lot_frontage',
 'lot_area',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod/add',
 'mas_vnr_area',
 'exter_qual',
 'exter_cond',
 'bsmt_qual',
 'bsmt_cond',
 'bsmt_exposure',
 'bsmtfin_type_1',
 'heating_qc',
 'bedroom_abvgr',
 'kitchen_abvgr',
 'kitchen_qual',
 'functional',
 'fireplaces',
 'garage_yr_blt',
 'garage_cars',
 'wood_deck_sf',
 'open_porch_sf',
 'enclosed_porch',
 '3ssn_porch',
 'screen_porch',
 'misc_val',
 'mo_sold',
 'yr_sold',
 'ttl_sf',
 'ttl_bath',
 'ms_zoning_FV',
 'ms_zoning_I (all)',
 'ms_zoning_RH',
 'ms_zoning_RL',
 'ms_zoning_RM',
 'lot_config_CulDSac',
 'lot_config_FR2',
 'lot_config_FR3',
 'lot_config_Inside',
 'neighborhood_Blueste',
 'neighborhood_BrDale',
 'neighborhood_BrkSide',
 'neighborhood_ClearCr',
 'neighborhood_CollgCr',
 'neighborhood_Crawfor',
 'neighborhood_Edwards',
 'neighborhood_Gilbert',
 'neighborhood_Greens',
 'neighborhood_IDOTRR',
 'neighborhood_MeadowV',
 'neighborhood_Mitchel',
 'neighborhood_NAm

In [309]:
X_overfit.shape

(2051, 8645)

In [310]:
# Create train/test splits
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=.7,
    random_state=42
)

In [311]:
# Scale our data
# Relabeling scaled data as "Z" is common. 
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [312]:
# Instantiate
ridge_model = Ridge(alpha = 10)

# Fit
ridge_model.fit(Z_train, y_train)

# Evaluate model using R2
print(f'train: {ridge_model.score(Z_train, y_train)}')
print(f'test: {ridge_model.score(Z_test, y_test)}')

train: 0.892831707315184
test: 0.8356402716711684


In [313]:
test.columns

Index(['id', 'ms_subclass', 'lot_frontage', 'lot_area', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'mas_vnr_area',
       'exter_qual',
       ...
       'paved_drive_P', 'paved_drive_Y', 'sale_type_CWD', 'sale_type_Con',
       'sale_type_ConLD', 'sale_type_ConLI', 'sale_type_ConLw',
       'sale_type_New', 'sale_type_Oth', 'sale_type_WD '],
      dtype='object', length=130)

In [314]:
#creating a variable named test_sub
# which contains my ridge model prediction. 
test_sub = ridge_model.predict(test)


In [315]:
#after predictions are made I pass test_sub as a new column to test df
test['saleprice'] = test_sub

In [316]:
#label test id and salesprice as submission
submission = test[['id', 'saleprice']]

In [317]:
submission.shape

(878, 2)

In [318]:
test.shape

(878, 131)

In [319]:
submission.head()

Unnamed: 0,id,saleprice
0,2658,23883250.0
1,2718,32485440.0
2,2414,12962980.0
3,1989,15600000.0
4,625,24231620.0


In [320]:
submission.to_csv('../datasets/ames1_sub1.csv', index = False)