In [3]:
# Importing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the training data and the test data.
# NOTE: the two names differ by a single letter -- `ames_houses` holds
# train.csv and `ames_house` holds test.csv.
ames_houses, ames_house = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train.csv', '../datasets/test.csv')
)

In [5]:
# Keep only training rows whose 'Gr Liv Area' is below 5000, so very large
# houses do not dominate the fit (assumed outliers -- TODO confirm with a plot).
ames_houses = ames_houses[ames_houses['Gr Liv Area'] < 5000]

# The test frame (`ames_house`) is deliberately NOT filtered: any row removed
# from it would silently be missing from the submission built from it later.

In [6]:
# Columns pulled from the raw data. The order here fixes the column order of
# the two modelling frames created below.
features = [
    'Garage Cars',
    'Total Bsmt SF',
    '1st Flr SF',
    'Lot Area',
    'Lot Config',
    'Overall Qual',
    'Exter Qual',
    'Exter Cond',
    'Central Air',
    'Gr Liv Area',
    'Full Bath',
    'Functional',
    'Fireplaces',
    'Neighborhood',
    'Year Built',
    'Year Remod/Add',
    'Mas Vnr Area',
    'TotRms AbvGrd',
    'Garage Area',
]

# Independent copies, so later column edits never touch the raw frames.
ames_train = ames_houses.loc[:, features].copy()
ames_test = ames_house.loc[:, features].copy()

In [7]:
# Replace missing 'Garage Area' values with 0 in both frames
# (presumably NaN means "no garage" -- TODO confirm against the data dictionary).
ames_train = ames_train.fillna({'Garage Area': 0})
ames_test = ames_test.fillna({'Garage Area': 0})

In [8]:
# Replace missing 'Garage Cars' values with 0 in both frames
# (presumably NaN means "no garage" -- TODO confirm against the data dictionary).
ames_train = ames_train.fillna({'Garage Cars': 0})
ames_test = ames_test.fillna({'Garage Cars': 0})

In [9]:
# Replace missing 'Total Bsmt SF' values with 0 in both frames
# (presumably NaN means "no basement" -- TODO confirm against the data dictionary).
ames_train = ames_train.fillna({'Total Bsmt SF': 0})
ames_test = ames_test.fillna({'Total Bsmt SF': 0})

In [10]:
# Replace missing 'Mas Vnr Area' values with 0 in both frames
# (presumably NaN means "no masonry veneer" -- TODO confirm against the data dictionary).
ames_train = ames_train.fillna({'Mas Vnr Area': 0})
ames_test = ames_test.fillna({'Mas Vnr Area': 0})

In [11]:
# Feature engineering: interaction term `garage_space`
# = 'Garage Cars' x 'Garage Area', added to both frames.
ames_train = ames_train.assign(
    garage_space=lambda frame: frame['Garage Cars'] * frame['Garage Area']
)
ames_test = ames_test.assign(
    garage_space=lambda frame: frame['Garage Cars'] * frame['Garage Area']
)

In [12]:
# More feature engineering: interaction term `lower_sq_ftg`
# = 'Total Bsmt SF' x '1st Flr SF'. Note this is a product of the two areas,
# not their sum.
ames_train = ames_train.assign(
    lower_sq_ftg=lambda frame: frame['Total Bsmt SF'] * frame['1st Flr SF']
)
ames_test = ames_test.assign(
    lower_sq_ftg=lambda frame: frame['Total Bsmt SF'] * frame['1st Flr SF']
)

In [13]:
# More feature engineering: interaction term `tot_sq_ftg`
# = 'Gr Liv Area' x 'Total Bsmt SF'. Note this is a product of the two areas,
# not their sum.
ames_train = ames_train.assign(
    tot_sq_ftg=lambda frame: frame['Gr Liv Area'] * frame['Total Bsmt SF']
)
ames_test = ames_test.assign(
    tot_sq_ftg=lambda frame: frame['Gr Liv Area'] * frame['Total Bsmt SF']
)

In [14]:
# Interaction term `overall_living` = 'Gr Liv Area' x 'Overall Qual',
# added to both frames.
ames_train = ames_train.assign(
    overall_living=lambda frame: frame['Gr Liv Area'] * frame['Overall Qual']
)
ames_test = ames_test.assign(
    overall_living=lambda frame: frame['Gr Liv Area'] * frame['Overall Qual']
)

In [15]:
# Interaction term `years` = 'Year Built' x 'Year Remod/Add',
# added to both frames.
ames_train = ames_train.assign(
    years=lambda frame: frame['Year Built'] * frame['Year Remod/Add']
)
ames_test = ames_test.assign(
    years=lambda frame: frame['Year Built'] * frame['Year Remod/Add']
)

In [16]:
# Combining neighborhoods into umbrella columns:
# four neighborhoods are folded into another one; every other value keeps its
# own name. The table is defined once so train and test cannot drift apart.
neighborhood_merges = {
    'Blueste': 'Crawfor',
    'Greens': 'Somerst',
    'GrnHill': 'Timber',
    'Landmrk': 'Somerst',
}

# `.get(name, name)` passes unlisted values through unchanged. Mapping with an
# exhaustive dict instead would silently turn any unlisted value into NaN.
ames_train['Neighborhood'] = ames_train['Neighborhood'].map(
    lambda name: neighborhood_merges.get(name, name)
)

ames_test['Neighborhood'] = ames_test['Neighborhood'].map(
    lambda name: neighborhood_merges.get(name, name)
)

In [17]:
# Missing-value count per column of the training frame.
ames_train.isna().sum()

Garage Cars       0
Total Bsmt SF     0
1st Flr SF        0
Lot Area          0
Lot Config        0
Overall Qual      0
Exter Qual        0
Exter Cond        0
Central Air       0
Gr Liv Area       0
Full Bath         0
Functional        0
Fireplaces        0
Neighborhood      0
Year Built        0
Year Remod/Add    0
Mas Vnr Area      0
TotRms AbvGrd     0
Garage Area       0
garage_space      0
lower_sq_ftg      0
tot_sq_ftg        0
overall_living    0
years             0
dtype: int64

In [18]:
# Missing-value count per column of the test frame.
ames_test.isna().sum()

Garage Cars       0
Total Bsmt SF     0
1st Flr SF        0
Lot Area          0
Lot Config        0
Overall Qual      0
Exter Qual        0
Exter Cond        0
Central Air       0
Gr Liv Area       0
Full Bath         0
Functional        0
Fireplaces        0
Neighborhood      0
Year Built        0
Year Remod/Add    0
Mas Vnr Area      0
TotRms AbvGrd     0
Garage Area       0
garage_space      0
lower_sq_ftg      0
tot_sq_ftg        0
overall_living    0
years             0
dtype: int64

In [19]:
# Regression target: sale price of every (filtered) training row.
y = ames_houses.loc[:, 'SalePrice']

In [20]:
# One-hot encode the categorical columns, dropping the first level of each.
# Train and test are encoded independently, so a level that occurs in only one
# of them produces a column the other frame lacks.
categorical_columns = [
    'Lot Config',
    'Exter Qual',
    'Exter Cond',
    'Central Air',
    'Functional',
    'Neighborhood',
]
X_train = pd.get_dummies(ames_train, columns=categorical_columns, drop_first=True)
X_test = pd.get_dummies(ames_test, columns=categorical_columns, drop_first=True)

In [21]:
# (rows, columns) of the encoded training matrix.
X_train.shape

(2049, 60)

In [22]:
# (rows, columns) of the encoded test matrix; compare the column count with X_train.
X_test.shape

(878, 58)

In [23]:
# Print every encoded column that exists in X_train but not in X_test.
test_column_names = set(X_test.columns)
for name in X_train.columns:
    if name in test_column_names:
        continue
    print(name)

Functional_Sal
Functional_Sev


In [24]:
# Print every encoded column that exists in X_test but not in X_train.
train_column_names = set(X_train.columns)
for name in X_test.columns:
    if name in train_column_names:
        continue
    print(name)

In [25]:
# For submission 2: keep only the encoded columns that X_train and X_test
# share, in X_train's order. Deriving the list instead of hard-coding
# 'Functional_Sal' / 'Functional_Sev' keeps this cell safe to re-run (a second
# hard-coded drop raises KeyError) and also removes columns that exist only in
# X_test, so both matrices always present the same features.
shared_columns = [name for name in X_train.columns if name in X_test.columns]

X_train = X_train[shared_columns]
X_test = X_test[shared_columns]

In [26]:
# Hold out part of the labelled data for validation (no test_size is passed, so
# the library default proportion applies); the fixed seed makes the split
# repeatable. `X_test_1` / `y_test` are this hold-out, not the competition test set.
X_train_1, X_test_1, y_train, y_test = train_test_split(
    X_train,
    y,
    random_state=142,
)

In [27]:
# Model 1: ordinary least squares on the untransformed sale price.
lr_1 = LinearRegression()
lr_1.fit(X=X_train_1, y=y_train)

LinearRegression()

In [28]:
# Mean cross-validated score of model 1 on the training split
# (default folds and the estimator's default scorer).
np.mean(cross_val_score(estimator=lr_1, X=X_train_1, y=y_train))

0.8936317135357102

In [29]:
# Natural log of the training target, used to fit the log-price model.
# np.log is applied to the whole Series in one vectorized call instead of being
# mapped element by element; the result keeps y_train's index.
y_train_log = np.log(y_train)

In [30]:
# Score of model 1 on the rows it was fitted on.
lr_1.score(X=X_train_1, y=y_train)

0.9097767821056452

In [31]:
# Score of model 1 on the held-out validation rows.
lr_1.score(X=X_test_1, y=y_test)

0.9045606626966943

In [32]:
# Model 2: same features as model 1, but fitted to the log of the sale price.
lr_2 = LinearRegression()
lr_2.fit(X=X_train_1, y=y_train_log)

LinearRegression()

In [33]:
# Score of model 2 on the rows it was fitted on (measured in log-price space).
lr_2.score(X=X_train_1, y=y_train_log)

0.8894967113222613

In [34]:
# Mean cross-validated score of model 2 on the training split
# (default folds and the estimator's default scorer, in log-price space).
np.mean(cross_val_score(estimator=lr_2, X=X_train_1, y=y_train_log))

0.8747530172371685

In [35]:
# Model 1 predictions for every labelled row (both sides of the split).
y_train_preds = lr_1.predict(X=X_train)

In [36]:
# Model 1 predictions for the competition test rows.
y_preds_2 = lr_1.predict(X=X_test)

In [37]:
# Model 2 predictions for the competition test rows, still in log-price space.
y_preds_log = lr_2.predict(X=X_test)

In [38]:
# Convert model 2's log-price predictions back to prices.
# Recomputed from the model rather than from `y_preds_log` itself, so running
# this cell more than once cannot exponentiate the values twice.
# NOTE: despite its name, `y_preds_log` holds prices (not logs) after this line.
y_preds_log = np.exp(lr_2.predict(X_test))

In [39]:
# Attach the back-transformed model 2 predictions to the test frame.
ames_test = ames_test.assign(SalePrice=y_preds_log)

In [40]:
# Inspect the predicted prices.
ames_test.loc[:, 'SalePrice']

0      107299.430505
1      189606.493271
2      191295.680250
3      119912.127467
4      175316.831142
           ...      
873    181816.166446
874    211433.090848
875    135398.534926
876    111370.217794
877    131589.768983
Name: SalePrice, Length: 878, dtype: float64

In [41]:
# Copy the house identifier from the raw test data (matched on row index).
ames_test = ames_test.assign(ID=ames_house['Id'])

In [42]:
# Root-mean-squared error of model 1 (lr_1) over every labelled row, i.e. both
# sides of the train/validation split -- not a pure hold-out estimate.
mse_all_rows = metrics.mean_squared_error(y_true=y, y_pred=y_train_preds)
RMSE = np.sqrt(mse_all_rows)

RMSE

23989.014118522497

In [43]:
# Two-column submission frame: house identifier and predicted price.
submission_9 = ames_test.loc[:, ['ID', 'SalePrice']].copy()

In [45]:
# Preview the first rows of the submission before writing it to disk.
submission_9.head()

Unnamed: 0,ID,SalePrice
0,2658,107299.430505
1,2718,189606.493271
2,2414,191295.68025
3,1989,119912.127467
4,625,175316.831142


In [47]:
# Write the submission file; the row index is left out of the CSV.
submission_9.to_csv(
    path_or_buf='../datasets/submission_9.csv',
    index=False,
)