In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pp
import scipy.stats as stats
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler



%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 999)

In [3]:
#imported both data sets

In [4]:
train = pd.read_csv('../datasets/train_clean.csv')

In [5]:
test = pd.read_csv('../datasets/test_clean.csv')

In [6]:
#Checked the shape of both to make sure the columns line up. Test has one less column because it does not include the
#saleprice, which I will be predicting

In [7]:
train.shape

(2051, 56)

In [8]:
test.shape

(879, 55)

In [9]:
#created dummy variables for neighborhoods in temporary dataframe

In [10]:
# One indicator column per neighborhood; drop_first=True omits the first
# category, which therefore acts as the baseline level.
tmp_train_dum = pd.get_dummies(train['neighborhood'],drop_first=True)

In [11]:
# Encode test with the full set of dummies, then align to the training dummy
# columns. Neighborhoods absent from test become all-zero columns; columns that
# train does not have (including train's dropped baseline) are discarded.
# Encoding test on its own with drop_first=True could drop a different baseline
# or omit columns that the model's feature list expects.
tmp_test_dum = (
    pd.get_dummies(test['neighborhood'])
    .reindex(columns=tmp_train_dum.columns, fill_value=0)
)

In [12]:
tmp_train_dum.columns

Index(['Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
       'Edwards', 'Gilbert', 'Greens', 'GrnHill', 'IDOTRR', 'Landmrk',
       'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge',
       'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
       'StoneBr', 'Timber', 'Veenker'],
      dtype='object')

In [13]:
#joined temp dataframes back to original dataframes

In [14]:
# Append the neighborhood dummy columns to each frame; join matches rows on the index.
# NOTE: this reassigns train/test, so re-running the cell fails because the
# dummy columns already exist (join raises on overlapping column names when no
# suffix is given). Reload the data before re-running.
train = train.join(tmp_train_dum)
test = test.join(tmp_test_dum)

In [15]:
#added dummy variables to original variables used in first model
# Numeric predictors carried over from the first model.
base_features = ['total_bsmt_sf', 'gr_liv_area', 'tot_bath', 'tot_outdoor',
                 'house_age', 'lot_area', 'garage_cars']
# Take the neighborhood columns straight from the training dummies instead of
# retyping them, so the list always matches what get_dummies produced.
features = base_features + list(tmp_train_dum.columns)

In [16]:
# Train and test design matrices are built from the same ordered column list.
X_train = train[features]
X_test = test[features]
# Target variable; per the shape check above, only the training file carries saleprice.
y_train = train['saleprice']

In [17]:
#started with a regular linear regression

In [18]:
lr = LinearRegression()

In [19]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
lr.coef_

array([ 2.81224880e+01,  4.49505207e+01,  8.80154540e+03,  4.71842943e+01,
       -4.60356227e+02,  6.92272802e-01,  1.29472096e+04, -1.14636693e+04,
       -1.79934033e+04,  3.32507400e+03,  1.45453303e+03,  1.87842317e+03,
        2.97641516e+04, -1.76438193e+04, -8.34546339e+03,  1.52861883e+04,
        1.23989721e+05, -1.36774474e+04, -2.64511602e+04, -2.82265539e+04,
       -1.53305900e+04, -8.62389485e+03, -1.90922901e+04, -1.18007355e+04,
        4.40666329e+04,  7.06178106e+04, -8.71268140e+03, -6.99841072e+03,
       -1.01904615e+04, -1.00889448e+04,  1.87795707e+04,  8.36759390e+04,
        2.11328323e+04,  2.20602471e+04])

In [21]:
lr.intercept_

39720.318183907686

In [22]:
# 5-fold cross-validation. With no `scoring` argument the estimator's own
# score method is used (R^2 for LinearRegression); the fold scores are averaged.
cross_val_score(lr, X_train, y_train,cv=5).mean()

0.7978181402481495

In [23]:
#mean 5-fold cross-validated R2 of 0.7978181

In [24]:
predictions = lr.predict(X_train)

In [25]:
train['predictions'] = predictions

In [26]:
residuals = y_train - predictions

In [27]:
#created the function below for the last lab to print out metrics for my models

In [28]:
def met(y, pred):
    """Print regression error metrics for true values `y` against predictions `pred`.

    Prints the mean absolute error, mean squared error, residual sum of
    squares, root mean squared error and coefficient of determination (R^2).
    Nothing is returned.

    Parameters
    ----------
    y : array-like
        Observed target values.
    pred : array-like
        Predicted values, in the same row order and of the same length as `y`.
    """
    mae = metrics.mean_absolute_error(y,pred)
    mse = metrics.mean_squared_error(y,pred)
    # Subtract positionally. If both inputs are pandas Series, `y - pred` would
    # align on index labels instead, which can introduce NaNs and make the
    # RSS/RMSE disagree with the sklearn metrics (which compare by position).
    rss = ((np.asarray(y) - np.asarray(pred)) ** 2).sum()
    rmse = np.sqrt(rss / len(pred))
    coef_derm = metrics.r2_score(y,pred)
    print('The Mean Absolute Error is',mae)
    print('The Mean Squared Error is',mse)
    print('The Residual Sum of Squares is',rss)
    print('The Root Mean Squared Error is',rmse)
    print('The Coefficient of Determination is',coef_derm)

In [29]:
met(train['saleprice'],train['predictions'])

The Mean Absolute Error is 22199.232322384578
The Mean Squared Erroris 1207191808.5719163
The Residual Sum of Squares is 2475950399381.0005
The Root Mean Squared Error is 34744.665900997185
The Coefficient of Determination is 0.8077374778957325


In [30]:
#I have variables of different units so I am going to standardize them

In [31]:
# Learn each feature's mean and standard deviation from the training rows only,
# then apply that same fitted scaler to both train and test.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [32]:
# Chain the scaler and the regression so standardization is applied inside
# every fit / predict / score call on `lr`, using the training statistics.
# Always pass the raw X_train / X_test to `lr`; passing X_train_sc would scale
# the data twice.
# Plain OLS predictions and R^2 do not change under feature scaling; the
# benefit is that the coefficients end up on a common per-standard-deviation scale.
lr = make_pipeline(StandardScaler(), LinearRegression())
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.8077374778957324

In [33]:
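#training R2 is 0.8077, the same as the unscaled model's training R2 above (feature scaling does not change an OLS fit); it is not comparable to the cross-validated 0.7978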

In [34]:
predictions = lr.predict(X_train)

In [35]:
train['predictions'] = predictions

In [36]:
#ran model against X from the test set

In [37]:
# Predict sale prices for the test rows with the most recently fitted `lr`.
test_predict = lr.predict(X_test)

In [38]:
#created a new dataframe for my submission

In [39]:
submission = pd.DataFrame(columns=[])

In [40]:
#submission.head()

In [41]:
#assigned data to dataframe

In [42]:
# Build the submission frame in one step: ids from the test file paired with
# the model's predictions, in the same row order. Constructing it here (rather
# than filling a previously created empty frame) keeps this cell self-contained
# and safe to re-run.
submission = pd.DataFrame({'Id': test['id'], 'SalePrice': test_predict})

In [43]:
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,145679.957603
1,2718,218223.230742
2,2414,191125.824024
3,1989,142193.938871
4,625,177108.736181


In [44]:
submission.shape

(879, 2)

In [45]:
# index=False keeps the row index out of the file, leaving only the Id and SalePrice columns.
submission.to_csv('../submissions/submission_two.csv',index=False)