In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pp
import scipy.stats as stats
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler



%matplotlib inline

In [2]:
# Let wide frames render with (practically) every column visible.
pd.options.display.max_columns = 999

In [3]:
#imported both data sets

In [4]:
# Load the cleaned training data; it supplies the feature columns and the
# `saleprice` target used below.
train = pd.read_csv('../datasets/train_clean.csv')

In [5]:
# Load the cleaned test data; predictions for these rows go into the
# submission file written at the end of the notebook.
test = pd.read_csv('../datasets/test_clean.csv')

In [6]:
#Checked the shape of both to make sure they match. Test has one fewer column because it does not include
#saleprice, which is what I will be predicting

In [10]:
# Rerunning model three without the neighborhood dummy columns.
# The order here is the order of the fitted coefficients, so keep it stable.
features = [
    'total_bsmt_sf',
    'gr_liv_area',
    'full_bath',
    'half_bath',
    'bsmt_full_bath',
    'bsmt_half_bath',
    'wood_deck_sf',
    'open_porch_sf',
    'enclosed_porch',
    '3ssn_porch',
    'screen_porch',
    'house_age',
    'lot_area',
    'garage_cars',
]

In [11]:
# The target comes from the training frame; the same feature columns are
# then sliced out of both frames so train and test line up column for column.
y_train = train['saleprice']
X_train, X_test = (frame[features] for frame in (train, test))

In [12]:
#started with a regular linear regression

In [13]:
# Ordinary least squares model with scikit-learn's default settings.
lr = LinearRegression()

In [14]:
# Fit on the unscaled training features against sale price.
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [15]:
# Fitted slopes: one value per entry in `features`, in the same order.
lr.coef_

array([ 3.39988672e+01,  6.48478291e+01, -1.11530484e+03, -2.08999245e+03,
        1.26832313e+04, -1.10009851e+03,  3.65954055e+01,  4.07001756e+01,
        4.19762654e+01,  6.08045295e+01,  1.06295479e+02, -7.09961659e+02,
        3.75598748e-01,  1.88351558e+04])

In [16]:
# Fitted intercept term of the regression.
lr.intercept_

25625.790496081463

In [17]:
# Average the default estimator score (R^2 for a regressor) over 5 folds.
np.mean(cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5))

0.7336729917996949

In [18]:
#Cross-validated R2 (mean of the 5 folds) is about 0.7337

In [19]:
# In-sample predictions: these are for the same rows the model was fit on.
predictions = lr.predict(X_train)

In [20]:
# NOTE: this adds a column to `train` in place. 'predictions' is not listed in
# `features`, so the model inputs are unaffected.
train['predictions'] = predictions

In [21]:
# Training residuals: actual sale price minus the in-sample prediction.
residuals = y_train - predictions

In [22]:
#created the function below for the last lab to print out metrics for my models

In [23]:
def met(y, pred):
    """Print regression error metrics for observed values and predictions.

    Prints, in order: mean absolute error, mean squared error, residual sum
    of squares, root mean squared error and the coefficient of determination
    (R^2). Nothing is returned.

    Parameters
    ----------
    y : array-like
        Observed target values.
    pred : array-like
        Predicted values, in the same order and of the same length as ``y``.
    """
    # Work on plain float arrays so the subtraction below is positional, the
    # same way the sklearn metrics treat their inputs. Subtracting two pandas
    # Series aligns on index labels instead, which would silently skew the
    # RSS/RMSE whenever the two indexes differ. Casting to float also keeps
    # integer inputs from overflowing when squared.
    y_true = np.asarray(y, dtype=float)
    y_pred = np.asarray(pred, dtype=float)

    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    rss = ((y_true - y_pred) ** 2).sum()
    rmse = np.sqrt(rss / len(y_pred))
    coef_derm = metrics.r2_score(y_true, y_pred)

    print('The Mean Absolute Error is', mae)
    print('The Mean Squared Error is', mse)
    print('The Residual Sum of Squares is', rss)
    print('The Root Mean Squared Error is', rmse)
    print('The Coefficient of Determination is', coef_derm)

In [24]:
# Training-set metrics: the predictions were made on the rows used for fitting.
met(train['saleprice'],train['predictions'])

The Mean Absolute Error is 26080.407158968323
The Mean Squared Erroris 1615706545.3627536
The Residual Sum of Squares is 3313814124539.008
The Root Mean Squared Error is 40195.852340294434
The Coefficient of Determination is 0.7426756765693293


In [25]:
#I have variables of different units so I am going to standardize them

In [26]:
# Fit the scaler on the training features only, then apply that one fitted
# transformation to both feature sets.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = (ss.transform(frame) for frame in (X_train, X_test))

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [27]:
# Chain the scaler and the regression so standardization is really part of
# the model. Previously this cell fit on the raw X_train, and the scaled
# arrays (X_train_sc / X_test_sc) were never used anywhere.
# The pipeline scales internally, so fit, score and any later predict call
# all take the raw feature frames.
# For plain least squares, rescaling the features does not change the fitted
# predictions, so the training R^2 is expected to match the unscaled model.
lr = make_pipeline(StandardScaler(), LinearRegression())
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.7426756765693293

In [28]:
#reran model against X from the test set

In [29]:
# Predict on the raw test features, matching the raw X_train passed to lr.fit.
test_predict = lr.predict(X_test)

In [30]:
#created a new dataframe for my submission

In [31]:
# Start from an empty frame; its columns are filled in by the next cell.
submission = pd.DataFrame(columns=[])

In [32]:
#assigned data to new dataframe

In [33]:
# Attach the test ids and the matching predictions, keeping the column order
# Id then SalePrice.
submission = submission.assign(Id=test['id'], SalePrice=test_predict)

In [34]:
# Preview the first rows to check the column names and values.
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,144356.552312
1,2718,244758.309879
2,2414,201966.928515
3,1989,109151.354069
4,625,204234.15946


In [35]:
# (rows, columns) of the submission frame.
submission.shape

(879, 2)

In [36]:
# Write the submission without the index, so the file holds only the
# Id and SalePrice columns.
submission.to_csv('../submissions/submission_four.csv',index=False)