In [141]:
import pandas as pd

In [142]:
import numpy as np

In [143]:
import copy

In [144]:
from sklearn.model_selection import train_test_split

In [145]:
from sklearn.linear_model import LinearRegression

In [146]:
from sklearn.ensemble import RandomForestRegressor

In [147]:
from sklearn.metrics import mean_squared_error

In [148]:
from sklearn import metrics

In [149]:
df0 = pd.read_csv('train.csv')

In [153]:
def preprocessing(df):
    df['total_rooms'] = df['TotRmsAbvGrd'] + df['BsmtFullBath']
#     df['price_by_building'] = df['SalePrice'] / (df['GrLivArea'] + df['GarageArea'] + df['TotalBsmtSF'])
    df['qual_area_pound'] = df['OverallQual'] * df['GrLivArea']
    
    df.drop_duplicates()
#     drop outsider qual area pound
    df = df.drop(523)
    df = df.drop(1298)
#     drop outsider price by building
    df = df.drop(533)
#     create a bool columns for yearbuilt
    df['built_after_85'] = df['YearBuilt'] > 1985
#     df = df[['total_rooms', 'price_by_building', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound', 'SalePrice']]
    df = df[['total_rooms', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound', 'SalePrice']]
    return df
df = df0.copy()
df = preprocessing(df)
df

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound,SalePrice
0,9,856,548,11970,208500
1,6,1262,460,7572,181500
2,7,920,608,12502,223500
3,8,756,642,12019,140000
4,10,1145,836,17584,250000
...,...,...,...,...,...
1455,7,953,460,9882,175000
1456,8,1542,500,12438,210000
1457,9,1152,252,16380,266500
1458,6,1078,240,5390,142125


In [154]:
def normalise (data):
    for column in data.columns:
        data[column] = data[column] / data[column].mean()
    return data

In [155]:
X, y = df.drop(columns="SalePrice"), df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [156]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

0.8406487391876684

In [157]:
reg.score(X_test, y_test)

0.8194085936179499

In [158]:
reg.coef_

array([-2599.76442495,    46.40507556,    49.5569062 ,    11.65692334])

In [159]:
for name, feature in zip(['total_rooms',  'price_by_building', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound'], reg.coef_):
    print('coef', name,' : ', feature)

coef total_rooms  :  -2599.7644249536775
coef price_by_building  :  46.40507555631835
coef TotalBsmtSF  :  49.556906200232156
coef GarageArea  :  11.656923336656329


In [160]:
reg.intercept_

15034.324525973934

In [161]:
df_mean = normalise(copy.deepcopy(df))

In [162]:
df_mean

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound,SalePrice
0,1.297161,0.812715,1.160098,1.243917,1.151742
1,0.864774,1.198185,0.973805,0.786879,1.002596
2,1.008903,0.873479,1.287116,1.299202,1.234601
3,1.153032,0.717772,1.359092,1.249009,0.773352
4,1.441290,1.087102,1.769784,1.827321,1.380986
...,...,...,...,...,...
1455,1.008903,0.904810,0.973805,1.026933,0.966690
1456,1.153032,1.464027,1.058483,1.292551,1.160028
1457,1.297161,1.093748,0.533476,1.702202,1.472131
1458,0.864774,1.023490,0.508072,0.560126,0.785090


In [163]:
X_mean, y_mean = df_mean.drop(columns="SalePrice"), df_mean["SalePrice"]
X_mean_train, X_mean_test, y_mean_train, y_mean_test = train_test_split(X_mean, y_mean)

In [164]:
reg_mean = LinearRegression().fit(X_mean_train, y_mean_train)
reg_mean.score(X_mean_train, y_mean_train)

0.839125960162157

In [165]:
reg_mean.score(X_mean_test, y_mean_test)

0.8221536690114812

In [166]:
reg_mean.coef_

array([-0.1142449 ,  0.26325788,  0.14812459,  0.62220511])

In [167]:
for name, feature in zip(['total_rooms',  'price_by_building', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound'], reg_mean.coef_):
    print('coef', name,' : ', feature)

coef total_rooms  :  -0.11424489589822077
coef price_by_building  :  0.26325788057471494
coef TotalBsmtSF  :  0.1481245874674861
coef GarageArea  :  0.6222051081335318


# RANDOM FOREST

In [168]:
X, y = df.drop(columns="SalePrice"), df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [169]:
regr = RandomForestRegressor(max_depth=500, random_state=0)

In [170]:
regr_fit = regr.fit(X_train, y_train)

In [171]:
y_pred = regr_fit.predict(X_test)

In [172]:
mean_squared_error(y_pred, y_test)

1126320705.287599

In [173]:
np.sqrt(mean_squared_error(y_pred, y_test))

33560.701799688264

In [174]:
y_test.mean()

177171.42465753425

In [175]:
mean_squared_error(y_pred, y_test) / y_test.mean()**2

0.03588184064077337

In [176]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  

Mean Absolute Error: 21877.45820900195


In [177]:
importances_features = regr_fit.feature_importances_

In [178]:
for name, feature in zip(['total_rooms',  'price_by_building', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound'], importances_features):
    print(name, round(feature*100,1),'%')

total_rooms 2.1 %
price_by_building 13.3 %
TotalBsmtSF 5.6 %
GarageArea 79.0 %


# RANDOM FOREST NORMALISE

In [179]:
X_mean, y_mean = df_mean.drop(columns="SalePrice"), df_mean["SalePrice"]
X_mean_train, X_mean_test, y_mean_train, y_mean_test = train_test_split(X_mean, y_mean)

In [180]:
regr_mean = RandomForestRegressor(max_depth=5, random_state=0)

In [181]:
regr_mean_fit = regr.fit(X_mean_train, y_mean_train)

In [182]:
y_mean_pred = regr_mean_fit.predict(X_mean_test)

In [183]:
mean_squared_error(y_mean_pred, y_mean_test)

0.027342883355051303

In [184]:
importances_features_mean = regr_mean_fit.feature_importances_

In [185]:
for name, feature in zip(['total_rooms',  'price_by_building', 'TotalBsmtSF', 'GarageArea', 'qual_area_pound'], importances_features_mean):
    print(name, round(feature*100,1),'%')

total_rooms 2.4 %
price_by_building 11.9 %
TotalBsmtSF 5.7 %
GarageArea 80.0 %


## TEST NORMALISE

In [186]:
test = X_mean_test[65:66]

In [187]:
prediction = regr_mean_fit.predict(test)

In [188]:
prediction

array([1.07441497])

In [189]:
test

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound
1132,1.153032,0.957029,0.433978,1.377973


In [204]:
df_mean.loc[(df_mean['qual_area_pound'] == 1.377973)]

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound,SalePrice


In [191]:
df_mean.iloc[1132]

total_rooms        1.008903
TotalBsmtSF        0.869681
GarageArea         0.973805
qual_area_pound    0.997628
SalePrice          0.933546
Name: 1134, dtype: float64

## NON NORMALISE

In [205]:
no_normalise_test = X_test[57:58]

In [206]:
no_normalise_test

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound
1277,8,1680,480,10344


In [207]:
regr_fit = regr.fit(X_train, y_train)

In [208]:
prediction_non_normalise = regr_fit.predict(no_normalise_test)

In [209]:
prediction_non_normalise

array([201431.05])

In [210]:
df.loc[(df['GarageArea'] == 480) & (df['TotalBsmtSF'] ==1680)]

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound,SalePrice
1277,8,1680,480,10344,197900


In [199]:
no_normalise_test = X_test[24:25]

In [200]:
no_normalise_test

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound
564,10,1095,590,16065


In [201]:
prediction_non_normalise = regr_fit.predict(no_normalise_test)

In [202]:
prediction_non_normalise

array([253399.13])

In [203]:
df.loc[(df['GarageArea'] == 590) & (df['TotalBsmtSF'] ==1095)]

Unnamed: 0,total_rooms,TotalBsmtSF,GarageArea,qual_area_pound,SalePrice
564,10,1095,590,16065,268000
