### loading libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### loading data

In [2]:
df=pd.read_csv('train.csv')
df.index = df.index + 1

In [3]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
2,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
3,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
4,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
5,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1457,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
#df.isnull().sum()

## exploring nan features , numerical features and catagarical features

In [5]:
nan_features_df=[feature for feature in df.columns if (df[feature].isnull().sum())>0]
nan_features_df

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [6]:
len(nan_features_df)

19

In [7]:
no_of_nan_cells=sum(list(df.isnull().sum()))
no_of_nan_cells

6965

In [8]:
total_cells=df.shape[0]*df.shape[1]
total_cells

118260

In [9]:
unempty_cells=total_cells-no_of_nan_cells
unempty_cells

111295

In [10]:
unempty_cells_percentage=unempty_cells/total_cells
unempty_cells_percentage

0.9411043463554879

In [11]:
len(nan_features_df)/df.shape[1]*100

23.456790123456788

In [12]:
df_nan=df[nan_features_df]
df_nan.isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [13]:
def make_dataframe_for_nan(dataframe):
    nan_features=[]
    nan_rows_feature=[]
    perc_nan_rows=[]
    rounded_perc_nan_rows=[]
    for feature in dataframe.columns:
        if (dataframe[feature].isnull().sum())>0:
            nan_rows=dataframe[feature].isnull().sum()
            nan_rows_per=nan_rows/len(dataframe)*100
            rounded_nan_rows_per=np.round(nan_rows_per)
            nan_features.append(feature)
            nan_rows_feature.append(nan_rows)
            perc_nan_rows.append(nan_rows_per)
            rounded_perc_nan_rows.append(rounded_nan_rows_per)
    nan= pd.DataFrame(list(zip(nan_features,nan_rows_feature,perc_nan_rows,rounded_perc_nan_rows)),columns =['feature Name','no of nan rows','percentage of nan rows','rounded percentage of nan rows'])
    nan.index = nan.index + 1
    return nan    

In [14]:
nan_dataframe=make_dataframe_for_nan(df)
nan_dataframe

Unnamed: 0,feature Name,no of nan rows,percentage of nan rows,rounded percentage of nan rows
1,LotFrontage,259,17.739726,18.0
2,Alley,1369,93.767123,94.0
3,MasVnrType,8,0.547945,1.0
4,MasVnrArea,8,0.547945,1.0
5,BsmtQual,37,2.534247,3.0
6,BsmtCond,37,2.534247,3.0
7,BsmtExposure,38,2.60274,3.0
8,BsmtFinType1,37,2.534247,3.0
9,BsmtFinType2,38,2.60274,3.0
10,Electrical,1,0.068493,0.0


In [15]:
high_nan_features=[]
for feature in df.columns:
    nan_rows=df[feature].isnull().sum()
    nan_rows_per=nan_rows/len(df)*100
    if nan_rows_per>=45:
        high_nan_features.append(feature)    

In [17]:
high_nan_features

['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

In [18]:
df=df.drop(high_nan_features,axis='columns')

In [19]:
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'O']
numerical_features

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

### exploring catagarical features

In [20]:
catagarical_features = [feature for feature in df.columns if df[feature].dtypes == 'O']
catagarical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [21]:
len(catagarical_features)

38

In [22]:
df_catg=df[catagarical_features]

In [22]:
# i=0
# for feature in catagarical_features:
#     dummies= pd.get_dummies(df_catagarical[feature],drop_first=True)
#     i=i+1
#     merged = pd.concat([df_catagarical,dummies],axis='columns')
#     df_catagarical =merged.drop([feature], axis='columns')

In [23]:
def category_onehot_multcols(multcolumns):
    df_final=df_catg
    i=0
    for fields in multcolumns:
        
        #print(fields)
        df1=pd.get_dummies(df_catg[fields],drop_first=True)
        
        df_catg.drop([fields],axis=1,inplace=True)
        if i==0:
            df_final=df1.copy()
        else:
            
            df_final=pd.concat([df_final,df1],axis=1)
        i=i+1
       
        
    df_final=pd.concat([df_catg,df_final],axis=1)
        
    return df_final

In [24]:
df_catagarical=category_onehot_multcols(catagarical_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [25]:
df_catagarical

Unnamed: 0,FV,RH,RL,RM,Pave,IR2,IR3,Reg,HLS,Low,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1457,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1458,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1459,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [26]:
duplicated_columns=df_catagarical.columns[df_catagarical.columns.duplicated()]
len(duplicated_columns)

56

In [27]:
df_catagarical['Feedr']

Unnamed: 0,Feedr,Feedr.1
1,0,0
2,1,0
3,0,0
4,0,0
5,0,0
...,...,...
1456,0,0
1457,0,0
1458,0,0
1459,0,0


In [28]:
df_catagarical['TA']

Unnamed: 0,TA,TA.1,TA.2,TA.3,TA.4,TA.5,TA.6,TA.7
1,0,1,0,1,0,0,1,1
2,1,1,0,1,0,1,1,1
3,0,1,0,1,0,0,1,1
4,1,1,1,0,0,0,1,1
5,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...
1456,1,1,0,1,0,1,1,1
1457,1,1,0,1,1,1,1,1
1458,0,0,1,0,0,0,1,1
1459,1,1,1,1,0,0,1,1


In [29]:
df_catagarical=df_catagarical.drop(duplicated_columns,axis='columns')

In [30]:
df_catagarical.shape

(1460, 110)

### cheking_relationships

In [31]:
df_catagarical=pd.concat([df_catagarical,df['SalePrice']],axis='columns')

In [32]:
c_matrix_catg=df_catagarical.corr()

In [33]:
c_matrix_catg['SalePrice'].sort_values(ascending=False)

SalePrice    1.000000
PConc        0.497734
NridgHt      0.402149
New          0.357509
Partial      0.352060
               ...   
No          -0.263600
Reg         -0.267672
RM          -0.288065
Detchd      -0.354141
None        -0.374468
Name: SalePrice, Length: 111, dtype: float64

In [34]:
plus_fifty_features_catg=[]
for feature in c_matrix_catg['SalePrice'].sort_values(ascending=False).keys():
    if (c_matrix_catg['SalePrice'][feature]>=0.45) | (c_matrix_catg['SalePrice'][feature]<=-0.45):
        plus_fifty_features_catg.append(feature)  

In [35]:
plus_fifty_features_catg.remove('SalePrice')

In [36]:
plus_fifty_features_catg

['PConc']

***************************

***********************************************

In [37]:
numerical_features_2 = [feature for feature in df.columns if df[feature].dtypes != 'O']

In [38]:
len(numerical_features_2)/df.shape[1]*100

50.0

In [39]:
df_n=df[numerical_features_2]

In [40]:
df_n.shape

(1460, 38)

In [41]:
year_features= [feature for feature in numerical_features_2 if 'Yr' in feature or 'Year' in feature]

In [42]:
year_features

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [43]:
non_year_features=[feature for feature in numerical_features_2 if feature not in year_features]

In [44]:
len(non_year_features)

34

### handling with missing values two options

***option:1 filling missing values with zero and recheking***

In [45]:
#df_n=df_n.fillna(0)

In [46]:
#(df_n.isnull().sum()==0).value_counts()

***option:2 droping missing values***

In [47]:
#df=df.drop(high_nan_features,axis='columns')

In [48]:
#df_n=df_n.dropna()

In [49]:
(df_n.isnull().sum()==0).value_counts()

True     35
False     3
dtype: int64

***option:3 droping missing columns having more than 45% nan rows***

In [50]:
#df_n=df_n.drop()

In [51]:
dropped_rows=len(df)-len(df_n)

In [52]:
dropped_rows

0

In [53]:
rows_in_use=len(df)-dropped_rows
rows_in_use

1460

In [54]:
rows_in_use_percentage=rows_in_use/len(df)*100
rows_in_use_percentage

100.0

In [55]:
dropped_cells=dropped_rows*df.shape[1]
dropped_cells

0

In [56]:
cells_in_use=total_cells-dropped_cells
cells_in_use

118260

In [57]:
cells_in_use_percentage=cells_in_use/total_cells*100
cells_in_use_percentage

100.0

### exclode  ID 

In [58]:
df_n=df_n.drop(['Id'],axis='columns')

### cheking for relationship in features

In [59]:
c_matrix=df_n.corr()

In [60]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# #get correlations of each features in dataset
# top_corr_features = c_matrix.index
# plt.figure(figsize=(20,20))
# #plot heat map
# g=sns.heatmap(df_n[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [61]:
import seaborn as sns
#sns.relplot(x="CentralAir",y="SalePrice",data=df_n,kind='line')

In [62]:
#sns.relplot(x="Street",y="SalePrice",data=df_n,kind='line')

In [63]:
#sns.relplot(x="Utilities",y="SalePrice",data=df_n,kind='line')

In [64]:
len(c_matrix)

37

In [65]:
c_matrix['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64

In [66]:
#c_matrix['SalePrice'][catg_2_features]

In [67]:
c_matrix['SalePrice'][year_features]

YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
YrSold         -0.028923
Name: SalePrice, dtype: float64

### excloding year features

In [68]:
#df_n=df[non_year_features]

In [69]:
#df_n

In [70]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# #get correlations of each features in dataset
# top_corr_features = c_matrix.index
# plt.figure(figsize=(20,20))
# #plot heat map
# g=sns.heatmap(df_n[top_corr_features].corr()['SalePrice'],annot=True,cmap="RdYlGn")

In [71]:
# import seaborn as sns
# sns.pairplot(df_n)

In [72]:
#c_matrix['SalePrice'].sort_values(ascending=False).keys()

In [73]:
#c_matrix['SalePrice']['FullBath']

In [74]:
plus_fifty_features=[]
for feature in c_matrix['SalePrice'].sort_values(ascending=False).keys():
    if c_matrix['SalePrice'][feature]>=0.5:
        plus_fifty_features.append(feature)  

In [75]:
plus_fifty_features.remove('SalePrice')

In [76]:
plus_fifty_features

['OverallQual',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'TotRmsAbvGrd',
 'YearBuilt',
 'YearRemodAdd']

In [77]:
len(plus_fifty_features)

10

In [78]:
c_matrix['SalePrice'].sort_values(ascending=False)[:11]

SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
Name: SalePrice, dtype: float64

In [79]:
#sns.relplot(x="OverallQual",y="SalePrice",data=df_n,kind='line')

In [80]:
#sns.relplot(x="PoolArea",y="SalePrice",data=df_n,kind='line')

In [81]:
#sns.relplot(x="KitchenAbvGr",y="SalePrice",data=df_n,kind='line')

### preparing dataframe for training

In [83]:
df_10=df_n[plus_fifty_features]

In [84]:
df_catg_1=df_catagarical[plus_fifty_features_catg]
df_catg_1

Unnamed: 0,PConc
1,1
2,0
3,1
4,0
5,1
...,...
1456,1
1457,0
1458,0
1459,0


In [85]:
df_11=pd.concat([df_catg_1,df_10],axis=1)

In [86]:
df_11_nan_features=[feature for feature in df_11.columns if (df_11[feature].isnull().sum())>0]
df_11_nan_features

[]

In [87]:
nan_dataframe=make_dataframe_for_nan(df_10)
nan_dataframe

Unnamed: 0,feature Name,no of nan rows,percentage of nan rows,rounded percentage of nan rows


In [88]:
#df_21=df_21.fillna(0)

In [89]:
#df_10['GarageYrBlt']=df_10['GarageYrBlt'].fillna(df_10['GarageYrBlt'].median())

In [90]:
#df_10['MasVnrArea']=df_10['MasVnrArea'].fillna(df_10['MasVnrArea'].median())

### droping unrelevant or extra features and preparing independent X and dependent y features

In [96]:
#X_p=df_n.drop(['SalePrice','Street','Utilities'],axis='columns')
#X=df_8
#X=df_10
X=df_11
#X=df_catg_11
y=df_n['SalePrice']

### with cross validation cheking the accuracy of all regression models

In [97]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
model_scoring={}
def all_model_scores(model,X,y):
    scores=cross_val_score(model,X,y,cv=10)
    mean_score=scores.mean()
    model_scoring.update({model:mean_score})
    return model_scoring
Model_list=[LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor()]
for model in Model_list:
    score_dict=all_model_scores(model,X,y)
df_Models_scores=pd.DataFrame(score_dict,index=[0])
df_Models_scores

Unnamed: 0,LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor()
0,0.767296,0.750622,0.842273


### spliting into test and train data

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

### percentage of X_p features out of total features

In [99]:
# X_p.shape[1]/df.shape[1]*100

### percentage features on which i train data out of total features

In [100]:
X_train.shape[1]/df.shape[1]*100

14.473684210526317

### creating model and train data

In [101]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model = LinearRegression()
#model = DecisionTreeRegressor()
model = RandomForestRegressor()

In [102]:
model.fit(X_train,y_train)

RandomForestRegressor()

### train accuracy

In [103]:
model.score(X_train,y_train)

0.9757298499168607

### test accuracy

In [104]:
model.score(X_test,y_test)

0.8855622267034551

### MSE and RMSE

In [105]:
from sklearn.metrics import mean_squared_error
import numpy as np
predictions = model.predict(X_test)
mse = mean_squared_error(y_test,predictions)
rmse = np.sqrt(mse)
print(mse)
print(rmse)

877775051.6086271
29627.268716650666


### cheking predictions on X_test and comparing difference

In [106]:
import numpy as np
y_pred=model.predict(X_test.iloc[:5])
y_pred=np.round(y_pred)
y_true=list(y_test.iloc[:5])
diff=y_true-y_pred
print(y_pred)
print(y_true)
print(diff)

[141388. 312905. 113272. 164360. 318567.]
[154500, 325000, 115000, 159000, 315500]
[13112. 12095.  1728. -5360. -3067.]


### saving model using pickle

In [107]:
import pickle
pickle.dump(model, open('Random_forest_regressor.pkl','wb'))

### loading model for predictions

In [108]:
model_ = pickle.load(open('Random_forest_regressor.pkl','rb'))

In [109]:
X_test

Unnamed: 0,PConc,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd
893,0,6,1068,1,264,1059,1068,1,6,1963,2003
1106,1,8,2622,2,712,1463,1500,2,9,1994,1995
414,0,5,1028,2,360,1008,1028,1,5,1927,1950
523,0,6,1664,2,420,1004,1004,2,7,1947,1950
1037,1,9,1620,3,912,1620,1620,2,6,2007,2008
...,...,...,...,...,...,...,...,...,...,...,...
480,0,4,1131,2,672,907,1131,1,7,1937,2000
1362,1,7,1530,2,430,1530,1530,2,7,2005,2005
803,1,7,1456,2,410,728,728,2,7,2005,2005
652,0,4,1510,1,296,755,755,1,7,1940,1950


In [112]:
print(model_.predict([[0,6,1072,2,264,1059,1068,2,5,1963,2003]]))

[140474.5]


In [113]:
(model.predict(X_test))[0]

141387.75