In [None]:
import pandas as pd
import sklearn 
import numpy as np
from sklearn.model_selection import train_test_split

# Reading, understanding and modifying the Data

In [None]:
data=pd.read_csv("train.csv")
data.describe()

In [None]:
data.describe()


In [None]:
data.isna().sum()
# we have 259 na entries in LotArea

In [None]:
data["Alley"].dtype # --> object 
data["Alley"].isna().sum()
data["Alley"].mode()

In [None]:
data.head()

As we cannot delete the missing values we will have to perform imputation as the number of missing values is relatively very high

In the case of ```data["Alley"]``` the number of NaN values is very high and hence imputation will not yield fruitful results ```1369 out of 1460 ``` are NaN

In [None]:
data["Alley"].isna().sum()

In [None]:
data.drop(["Alley"],axis=1,inplace=True)

In [None]:
data["LotFrontage"].isna().sum()

We will fill the na values of ```data["LotFrontage"]``` by the mean of the column

In [None]:
data["LotFrontage"].mean()

In [None]:
data["LotFrontage"]=data["LotFrontage"].fillna(data["LotFrontage"].mean())

In [None]:
data["LotFrontage"].isna().sum()

In [None]:
data["MSSubClass"].isna().sum()
# data.head()
# data["MSSubClass"] has no NaN values and hence doesnt requires imputaion

In [None]:
data["MSZoning"].isna().sum()
# data["MSZoning"] has no NaN values and hence doesnt requires imputaion


In [None]:
data["LotArea"].isna().sum()
# data["LotArea"] has no NaN values and hence doesnt requires imputaion


In [None]:
data["Street"].isna().sum()

In [None]:
data["LotShape"].isna().sum()


In [None]:
data["LandContour"].isna().sum()


In [None]:
data["Utilities"].isna().sum()


In [None]:
data["LotConfig"].isna().sum()


In [None]:
data["LandSlope"].isna().sum()


In [None]:
data["Neighborhood"].isna().sum()


In [None]:
data["Condition2"].isna().sum(),data["Condition1"].isna().sum()


In [None]:
data["BldgType"].isna().sum(),data["HouseStyle"].isna().sum() 


In [None]:
data["OverallQual"].isna().sum(),data["OverallCond"].isna().sum(),data["YearBuilt"].isna().sum(),data["YearRemodAdd"].isna().sum(),data["RoofStyle"].isna().sum()

In [None]:
data["RoofMatl"].isna().sum(),data["Exterior1st"].isna().sum(),data["Exterior2nd"].isna().sum(),data["MasVnrArea"].isna().sum(),data["ExterQual"].isna().sum(),data["ExterCond"].isna().sum()

In [None]:
data["MasVnrArea"].isna().sum()

In [None]:
data["MasVnrArea"].mean()

In [None]:
# we will fill the nan values with the mean of the column
data["MasVnrArea"]=data["MasVnrArea"].fillna(data["MasVnrArea"].mean())

In [None]:
data["MasVnrArea"].isna().sum()


In [None]:
data["Foundation"].isna().sum(),data["BsmtQual"].isna().sum(),data["BsmtCond"].isna().sum(),data["BsmtExposure"].isna().sum(),data["BsmtFinType1"].isna().sum(),data["BsmtFinSF1"].isna().sum()


In [None]:
data["Foundation"].dtype,data["BsmtQual"].dtype,data["BsmtCond"].dtype,data["BsmtExposure"].dtype,data["BsmtFinType1"].dtype,data["BsmtFinSF1"].dtype


As all the NaN containing columns have a object dtype hence it can be concluded that these values must be replaced with the mode of the columns 

In [None]:
data["Foundation"].mode(),data["BsmtQual"].mode(),data["BsmtCond"].mode(),data["BsmtExposure"].mode(),data["BsmtFinType1"].mode(),data["BsmtFinSF1"].mode()


In [None]:
data["BsmtFinType1"]=data["BsmtFinType1"].fillna("Unf")

In [None]:
data["Foundation"].isna().sum(),data["BsmtQual"].isna().sum(),data["BsmtCond"].isna().sum(),data["BsmtExposure"].isna().sum(),data["BsmtFinType1"].isna().sum(),data["BsmtFinSF1"].isna().sum()


In [None]:
data["BsmtFinType1"].describe()

In [None]:
data["3SsnPorch"].isna().sum(),data["ScreenPorch"].isna().sum(),data["PoolArea"].isna().sum(),data["PoolQC"].isna().sum(),data["Fence"].isna().sum(),data["MiscFeature"].isna().sum()


In [None]:
data["GarageCond"].describe()


In [None]:
data["GarageCond"]=data["GarageCond"].fillna("TA")


In [None]:
data["FireplaceQu"].isna().sum()
# data["FireplaceQu"].describe()
# data["FireplaceQu"]=data["FireplaceQu"].fillna("Gd")

In [None]:
data.drop("PoolQC",axis=1,inplace=True)
data.drop("MiscFeature",axis=1,inplace=True)
data.drop("Fence",axis=1,inplace=True)



As ```data["PoolQC"] , data["Fence"] , data["MiscFeature"]```
have lots of missing values ```1453 , 1179 , 1406 out of 1460 ```
respectively hence they are dropped

In [None]:
data["MiscVal"].isna().sum(),data["MoSold"].isna().sum(),data["YrSold"].isna().sum(),data["SaleType"].isna().sum(),data["SaleCondition"].isna().sum(),data["SalePrice"].isna().sum()


Data Modelling is completed now we'll apply ML Model 

In [None]:
data.to_csv("datamod.csv")

# Applying ML Algorithms

For A Regreesion Problem we have basically 3 Ways to Implement it :

* Linear Regression 

* RandomForestRegressor

* HistGradientBoostinRegressor


In [None]:
x=data.drop("SalePrice",axis=1)
y=data["SalePrice"]


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
model1=LinearRegression()
model2=RandomForestRegressor()


Perfroming Data Preprocessing using ```onehotencoder```

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cf=["SaleCondition","SaleType","PavedDrive","GarageCond","GarageQual","GarageFinish","FireplaceQu","Functional","KitchenQual","Electrical","CentralAir","HeatingQC","Heating","BsmtFinType2","BsmtFinType1","BsmtExposure","BsmtCond","BsmtQual","Foundation","ExterCond","ExterQual","MasVnrType","Exterior2nd","Exterior1st","RoofMatl","RoofStyle","HouseStyle","BldgType","Condition2","Condition1","Neighborhood","LandSlope","LotConfig","Utilities","LandContour","LotShape","Street","MSZoning","GarageType"]

oh=OneHotEncoder(sparse=False)
# transformer=ColumnTransformer([])
transformer=ColumnTransformer([("1hot",oh,cf)],remainder="passthrough")
transformedx=transformer.fit_transform(x)
x=pd.DataFrame(transformedx)


In [None]:
np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(transformedx,y,test_size=0.2)
x.isna().sum()

# data["SalePrice"].isna().sum()

In [None]:
# xtrain.isna().sum()
# data.isna().sum()
# model1.fit(xtrain,ytrain)
np.any(np.isnan(x))
np.all(np.isfinite(x))
pd.set_option('use_inf_as_na',True)

In [None]:
# model2.fit(xtrain,ytrain)
# pd.x.dropna(axis=0)
x.replace([np.inf, -np.inf], np.nan, inplace=True)

dummies=pd.get_dummies(data[["SaleCondition","SaleType","PavedDrive","GarageCond","GarageQual","GarageFinish","FireplaceQu","Functional","KitchenQual","Electrical","CentralAir","HeatingQC","Heating","BsmtFinType2","BsmtFinType1","BsmtExposure","BsmtCond","BsmtQual","Foundation","ExterCond","ExterQual","MasVnrType","Exterior2nd","Exterior1st","RoofMatl","RoofStyle","HouseStyle","BldgType","Condition2","Condition1","Neighborhood","LandSlope","LotConfig","Utilities","LandContour","LotShape","Street","MSZoning","GarageType"]])
dummies

x=dummies
np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(transformedx,y,test_size=0.2)

model=RandomForestRegressor()
model.fit(xtrain,ytrain)


np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(transformedx,y,test_size=0.2)

In [None]:
np.any(np.isnan(x)),np.all(np.isfinite(x))


By the above findings it can be concluded that a value has been registered greater than the maximum storage allowed by the float  data type hence ```Input contains NaN ``` error is prompted 

In light of the above findings it is hereby concluded that ```GradientBoostingRegressor``` will be used to solve this problem 

# GradientBoostingRegressor

Why are we using it?

* This estimator is much faster than GradientBoostingRegressor for big datasets (n_samples >= 10 000). Ours is 1460

* This estimator can easily work with NaN values

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor


Here xtest and ytest are pseudo test sets of the train.csv. They are not to be confused by the test.csv

In [None]:
np.random.seed(42)
xtrain,xtest,ytrain,ytest=train_test_split(x,y)

In [None]:
model=HistGradientBoostingRegressor().fit(xtrain,ytrain)
# model.fit(xtrain,ytrain)

In [None]:
model.score(xtest,ytest)

In [None]:
ypreds=model.predict(xtest)

In [None]:
ypreds.shape

With default Hyper Tuning Parameters we obtained an accuracy of ``` 88.9 % ``` 

Fine Tuning Hyper Tuning Parameters of our model 

In [None]:
model=HistGradientBoostingRegressor(learning_rate=1).fit(xtrain,ytrain)
model.score(xtest,ytest)
# model.predict


By increasing the learning rate we end up decreasing the efficiency 

In [None]:
# model=HistGradientBoostingRegressor(max_iter=1000).fit(xtrain,ytrain)
# model.score(xtest,ytest)

A 1% increase is obtained after 1min 14.4 sec of computational time after increasing max_iter from 100 to 1000 hence we will stick to default parameters 

# Testin Accuracy Of Our Regression Model 

In [None]:
from sklearn.metrics import mean_squared_error as mse
model=HistGradientBoostingRegressor().fit(xtrain,ytrain)



In [None]:
ypreds=model.predict(xtest)

In [None]:
rmse=mse(ytest,ypreds)**0.5
rmse

In [None]:
ytest/ypreds


In [None]:
model.fit(x,y)

# Our model is now ready to make prediction on test data 

In [None]:
test=pd.read_csv("test.csv")

test.to_csv("testobt.csv")


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cf=["SaleCondition","SaleType","PavedDrive","GarageCond","GarageQual","GarageFinish","FireplaceQu","Functional","KitchenQual","Electrical","CentralAir","HeatingQC","Heating","BsmtFinType2","BsmtFinType1","BsmtExposure","BsmtCond","BsmtQual","Foundation","ExterCond","ExterQual","MasVnrType","Exterior2nd","Exterior1st","RoofMatl","RoofStyle","HouseStyle","BldgType","Condition2","Condition1","Neighborhood","LandSlope","LotConfig","Utilities","LandContour","LotShape","Street","MSZoning","GarageType","Street","MiscFeature","Fence","PoolQC","GarageType","Alley"]
oh=OneHotEncoder(sparse=False)

transformer=ColumnTransformer([("1hot",oh,cf)],remainder="passthrough")
transformedx=transformer.fit_transform(test)
x=pd.DataFrame(transformedx)
x

In [None]:
# test.drop("PoolQC",axis=1,inplace=True)
# test.drop("MiscFeature",axis=1,inplace=True)
# test.drop("Fence",axis=1,inplace=True)
# test.drop("Alley",axis=1,inplace=True)


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cf=["SaleCondition","SaleType","PavedDrive","GarageCond","GarageQual","GarageFinish","FireplaceQu","Functional","KitchenQual","Electrical","CentralAir","HeatingQC","Heating","BsmtFinType2","BsmtFinType1","BsmtExposure","BsmtCond","BsmtQual","Foundation","ExterCond","ExterQual","MasVnrType","Exterior2nd","Exterior1st","RoofMatl","RoofStyle","HouseStyle","BldgType","Condition2","Condition1","Neighborhood","LandSlope","LotConfig","Utilities","LandContour","LotShape","Street","MSZoning","GarageType"]

oh=OneHotEncoder(sparse=False)
# transformer=ColumnTransformer([])
transformer=ColumnTransformer([("1hot",oh,cf)],remainder="passthrough")
transformedx=transformer.fit_transform(test)
x=pd.DataFrame(transformedx)


In [802]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
x=test
# x.drop("PoolQC",axis=1,inplace=True)
# x.drop("MiscFeature",axis=1,inplace=True)
x.drop("Id",axis=1,inplace=True)

# x.drop("Alley",axis=1,inplace=True)
# x.drop("Fence",axis=1,inplace=True)


cf=["SaleCondition","SaleType","PavedDrive","GarageCond","GarageQual","GarageFinish","FireplaceQu","Functional","KitchenQual","Electrical","CentralAir","HeatingQC","Heating","BsmtFinType2","BsmtFinType1","BsmtExposure","BsmtCond","BsmtQual","Foundation","ExterCond","ExterQual","MasVnrType","Exterior2nd","Exterior1st","RoofMatl","RoofStyle","HouseStyle","BldgType","Condition2","Condition1","Neighborhood","LandSlope","LotConfig","Utilities","LandContour","LotShape","Street","MSZoning","GarageType","Street","GarageType"]
oh=OneHotEncoder(sparse=False)

transformer=ColumnTransformer([("1hot",oh,cf)],remainder="passthrough")
transformedx=transformer.fit_transform(test)
x=pd.DataFrame(transformedx)
x



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,276,277,278,279,280,281,282,283,284,285
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,730.0,140.0,0.0,0.0,0.0,120.0,0.0,0.0,6.0,2010.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,312.0,393.0,36.0,0.0,0.0,0.0,0.0,12500.0,6.0,2010.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,482.0,212.0,34.0,0.0,0.0,0.0,0.0,0.0,3.0,2010.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,470.0,360.0,36.0,0.0,0.0,0.0,0.0,0.0,6.0,2010.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,506.0,0.0,82.0,0.0,0.0,144.0,0.0,0.0,1.0,2010.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2006.0
1455,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,286.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,4.0,2006.0
1456,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,576.0,474.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2006.0
1457,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,32.0,0.0,0.0,0.0,0.0,700.0,7.0,2006.0


In [804]:
ypreds_actual =model.predict(x)

In [810]:
predicted=pd.DataFrame(ypreds_actual)

In [811]:
predicted.to_csv("preidictedbyhamid.csv")

In [812]:
ytest

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
988     195000
243     120000
1342    228500
1057    248000
1418    124000
Name: SalePrice, Length: 365, dtype: int64