In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
# import missingno as mns (pip install missingno)

In [22]:
# linear regression deals with continous data 
df = pd.read_csv('house_data.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [23]:
# TO CREATE A TABLE SHOWING THE MISSING VALUES AND ITS PERCENTAGE USING CONCATENATION

missing_column_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum()/ len(df)) * 100
total_missing_values = pd.concat([missing_column_values, missing_columns_per], axis = 1, keys = ['missing Values', 'Percentage'])
total_missing_values = total_missing_values.sort_values('Percentage', ascending= False)
total_missing_values.head(20)

Unnamed: 0,missing Values,Percentage
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
MasVnrType,872,59.726027
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageYrBlt,81,5.547945
GarageCond,81,5.547945
GarageType,81,5.547945


In [24]:
# WE DROP THE FIRST FOUR COLUMNS 
df.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis= 1, inplace= True)

In [25]:
# TO FILL UP THE MISSING VALUES USING LOOP 
numerical_data = df.select_dtypes(include = ['int', 'float'])
categorical_data = df.select_dtypes(include = ['object', 'category'])
for x in numerical_data:
    df[x].fillna(np.mean(df[x]), inplace = True) # TO FILLUP THE NUMERICL DATA WITHT THE MEAN USING LOOP
    
for x in categorical_data:
    df[x].fillna(df[x].mode()[0], inplace = True) # TO FILL UP THE STRAIN VARIABLE USING THE MODE

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]), inplace = True) # TO FILLUP THE NUMERICL DATA WITHT THE MEAN USING LOOP
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace = True) # TO FILL UP THE STRAIN VARIABLE USING THE MODE


In [26]:
missing_column_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum()/ len(df)) * 100
total_missing_values = pd.concat([missing_column_values, missing_columns_per], axis = 1, keys = ['missing Values', 'Percentage'])
total_missing_values = total_missing_values.sort_values('Percentage', ascending= False)
total_missing_values.head(20)

Unnamed: 0,missing Values,Percentage
Id,0,0.0
HalfBath,0,0.0
FireplaceQu,0,0.0
Fireplaces,0,0.0
Functional,0,0.0
TotRmsAbvGrd,0,0.0
KitchenQual,0,0.0
KitchenAbvGr,0,0.0
BedroomAbvGr,0,0.0
FullBath,0,0.0


In [27]:
# FIRST PROCESS OF MACHINE LEARNUNG(analysus to predict the future using analysis) IS CALLED PREPROCESSING
# MACHINE LEARNING DOES NOT UNDERSTAND CHARACTERS BUT ONLY UNDERSTANDS BINSRY NUMBERS WHICH IS CONVERTED USING THE PRWPROCESSING
# LABEL ENCODER WILL PICK THE UNIQUE VALUES AND THEN LABEL THEM WITH NUMBERS AND THE IT REPLACES THE ALPHABETS WITH THE RESPECTIVE NUMBERS
# using linear encoder:
encoder = LabelEncoder()
categorical_data = df.select_dtypes(include = ['object', 'category'])
for x in categorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,...,0,0,0,0,0,12,2008,8,4,250000


In [28]:
# Next is to seperate our data to X and Y
# irrelevant columns in machine learning are: 1. Date 2. Name 3. Address 4. Desciption 5. ID
# seperating the dependent variables from the independent variable (x is the independent variable and y if the depedent variable )
x = df.drop(['Id', 'SalePrice'], axis = 1)
y = df['SalePrice']

In [29]:
# next: spliT X INTO TWO PARTS WHILE Y IS ALSO SPLITED TO TWO WHICH IN TOTAL IS FOUR 
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size = 0.15)
# the xtest will have the 15% and the xtrain will have 85% and the Ytest will have the 15% and the ytrain will have the 85%

In [None]:
# THINGS TO LEARN:  STREAMLIT: THIS ENABLES US TO SET UP A DASHBOARD. HOW TO ARRANGE THE DASHBOARD
# next thing is to trend the model


In [30]:
# the linear regression will take the trend data and predict 
model1 = LinearRegression()
model1.fit(xtrain,ytrain)

In [31]:
# for the linesr regression we have to calculate the mean absolute error('the original mean - regression mean), mean squared error, r2 score which tell us the goodness of the linear regression model
pred1 = model1.predict(xtest)
print(mean_absolute_error(ytest,pred1))

18705.197970273093


In [32]:
print(mean_absolute_error(ytest,pred1))

18705.197970273093


In [33]:
print(mean_squared_error(ytest,pred1))

1281953418.4524312


In [34]:
print(np.sqrt(mean_squared_error(ytest,pred1)))

35804.377085105545


In [35]:
print(r2_score(ytest,pred1))

0.8353045868933244


In [36]:
# for decision tree regressord we remove the algorithm

model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)

In [37]:
pred2= model2.predict(xtest)
print(mean_absolute_error(ytest,pred2))
print(mean_absolute_error(ytest,pred2))
print(mean_squared_error(ytest,pred2))
print(np.sqrt(mean_squared_error(ytest,pred2)))
print(r2_score(ytest,pred2))


15957.387260273972
15957.387260273972
783041962.6439136
27982.886960496297
0.8994008536806388
