<a href="https://colab.research.google.com/github/Karlajack/MachineLearning1/blob/main/housepoject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## import the relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
import missingno as msno

In [6]:
# Import the data: read the training and test CSV files into DataFrames.
# NOTE(review): the paths are absolute Colab paths; adjust them when running elsewhere.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Preview the first two rows of the training data.
train.head(n=2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [7]:
# Normalise every column label of the training frame to lower case.
train.columns = [str.lower(label) for label in train.columns]
# train.columns = map(str.lower, train.columns)  # would also suffice
train.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Explore the missing values

In [8]:
# Summarise the missing data per column: absolute count and percentage of rows.
# Columns above a chosen percentage of missing values could later be dropped.
# The null mask is computed once and reused instead of calling isnull() repeatedly.
null_mask = train.isnull()
total_missing = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean mask is the fraction of missing rows; align it to the sorted order.
percent = null_mask.mean().reindex(total_missing.index)
missing_df = pd.concat(
    [total_missing, percent * 100],  # convert the fraction to a percentage
    axis=1,
    keys=['total_missing', 'percentage_missing'],
    sort=False,
)
missing_df.index.name = 'variable'  # set the index name to variable
# Only retain the columns that have missing values. Filtering last and taking an
# explicit copy avoids assigning into a filtered slice (SettingWithCopyWarning).
missing_df = missing_df.loc[missing_df['total_missing'] > 0].copy()
missing_df

Unnamed: 0_level_0,total_missing,percentage_missing
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
poolqc,1453,99.520548
miscfeature,1406,96.30137
alley,1369,93.767123
fence,1179,80.753425
masvnrtype,872,59.726027
fireplacequ,690,47.260274
lotfrontage,259,17.739726
garageyrblt,81,5.547945
garagecond,81,5.547945
garagetype,81,5.547945


Explore all the categorical variables

In [None]:
## get their names first
# `np.object` was deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises
# AttributeError. The string alias 'object' selects the same object-dtype columns.
categorical = train.select_dtypes(include=['object'])
categorical.head()

In [None]:
# Names of the variables with missing values that are also categorical columns.
missing_vars = missing_df.index[missing_df.index.isin(categorical.columns)].tolist()

In [None]:
# Move 'variable' out of the index into a regular column, then keep only the
# rows that describe categorical columns.
missing_df_resetted = missing_df.reset_index()
missing_df_cat = missing_df_resetted.loc[
    missing_df_resetted['variable'].isin(categorical.columns)
]

In [None]:
## plot now
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only produced an extra empty figure and did not resize
# the plot. The size is controlled through height/aspect instead.
# NOTE: despite its name, `ax` holds the FacetGrid returned by catplot.
ax = sns.catplot(
    y='variable',
    x='percentage_missing',
    data=missing_df_cat,
    kind='bar',
    color='red',
    height=8,
    aspect=1.25,
)

exploring numerical variables

In [None]:
# Select every column of the training frame that has a numeric dtype.
numerical = train.select_dtypes(include='number')
numerical.columns

In [None]:
# Preview the first four rows of the numeric columns.
numerical.head(4)

In [None]:
# Missing-value summary restricted to the numerical variables.
missing_df_resetted.loc[missing_df_resetted['variable'].isin(numerical.columns)]

In [None]:
# Handling missing values
# If a variable is of object type we impute with 'None', while for numeric types we impute with the mean.

In [None]:
# Preview only: the filled frame is not assigned to anything, so nothing is stored.
categorical.fillna('None').head()

handling both categorical and numeric separately

In [None]:
# Fill the gaps in these numeric columns with each column's most frequent value.
# NOTE(review): the notebook text says numeric variables are imputed with the mean,
# but this cell uses the mode - confirm which one is intended.
for var in ('lotfrontage', 'garageyrblt', 'masvnrarea'):
    most_frequent = train[var].mode()[0]
    train[var] = train[var].fillna(most_frequent)

In [None]:
# For these categorical columns a missing entry is replaced by the literal label 'None'.
none_fill_columns = [
    'alley', 'miscfeature', 'poolqc', 'fence', 'fireplacequ', 'garagecond', 'garagequal',
    'garagefinish', 'garagetype', 'bsmtfintype2', 'bsmtfintype1', 'bsmtqual', 'bsmtcond',
    'bsmtexposure', 'masvnrtype', 'electrical',
]
train[none_fill_columns] = train[none_fill_columns].fillna('None')

In [None]:
# Count the missing values per column, largest counts first.
train.isnull().sum().sort_values(ascending=False)

In [None]:
# LotFrontage is correlated to the 'Neighborhood' feature because the LotFrontage for nearby houses will be really similar, so we fill in missing values by the median based off of Neighborhood
# dataset["LotFrontage"] = dataset.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

Continuous numerical variables

In [None]:
# Columns whose label contains 'yr' or 'year' are treated as temporal variables.
temporal_vars = [
    name for name in train.columns
    if any(marker in name for marker in ('yr', 'year'))
]
temporal_vars

In [None]:
# NOTE(review): bare list literal - looks like pasted output of `temporal_vars`; it is only displayed.
['yearbuilt', 'yearremodadd', 'garageyrblt', 'yrsold']

In [None]:
# Preview the first rows of the numeric columns.
numerical.head()

In [None]:
# A numeric column counts as discrete when it has fewer than 25 distinct values.
discrete = [name for name in numerical.columns if train[name].unique().size < 25]
# Continuous = numeric columns that are neither temporal, nor the id, nor discrete.
not_continuous = temporal_vars + ["id"] + discrete
continous_numerical = [name for name in numerical.columns if name not in not_continuous]
print("there are {} continous variables in this dataset".format(len(continous_numerical)))

there are 16 continous variables in this dataset

Explore the relationship of these continuous variables with selling price

In [None]:
# Preview the first rows of the continuous columns.
train[continous_numerical].head()

Explore the relationship of these continuous variables with selling price

In [None]:
# Pairwise correlation matrix of the continuous columns.
train[continous_numerical].corr()

In [None]:
# Keep the correlation matrix of the continuous columns for the cells below.
corr_df=train[continous_numerical].corr()
# Show the five rows with the largest values in the 'saleprice' column.
corr_df.nlargest(5,'saleprice')

In [None]:
# Correlation of every continuous column with saleprice, strongest positive first.
corr_df['saleprice'].sort_values(ascending=False)

In [None]:
# Annotated heat map of the correlation matrix, drawn on an explicit 14x14 inch axes.
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(data=corr_df, annot=True, cmap='viridis', ax=ax)

In [None]:
# The ten rows of the correlation matrix with the largest 'saleprice' correlation.
cols = corr_df['saleprice'].nlargest(10).index
cols

In [None]:
# Recompute the correlation matrix of the selected columns with NumPy and plot it.
selected_values = train[cols].values
corrmap = np.corrcoef(selected_values.T)
tick_labels = cols.values
sns.heatmap(corrmap, annot=True, xticklabels=tick_labels, yticklabels=tick_labels)

In [None]:
# Explore the relationship between the discrete variables and selling price

In [None]:
# List the distinct values of the 'mssubclass' column.
train['mssubclass'].unique()

In [None]:
# Preview the first rows of the discrete columns.
train[discrete].head()

In [None]:
def plot():
    """Draw one bar chart per discrete variable showing the mean sale price per level.

    Reads the notebook-level ``discrete`` list and ``train`` frame. The columns
    'mssubclass' and '3ssnporch' are skipped.
    """
    # The original condition `not a or not b` was true for every name, so nothing
    # was ever skipped; a membership test expresses the intended exclusion.
    skipped = ('mssubclass', '3ssnporch')
    for var in discrete:
        if var in skipped:
            continue
        train.groupby(var)['saleprice'].mean().plot.bar(color='tan')
        plt.ylabel('selling price of house')
        plt.title(var)
        plt.grid(True)
        plt.show()

In [None]:
# Render the bar charts for the discrete variables.
plot()

how the temporal variables relate to selling price

In [None]:
# Show the list of temporal variables.
temporal_vars

In [None]:
def plt_temporal():
    """Scatter each temporal variable (except 'yrsold') against the sale price.

    Reads the notebook-level ``temporal_vars`` list and ``train`` frame and shows
    one figure per plotted variable.
    """
    for var in temporal_vars:
        if var == 'yrsold':
            continue
        # Open a fresh, equally sized figure for every variable. A single figure
        # created outside the loop is used up by the first plt.show(), which left
        # every later plot at the default size.
        plt.figure(figsize=(12, 6))
        plt.scatter(x=train[var], y=train['saleprice'], c='red')
        plt.xlabel(var)
        plt.ylabel('selling price')
        plt.title(var.upper())
        plt.show()
plt_temporal()

Outliers

In [None]:
# Outlier plan: identify, plot (univariate, then multivariate), then remove.
# Univariate check: one box plot per continuous variable.
# NOTE(review): `numerical` was selected from `train` before the missing values were
# imputed, so these plots use the un-imputed values - confirm that is intended.
continous_df = numerical[continous_numerical]  # get the continuous variables data frame

# Derive the grid from the number of variables instead of a fixed 9x5 layout, which
# left most of the figure empty and would fail with more than 45 columns.
n_cols = 4
n_rows = max(1, -(-len(continous_df.columns) // n_cols))  # ceiling division
fig = plt.figure(figsize=(16, 4 * n_rows))
for position, column in enumerate(continous_df.columns, start=1):
    sns.boxplot(
        x=continous_df[column],
        color="cyan",
        ax=fig.add_subplot(n_rows, n_cols, position),
    )
plt.tight_layout()
plt.show()

The interquartile-range (IQR) rule states that a data point is an outlier if:
It is below the first quartile (Q1) minus 1.5 x IQR, or
It is above the third quartile (Q3) plus 1.5 x IQR

In [None]:
## bivariate outlier analysis: scatter each continuous variable against saleprice
predictors = [name for name in continous_df if name not in ('id', 'saleprice')]
for var in predictors:
    sns.scatterplot(x=continous_df[var], y=continous_df['saleprice'])
    plt.title(var.upper())
    plt.show()