In [None]:
# IPython shell escape: initialise a git repository in the current working directory.
!git init

### Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error  
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

#### Load dataset

In [None]:
# Read both CSV files the same way, using their 'Id' column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Id') for csv_path in ('train.csv', 'test.csv')
)

# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame (rendered as the cell output).
test_data.head()

In [None]:
# Print the (rows, columns) tuple of the training frame first, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

### About the dataset

The training data consists of 80 columns and 1460 rows. Each row represents a house, while each column represents a feature of that house. Some of these features are explained below.

MSSubClass: Identifies the type of dwelling involved in the sale.

MSZoning: Identifies the general zoning classification of the sale.

LotArea: Lot size in square feet

LotShape: General shape of property

Street: Type of road access to property

Utilities: Type of utilities available

OverallCond: Rates the overall condition of the house

OverallQual: Rates the overall material and finish of the house

YearBuilt: Original construction date

YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

MasVnrType: Masonry veneer type

MasVnrArea: Masonry veneer area in square feet

ExterQual: Evaluates the quality of the material on the exterior

ExterCond: Evaluates the present condition of the material on the exterior

BsmtQual: Evaluates the height of the basement

BsmtCond: Evaluates the general condition of the basement

KitchenQual: Kitchen quality

GarageYrBlt: Year garage was built

BldgType: Type of dwelling

In [None]:
# IPython shell escape: show the working-tree status of the repository.
!git status

In [None]:
# IPython shell escape: stage every change under the current directory.
!git add .

In [None]:
# IPython shell escape: commit the staged changes with the message "load datasets".
!git commit -m "load datasets"

#### data assessment and exploration

In [None]:
# Print a per-column summary of the training frame (dtype and non-null count).
train_data.info()

In [None]:
# Count nulls per column once, then show only the columns that have any.
null_counts = train_data.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Drop every column that has more than 81 missing values.
# The cutoff is 81 so that the garage detail columns are kept.
nulls_per_column = train_data.isnull().sum()
missing_cols_to_drop = nulls_per_column[nulls_per_column > 81].index.tolist()
train_data = train_data.drop(columns=missing_cols_to_drop)

# Show the columns that still contain nulls after the drop.
remaining_nulls = train_data.isnull().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# Numeric candidates: columns stored as int64 or float64.
numeric_dtypes = ['int64', 'float64']
temp_num_cols = [
    name for name, dtype in train_data.dtypes.items() if dtype in numeric_dtypes
]

# Categorical candidates: object columns with fewer than 6 distinct values.
# Object columns with more unique entries are left out to avoid high dimensionality.
temp_obj_cols = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype == 'object' and train_data[name].nunique() < 6
]

# Show the number of distinct values in each retained categorical column.
train_data[temp_obj_cols].nunique()

In [None]:
# The categories present in train and test can differ for some categorical columns.
# Collect each column that has at least one string category in train_data which
# never appears in test_data. Each column is recorded at most once, even when
# several of its categories are missing from the test frame.
# NOTE(review): only the train -> test direction is checked; categories that occur
# in test_data but not in train_data are not detected here. Confirm this is intended.
non_uniform_cat_cols = []

for col in temp_obj_cols:
    # Build the test-side lookup once per column instead of once per train value.
    test_categories = set(test_data[col].dropna().unique())
    # Only string entries are compared, so missing values (NaN) are ignored.
    train_categories = {value for value in train_data[col].unique() if isinstance(value, str)}
    if train_categories - test_categories:
        non_uniform_cat_cols.append(col)
non_uniform_cat_cols

In [None]:
# Final categorical columns: the candidates minus those flagged as non-uniform.
flagged_cols = set(non_uniform_cat_cols)
obj_cols = [col for col in temp_obj_cols if col not in flagged_cols]

In [None]:
# Summary statistics for the training frame (rendered as the cell output).
train_data.describe()

Some columns have a value of 0 even at the 3rd quartile. The means of many columns are strongly affected by extreme values. LotArea, for example, has 75% of its values below 11,700, yet it has a mean of 10,516 and a standard deviation of 9,981.

##### SalePrice

In [None]:
# SalePrice distribution: histogram (left) and swarm plot of the same column (right).
fig, (hist_ax, swarm_ax) = plt.subplots(1, 2, figsize=[15, 6])

hist_ax.hist(data=train_data, x='SalePrice', bins=50)
hist_ax.set_xlabel('SalePrice')
# The histogram is not normalised (no density=True), so the y-axis shows the
# number of rows per bin rather than a proportion.
hist_ax.set_ylabel('Number of houses')

sns.swarmplot(y=train_data['SalePrice'], ax=swarm_ax);

The SalePrice distribution is right-skewed. The modal house price is between 130,000 and 160,000, and the majority of sale prices fall between 100,000 and 220,000.

#### LotArea

In [None]:
# 2x2 grid for LotArea: histograms in the left column, violin plots in the right.
# The top row shows the full range; the bottom row is restricted with axis limits.
fig, axes = plt.subplots(2, 2, figsize=[15, 10])

for row, bin_count in enumerate((50, 80)):
    hist_ax = axes[row, 0]
    hist_ax.hist(data=train_data, x='LotArea', bins=bin_count)
    hist_ax.set_xlabel('LotArea')
    hist_ax.set_ylabel('Distribution')

    sns.violinplot(y=train_data['LotArea'], ax=axes[row, 1])

# Zoom the bottom row in on the bulk of the data.
axes[1, 0].set_xlim(0, 80000)
axes[1, 1].set_ylim(-10000, 70000);

The LotArea distribution is right-skewed. The modal LotArea lies between 8,000 and 12,000, and only about 4 houses have a LotArea greater than 100,000. The majority of LotAreas are below 30,000.

In [None]:
# Treat the houses with very large lots as outliers: keep only rows whose
# LotArea is below 100000.
lot_area_in_range = train_data['LotArea'] < 100000
train_data = train_data.loc[lot_area_in_range]

##### remodel year and saleprice

In [None]:
# Point plot of SalePrice against remodel year, drawn without confidence intervals.
# NOTE(review): newer seaborn releases replace `ci` with `errorbar`; confirm
# against the installed version before changing it.
fig, ax = plt.subplots(figsize=[15, 6])
sns.pointplot(x='YearRemodAdd', y='SalePrice', data=train_data, ci=None, ax=ax)

# Rotate the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

SalePrice tends to increase as the remodel date of the houses becomes more recent.

#### Building types and MSZoning

In [None]:
# Left: number of houses per building type. Right: bar plot of SalePrice per building type.
fig, (count_ax, price_ax) = plt.subplots(1, 2, figsize=[12, 6])

sns.countplot(x='BldgType', data=train_data, ax=count_ax)
sns.barplot(x='BldgType', y='SalePrice', data=train_data, ax=price_ax);

In [None]:
# Number of houses in each zoning class.
sns.countplot(x='MSZoning', data=train_data)

# Scatter of SalePrice against LotArea on a separate figure, coloured by zoning class.
g = (
    sns.FacetGrid(data=train_data, hue='MSZoning', height=6)
    .map(plt.scatter, 'LotArea', 'SalePrice')
)
g.add_legend();

RH - Residential High Density;
C - Commercial;
FV - Floating Village Residential;
RL - Residential Low Density;
RM - Residential Medium Density


There are more houses in the Residential Low Density areas, and they have bigger lots and cost more. Generally, SalePrice increases as LotArea increases.

In [None]:
# IPython shell escape: stage every change under the current directory.
!git add .

In [None]:
# IPython shell escape: commit the staged changes with the message "data wrangling".
!git commit -m "data wrangling"