* [MLCourse](https://mlcourse.ai/)

In [None]:
import numpy as np
import pandas as pd

import missingno
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import scipy as sp
from scipy.stats import skew

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training split and preview its first rows
train_df = pd.read_csv(filepath_or_buffer="train.csv")
train_df.head(n=5)

In [None]:
# Show fully duplicated training rows (an empty frame means there are none)
train_df.loc[train_df.duplicated(keep="first")]

In [None]:
# Percentage of missing values for every training column that has at least one gap
train_missing = train_df.isna().sum()

has_gaps = train_missing.gt(0)
train_missing = train_missing.loc[has_gaps].div(len(train_df)).mul(100)
train_missing

In [None]:
# Read the test split and preview its first rows
test_df = pd.read_csv(filepath_or_buffer="test.csv")
test_df.head(n=5)

In [None]:
# Show fully duplicated test rows (an empty frame means there are none)
test_df.loc[test_df.duplicated(keep="first")]

In [None]:
# Percentage of missing values for every test column that has at least one gap
test_missing = test_df.isna().sum()

has_gaps = test_missing.gt(0)
test_missing = test_missing.loc[has_gaps].div(len(test_df)).mul(100)
test_missing

Check target column first

In [None]:
# Distribution of the raw target in the training data.
# histplot(stat="density", kde=True) replaces the deprecated sns.distplot, whose
# deprecation warning is hidden by the warnings filter set in the import cell.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(train_df['SalePrice'], bins=30, kde=True, stat="density", ax=ax)
ax.set_title("SalePrice distribution");

In [None]:
# Log-transform the target with log1p and plot the transformed distribution.
# NOTE: SalePrice is overwritten in place, so running this cell twice applies
# the transform twice; re-load train_df before re-running.
log_sale_price = np.log1p(train_df['SalePrice'])

# histplot(stat="density", kde=True) replaces the deprecated sns.distplot
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(log_sale_price, bins=30, kde=True, stat="density", ax=ax)
ax.set_title("log1p(SalePrice) distribution");

# Store the transformed target (computed once above and reused for the plot)
train_df['SalePrice'] = log_sale_price

In [None]:
# Number of missing target values after the transform
train_df.loc[:, 'SalePrice'].isna().sum()

In [None]:
# Concatenate train/test datasets so feature engineering is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of train_df and test_df overlap and df carries duplicated index labels,
# which breaks label-based alignment and reindexing.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

Numerical Features

In [None]:
# Columns stored as numbers that are handled as categorical (object) from here on
change_type = ['MSSubClass','OverallQual','OverallCond','YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold']

# Cast all of them in one call via a column -> dtype mapping
df = df.astype({col: "object" for col in change_type})

In [None]:
# Summary statistics of the numeric columns (Id excluded), one row per column
df.drop(columns="Id").describe(include=['number']).transpose()

In [None]:
# Names of all non-object columns, leaving out the Id column
num_feat = [name for name in df.columns if df[name].dtype != "object" and name != "Id"]

In [None]:
# Pairwise correlation of the numeric variables, drawn as an annotated heatmap
corr_matrix = df[num_feat].corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix.T, annot=True, cbar=False, cmap='coolwarm', ax=ax);

In [None]:
# Same heatmap, with every cell whose correlation is below 0.8 masked out
corr_matrix = df[num_feat].corr()
below_threshold = corr_matrix.lt(0.8)

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix.T, annot=True, mask=below_threshold, cbar=False, cmap='coolwarm', ax=ax);

In [None]:
# Check how the mutually correlated variables correlate with the target column,
# to decide which of them to remove from further analysis.
price_corr_ser = (
    df[num_feat]
    .corr()['SalePrice']
    .sort_values(ascending=False)
    .drop("SalePrice")  # the target's correlation with itself carries no information
)

# Draw on the axes created here instead of relying on the implicit current axes
fig, ax = plt.subplots(figsize=(10, 12))
sns.barplot(x=price_corr_ser.values, y=price_corr_ser.index, palette="rocket_r", ax=ax)
ax.set_title("Numeric Feature Correlation with Target Column");

In [None]:
# Drop one member of each highly correlated pair
high_correlated_var = ["GarageArea",'1stFlrSF','TotRmsAbvGrd']
df = df.drop(columns=high_correlated_var)

# Keep the list of numeric columns in sync with the frame
num_feat = [name for name in num_feat if name not in high_correlated_var]

In [None]:
# Plot the estimated density (plus a rug of the observations) of every numeric variable
fig = plt.figure(figsize=(20, 20))

for position, feature in enumerate(num_feat, start=1):
    ax = fig.add_subplot(14, 5, position)
    values = df[feature].to_numpy()
    # kdeplot + rugplot replace the deprecated distplot(hist=False, rug=True);
    # bw_method=0.1 is the current spelling of kde_kws={'bw': 0.1}
    sns.kdeplot(x=values, bw_method=0.1, ax=ax)
    sns.rugplot(x=values, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel("Value")
    ax.set_ylabel("Density")  # a KDE curve shows density, not counts

# Lay the grid out once, after all axes exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Visualize the relation between each numeric feature and the target column
fig = plt.figure(figsize=(20, 20))

for position, col in enumerate(num_feat, start=1):
    ax = fig.add_subplot(12, 5, position)
    sns.scatterplot(x=df[col], y=df['SalePrice'], ax=ax)

# Lay the grid out once, after all axes exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Numerical outliers: inspect LotArea on its own and against the target
fig = plt.figure(figsize=(24, 15))

# histplot(stat="density", kde=True) replaces the deprecated sns.distplot
ax_dist = fig.add_subplot(4, 3, 1)
sns.histplot(x=df["LotArea"].to_numpy(), kde=True, stat="density", ax=ax_dist)
ax_dist.set_xlabel("LotArea")

ax_scatter = fig.add_subplot(4, 3, 2)
sns.scatterplot(x="LotArea", y="SalePrice", data=df, ax=ax_scatter);

In [None]:
# Summary statistics of LotArea
df.loc[:, "LotArea"].describe()

In [None]:
# Binary flag: 0 when PoolArea equals 0, otherwise 1; stored as an object column
df['isPool'] = df['PoolArea'].ne(0).astype(int).astype("object")

# PoolArea is replaced by the flag, so drop it from the frame and the numeric list
df = df.drop(columns='PoolArea')
num_feat.remove("PoolArea")

In [None]:
# Combine all porch area columns into a single totalPorch feature
porch_col = ['OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch']

# Row-wise sum in one vectorised call. skipna=False keeps the NaN-propagating
# behaviour of adding the columns one by one, and the float cast matches the
# dtype of an accumulator that starts at 0.0.
df['totalPorch'] = df[porch_col].sum(axis=1, skipna=False).astype(float)

# Remove the individual porch columns from the dataset in a single drop
df = df.drop(columns=porch_col)

# Keep the list of numeric columns in sync: porch columns out, totalPorch in
num_feat = [name for name in num_feat if name not in porch_col]
num_feat.append("totalPorch")

In [None]:
# Source columns that are folded into the two bath totals below
to_remove = ["FullBath","HalfBath","BsmtFullBath", "BsmtHalfBath"]

# A half bath counts as 0.5 of a full bath
df["TotBathAbvGrade"] = df["FullBath"] + df["HalfBath"] * 0.5
df["TotBsmtBath"] = df["BsmtFullBath"] + df["BsmtHalfBath"] * 0.5

# Drop the source columns from the frame and from the numeric list
df = df.drop(columns=to_remove)
for col in to_remove:
    num_feat.remove(col)

# Register the new columns as numeric
num_feat.extend(["TotBathAbvGrade", "TotBsmtBath"])

In [None]:
# LotFrontage is not used further: drop it from the frame and the numeric list
df = df.drop(columns="LotFrontage")
del num_feat[num_feat.index("LotFrontage")]

In [None]:
# Scatter every remaining numeric feature against the target again
fig = plt.figure(figsize=(15, 15))

for position, col in enumerate(num_feat, start=1):
    ax = fig.add_subplot(12, 5, position)
    sns.scatterplot(x=df[col], y='SalePrice', data=df, ax=ax)

# Lay the grid out once, after all axes exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Nullity matrix of the numeric columns
missingno.matrix(df.loc[:, num_feat], figsize=(20, 4))

In [None]:
# Take SalePrice out of the numeric list for now; a later cell appends it again
del num_feat[num_feat.index("SalePrice")]

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaNs in the numeric columns with each column's median.
# The imputer is fitted on df as it stands here (train and test rows together).
imp = SimpleImputer(missing_values=np.nan, strategy='median')
df[num_feat] = imp.fit_transform(df[num_feat])

In [None]:
# Apply log1p to every numeric feature in one vectorised call instead of a
# per-element Python lambda.
# NOTE: this overwrites the columns in place, so re-running the cell applies
# the transform again.
df[num_feat] = np.log1p(df[num_feat])

# Append SalePrice back to numeric columns (guarded so a re-run does not duplicate it)
if "SalePrice" not in num_feat:
    num_feat.append("SalePrice")

Categorical features

In [None]:
# List of categorical (object dtype) columns
cat_feat = [name for name in df.columns if df[name].dtype == "object"]

# One boxplot of the target per categorical feature
fig = plt.figure(figsize=(18, 30))

for position, col in enumerate(cat_feat, start=1):
    ax = fig.add_subplot(12, 5, position)
    sns.boxplot(x=col, y='SalePrice', data=df, ax=ax)
    ax.set_ylabel("Log() SalePrice")

# Lay the grid out once, after all axes exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Percentage of missing values per categorical column (only columns with gaps)
cat_missing = df[cat_feat].isna().sum()
cat_missing = cat_missing[cat_missing > 0].div(len(df[cat_feat])).mul(100)

# Sort once and reuse the result for both axes of the bar chart
ranked_missing = cat_missing.sort_values(ascending=False)

plt.figure(figsize=(10,5))
sns.barplot(x=ranked_missing.values, y=ranked_missing.index)
plt.title("Missing Categorical Values in %");

In [None]:
# Fill missing values in categorical columns with the string "NA".
# Only columns that actually contain NaN are touched; the rest are left as they are.
cols_with_gaps = [name for name in cat_feat if df[name].isna().sum() > 0]

for col in cols_with_gaps:
    df[col] = df[col].fillna(value="NA")

In [None]:
# Nullity matrix of the categorical columns after filling
missingno.matrix(df.loc[:, cat_feat], figsize=(20, 4))

In [None]:
# for col in cat_missing.columns:
    # print(f" Column '{col}' has unique values {df[col].unique()}")

In [None]:
# Recompute the share of missing categorical values; empty when nothing is missing
cat_missing = df[cat_feat].isna().sum()
cat_missing = cat_missing[cat_missing > 0].div(len(df[cat_feat])).mul(100)

cat_missing

### Feature engineering for categorical variables

In [None]:
# New series: 1 when GarageYrBlt holds anything other than the "NA" filler, else 0
is_garage = df['GarageYrBlt'].ne("NA").astype(int)

# Plot the new series: class counts on the left, target by class on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4))

# Pass the series by keyword: the first positional parameter of countplot is not
# the same across seaborn versions, so a positional call is not portable
sns.countplot(x=is_garage, ax=axes[0])
sns.boxplot(x=is_garage.values, y='SalePrice', data=df, ax=axes[1])

axes[0].set_xlabel("Is Garage")
axes[0].set_ylabel("Count")  # the left panel counts rows, it does not show prices

axes[1].set_xlabel("Is Garage")
axes[1].set_ylabel("SalePrice ");

In [None]:
# Start the list of columns to remove with GarageYrBlt
to_remove = ["GarageYrBlt"]

# Store the garage flag built from GarageYrBlt as a categorical (object) column
df['isGarage'] = is_garage.astype(object)
cat_feat.append("isGarage")

**YearRemodAdd column**

In [None]:
# Queue the column for removal first, so that the unique values are the cell's
# last expression and are actually displayed
to_remove.append("YearRemodAdd")
df['YearRemodAdd'].unique()

I don't see any value from this column, therefore I will drop it later on.

**YearBuilt & YrSold columns**

In [None]:
# Age of the house at sale time: year sold minus year built
how_old = df['YrSold'].astype(int).sub(df['YearBuilt'].astype(int))

# Store it as a new column and register it as numeric
df['Old_in_Years'] = how_old
num_feat.append("Old_in_Years")

# Both source columns are queued for removal
to_remove.extend(['YrSold', 'YearBuilt'])

In [None]:
# Build the submission frame and write it to price_predict.csv without the index.
# NOTE(review): `tsd` and `y_pred_random_forest` are not defined anywhere in the
# visible notebook, and the "PassengerId"/"Survived" names do not match the
# "Id"/"SalePrice" columns used in the cells above. This cell looks carried over
# from a different notebook; confirm and adapt before running.
submission = pd.DataFrame({
        "PassengerId": tsd["PassengerId"],
        "Survived": y_pred_random_forest
    })

submission.to_csv('price_predict.csv', index=False)

In [None]:
# Shape of the prediction array.
# NOTE(review): y_pred_random_forest is not defined in the visible notebook — confirm.
y_pred_random_forest.shape

In [None]:
# Display the submission frame built above
submission