### Content
- [Problem Statement](#Problem-Statement)
- [Libraries Used](#Importing-Libraries)
- [Data Used](#Load-Data)
- [EDA](#EDA)
- [Prediction for Kaggle Submission](#Make-Prediction-Using-"Test.csv"-for-Kaggle-Submission)

### Problem Statement
<br>
New buyers of residential properties usually are unfamiliar if a residential is over-priced or under-priced. You are tasked to create a model that will help buyers to evaluate the reasonable selling price for residential properties so that they are not over-paying for them and know when a property is a good deal.

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score

### Functions Definition

In [8]:
def boxplot_count_subplot(x, feature_list, data, row, col, figure_size=(20, 15)):
    # This function plots boxplot and count bar graph for each feature as as subplots
    # Declaring and initialising variables
    fig, ax = plt.subplots(figsize=figure_size, nrows=2, ncols = 3)
    i = 0
    
    # Loop through feature_list to create subplot for each feature
    for feature in feature_list:
        sns.boxplot(x=x, y=feature, data=data, ax=ax[0,i], order=data[feature].value_counts().index)
        sns.barplot(x=data[feature].value_counts(),y=data[feature].value_counts().index, ax=ax[1, i])
        i+=1

#----------------------------------------------------------------------------------------------        
        
def corr_plots(data, heatmap_size=(8, 8), heatmap_title_fontsize = 16, pair_title_fontsize = 16):
    # Heatmap
    plt.figure(figsize=heatmap_size)
    pearson_corr = data.corr()
    mask = np.zeros_like(pearson_corr)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(pearson_corr, annot=True, mask=mask, vmax = 1, vmin = -1)
    plt.title('Correlation Heatmap', fontweight = 'bold', fontsize=heatmap_title_fontsize)
    
    # Plot Pairplot
    pair = sns.pairplot(data)
    pair.fig.suptitle('Pairplot Between Features', fontweight = 'bold', fontsize=pair_title_fontsize)
    pair.fig.subplots_adjust(top=0.9)

#----------------------------------------------------------------------------------------------
    
def scatter_plot_3_features(data, x, y, features_list, rows, cols, 
                        figsize=(12,12), t_fontsize=10, t_dist_top = 0.9, marker_size=10):
    '''
    data: DataFrame of the data
    x: String. Numeric common feature to be set as x axis of all subplots
    y: String. Numeric common feature to be set as x axis of all subplots
    Features: List of strings to be set as hue in scatterplot
    rows: No of rows
    cols: No of cols
    figsize: tuple (Width, Height)
    '''
    fig, ax = plt.subplots(figsize=figsize, nrows=rows, ncols=cols)
    for i in range(len(features_list)):
        if rows > 1 and cols > 1:
            sns.scatterplot(x=x, y=y, data=data, hue=features_list[i], ax=ax[int(i/cols),i%cols], s=marker_size)
            ax[int(i/cols),i%cols].set_title(f"{y} vs {x} By {features_list[i]}")
        else:
            sns.scatterplot(x=x, y=y, data=data, hue=features_list[i], ax=ax[i], s=marker_size)
            ax[i].set_title(f"{y} vs {x} By {features_list[i]}")
    plt.suptitle(f"{y} vs {x} By Various Features", fontweight='bold', fontsize=t_fontsize)
    fig.subplots_adjust(top=t_dist_top)

#----------------------------------------------------------------------------------------------
    
def scatter_plot_by_labels(data, x, y, feature, rows, cols,
                          figsize=(12,12), t_fontsize=10, t_dist_top = 0.9, marker_size=10, ci=95):
    fig, ax = plt.subplots(figsize = figsize, nrows = rows, ncols = cols, sharex=True, sharey=True, squeeze=False)
    for i in range(data[feature].nunique()):
        sns.regplot(x=x, y=y, data=data[data[feature]==data[feature].unique()[i]],
                    ax=ax[int(i/cols),i%cols], ci=ci, line_kws={'color':'red'},
                    scatter_kws={'s':marker_size})
        ax[int(i/cols),i%cols].set_title(data[feature].unique()[i])
    fig.suptitle(f'{y} vs {x} by {feature}', fontsize=t_fontsize, fontweight="bold")
    fig.subplots_adjust(top=t_dist_top)

#----------------------------------------------------------------------------------------------

def prePost_logtransform(data, x, y, feature, label, figsize=(12,12), t_fontsize=10,
                         t_dist_top = 0.9):
    fig, ax = plt.subplots(figsize=figsize, nrows=1, ncols=2, sharey=True)
    # Comparison of scatter plot before and after transformation of x 
    # Plot for Sale Price vs Lot Area - Before Transformation for feature = label
    sns.regplot(x=x, y=y, data=data[data[feature]==label], ax=ax[0])
    ax[0].set_title('Sale Price vs Lot Area - Before Transformation')
    ax[0].set_xticks(ax[0].get_xticks())
    ax[0].set_xticklabels(ax[0].get_xticks(), rotation=90)

    # Plot for Sale Price vs Lot Area - After Transformation for Neighborhood = Edwards
    sns.regplot(x=np.log(data[data[feature]==label][x]), y=data[data[feature]==label][y], ax=ax[1])
    ax[1].set_title('Sale Price vs Lot Area - After Transformation')

    # Set Main Title
    fig.suptitle(f'Effect of Transformation on Lot Area for {feature}: {label}', fontsize = t_fontsize, fontweight='bold')
    fig.subplots_adjust(top=t_dist_top)
    
    # Correlation Values
    temp = data[data[feature]==label].copy()
    new_x = x + '_log'
    temp[new_x] = np.log(temp[x])
    pearson_corr = temp[[x, new_x, y]].corr()
    print(pearson_corr)

#----------------------------------------------------------------------------------------------

def imputing_lot_frontage(data):
    temp = data[data['Lot Area'] < 30000][['Lot Area', 'Lot Frontage']]
    temp.dropna(inplace=True)

    # Train Test Split Data Set
    x_train, x_test, y_train, y_test = train_test_split(temp[['Lot Area']], temp['Lot Frontage'], train_size = 0.8, random_state=60)

    # Instantiate OLS Model
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    # Get index for missing Lot Frontage data for Lot Area < 30000
    missing_index = data[(data['Lot Area'] < 30000) & (data['Lot Frontage'].isnull())]['Lot Frontage'].index

    # Imputing data and save into Lot Frontage_Imputed
    data['Lot Frontage_Imputed'] = data['Lot Frontage']
    for i in missing_index:
        data.loc[i, 'Lot Frontage_Imputed'] = lr.predict([[data.loc[i, 'Lot Area']]])
        
    return data 
    
#----------------------------------------------------------------------------------------------
    
def clean(data):
    data=data.copy()
    # Converting MS SubClass, Overall Qual, Overall Cond to string as it is categorical data
    data['MS SubClass'] = data['MS SubClass'].astype('str')
    data['Overall Qual'] = data['Overall Qual'].astype('str')
    data['Overall Cond'] = data['Overall Qual'].astype('str')
    
    # Replace all 'NaN' from object columns
    for column in data.columns:
        if data[column].dtype == 'O':
            if column in ['Mas Vnr Type', 'Misc Feature']:
                data[column].replace(np.nan, 'None', inplace=True)
            else:
                data[column].replace(np.nan, 'n.a', inplace=True)
    
    # New Column for 'Garage Yr Blt and Impute missing 'Garage Yr Blt' rows with 'Year Built' data
    data['Garage Yr Blt_Imputed'] = data['Garage Yr Blt'].fillna(value=data['Year Built'])
    
    # Imputing Lot Frontage based on Linear Regression Model using Lot Area
    data = imputing_lot_frontage(data)
               
    # Set 0 for rows with house style as 1story but 2nd level SF is not 0
    data.loc[(data['House Style'] == '1Story') & (data['2nd Flr SF'] > 0), '2nd Flr SF'] = 0
 
    return data

#----------------------------------------------------------------------------------------------

def prep(data):
    data=data.copy()
    # Engineering new numeric features
    data['Total SF'] = data['Total Bsmt SF'] + data['1st Flr SF'] + data['2nd Flr SF']
    data['Total SF**2'] = (data['Total Bsmt SF'] + data['1st Flr SF'] + data['2nd Flr SF'])**2
    data['No Of Bath'] = data['Bsmt Full Bath'] + (data['Bsmt Half Bath']/2) + data['Full Bath'] + (data['Half Bath']/2)
    data['Total Porch SF'] = data['Open Porch SF'] + data['Enclosed Porch'] + data['3Ssn Porch'] + data['Screen Porch']
    data['Amenities SF'] = data['Pool Area'] + data['Wood Deck SF'] + data['Garage Area']
    data['Age_Sold'] = data['Yr Sold'] - data['Year Built']
    data['Remod/Add Age_Sold'] = data['Yr Sold'] - data['Year Remod/Add']
    data['Gr Liv Area**2'] = data['Gr Liv Area']**2

    # Dropping Lot Frontage and Garage Yr Blt
    data.drop(columns = ['Lot Frontage', 'Garage Yr Blt'], inplace=True)
                                                         
    return data

#----------------------------------------------------------------------------------------------

def residual_plot(model, predictors, target, fig_title):
    plt.scatter(model.predict(predictors), target - model.predict(predictors))
    plt.axhline(y = 0, color='red')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residual (Target - Predicted)')
    plt.title(fig_title)

#----------------------------------------------------------------------------------------------

def output_result(model_type, features, Train_R2, Train_CV_R2, Test_R2, Train_RMSE, Train_CV_RMSE, Test_RMSE):
    results = pd.DataFrame([[model_type, str(features), Train_R2, Train_CV_R2, Test_R2, Train_RMSE, Train_CV_RMSE, Test_RMSE]])
    results.to_csv(r'..\datasets\results.csv', index=False, mode='a', header=False)

### Make Prediction Using "Test.csv" for Kaggle Submission

In [798]:
# Prep data
test_X = clean_prep(test)
test_X = test_X[model_features_list].copy()

NameError: name 'clean_prep' is not defined

In [None]:
# Checking for Null values
temp = test_X.isnull().sum()
for i in range(len(temp)):
    print(temp.index[i], ': ',  temp[i], ', Percentage of All Rows', ": ", temp[i]/test_X.shape[0], ' Type: ', test_X.dtypes[i])

In [None]:
#Fill missing row with mean
if 'Mas Vnr Area' in test_X.columns:
    test_X['Mas Vnr Area'] = test_X['Mas Vnr Area'].fillna(X_train['Mas Vnr Area'].mean())

# One Hot Encoding
test_X_OHE = pd.get_dummies(data=test_X, drop_first=False)

In [None]:
# Features in trained model but not in test data (after One Hot Encoding)
missing_feature_in_test = []
for feature in X_train.columns:
    if feature not in test_X_OHE.columns:
        missing_feature_in_test.append(feature)
        
# Features in trained model but not in test data (after One Hot Encoding)
missing_feature_in_train = []
for feature in test_X_OHE.columns:
    if feature not in X_train.columns:
        missing_feature_in_train.append(feature)

In [None]:
# Create new features which are in training data model but not in test data and assign 0 to them
for feature in missing_feature_in_test:
       test_X_OHE[feature] = 0

# Drop features in test data which is not in trained model
test_X_OHE.drop(columns=missing_feature_in_train, inplace=True)

# Scale using StandardScaler
test_X_OHE_scaled = ss.transform(test_X_OHE)

In [None]:
# Predict using test data using ridge regression
y_test_predict = np.log(ridge_model.predict(test_X_OHE_scaled))

# Put prediction results into dataframe with Id column
test_result = pd.DataFrame(y_test_predict, columns = ['SalePrice'])
test_result['Id'] = test['Id']

# Write to csv file
test_result.to_csv('..\datasets\kaggle_submission_ridge.csv', index = False, columns=['Id', 'SalePrice'])

In [None]:
# Predict using test data using lasso regression
y_test_predict = np.log(lasso_model.predict(test_X_OHE_scaled))

# Put prediction results into dataframe with Id column
test_result = pd.DataFrame(y_test_predict, columns = ['SalePrice'])
test_result['Id'] = test['Id']

# Write to csv file
test_result.to_csv('..\datasets\kaggle_submission_lasso.csv', index = False, columns=['Id', 'SalePrice'])

### Conclusion

### Recommendations