In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import boxcox, inv_boxcox

In [49]:
# Load the Kaggle train / test splits in one pass (train first, test second).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [50]:
# Preview the first five training rows.
train_df.head(n=5)

In [51]:
# Preview the first five test rows.
test_df.head(n=5)

In [52]:
# Number of rows in the training frame.
train_df.shape[0]

In [53]:
# Number of rows in the test frame.
test_df.shape[0]

In [54]:
# Name of the column to predict; it is dropped from train_df before the
# train/test concat below and transformed in place later on.
target_feature = 'SalePrice'

In [55]:
# Stack the training features (target removed) on top of the test features
# so column-level statistics can be computed over both splits at once.
train_features = train_df.drop(columns=target_feature)
df = pd.concat([train_features, test_df])

df.head(n=5)

In [56]:
# Row count of the combined train + test frame.
df.shape[0]

In [57]:
# Print the column dtypes and non-null counts of the training frame.
train_df.info()

## Check Missing Values

In [58]:
import missingno

# Completeness bar chart for the first 40 columns of the combined frame.
missingno.bar(df.iloc[:, 0:40])

In [59]:
# Completeness bar chart for columns 40..79 of the combined frame.
missingno.bar(df.iloc[:, 40:80])

In [60]:
# Percentage of missing values per training column, worst first.
(
    train_df.isna()
    .sum()
    .sort_values(ascending=False)
    .mul(100)
    .div(len(train_df))
)

In [61]:
# Percentage of missing values per test column, worst first.
# The denominator must be the test row count (the two splits differ in
# length), otherwise the percentages are computed against the wrong total.
test_df.isna().sum().sort_values(ascending=False) * 100 / len(test_df)

## Analyzing Target Feature

In [62]:
# Distribution of the raw target. `sns.distplot` is deprecated, so use
# `histplot` with a KDE overlay on a density scale, which is the documented
# replacement for distplot's default histogram + KDE view.
sns.histplot(train_df[target_feature], kde=True, stat="density")

### The target is highly skewed, and skewed data is a poor fit for linear regression,
### so transform it to a scale where the skewness is close to 0

In [63]:
# Sample skewness of the target column.
train_df.loc[:, target_feature].skew()

### A Box-Cox lambda of 0 is the natural-log transform

In [64]:
# Box-Cox lambda applied to the target.
target_boxcox_val = 0

# Skewness of the target after the candidate transform (nothing is stored yet).
transformed_target = boxcox(train_df[target_feature], target_boxcox_val)
transformed_target.skew()

In [65]:
# Overwrite the target column with its Box-Cox transform.
# NOTE: this mutates train_df in place, so re-running the cell applies the
# transform a second time. Run it exactly once per kernel session.
train_df[target_feature] = boxcox(train_df[target_feature], target_boxcox_val)

In [66]:
# Distribution of the target column as currently stored in train_df.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement.
sns.histplot(train_df[target_feature], kde=True, stat="density")

###  Id 

In [67]:
# Count of distinct Id values in the combined frame.
df.loc[:, 'Id'].nunique()

In [68]:
# Row count of the combined frame, to compare against the distinct Id count.
df.shape[0]

## All the Id values are unique, so the column should be dropped: it gives the model no useful information

## MSSubClass

In [69]:
# Number of missing MSSubClass values in the combined frame.
df['MSSubClass'].isna().sum()

In [70]:
# Frequency of each MSSubClass value in the training split.
train_df.loc[:, 'MSSubClass'].value_counts()

In [71]:
# Frequency of each MSSubClass value in the test split.
test_df.loc[:, 'MSSubClass'].value_counts()

In [72]:
# Median MSSubClass over the combined frame.
df.loc[:, 'MSSubClass'].median()

In [73]:
# Distribution of MSSubClass over the combined frame. `sns.distplot` is
# deprecated; `histplot` with a KDE overlay on a density scale is the
# documented replacement.
sns.histplot(df['MSSubClass'], kde=True, stat="density")

In [74]:
# Median MSSubClass of the combined frame; read by remove_subclass_outliers.
subclass_median = df.loc[:, 'MSSubClass'].median()
subclass_median

In [75]:
def remove_subclass_outliers(x):
    """Return ``x`` unless it lies outside [20, 100].

    Values strictly above 100 or strictly below 20 are replaced by the
    module-level ``subclass_median``; everything else is returned unchanged.
    """
    is_outlier = x > 100 or x < 20
    return subclass_median if is_outlier else x

In [76]:
# Show only the object-dtype columns of the training frame.
train_df.select_dtypes(include='object')

In [77]:
# train_df['MSSubClass'] = train_df['MSSubClass'].apply(remove_subclass_outliers)
# test_df['MSSubClass'] = test_df['MSSubClass'].apply(remove_subclass_outliers)
# df['MSSubClass'] = df['MSSubClass'].apply(remove_subclass_outliers)

## SaleCondition

In [78]:
# Number of missing SaleCondition values in the combined frame.
df['SaleCondition'].isna().sum()

In [79]:
# Frequency of each SaleCondition value over train + test.
df.loc[:, 'SaleCondition'].value_counts()

In [80]:
# Frequency of each SaleCondition value in the training split.
train_df.loc[:, 'SaleCondition'].value_counts()

In [81]:
# Frequency of each SaleCondition value in the test split.
test_df.loc[:, 'SaleCondition'].value_counts()

In [82]:
# Target values of the training rows, grouped by SaleCondition.
sns.stripplot(data=train_df, x='SaleCondition', y=target_feature)

## Baseline Model

### Ignore the Id column and keep all remaining features as numerical data (categoricals are label-encoded)

### Drop the columns that contain missing data from the dataset

In [83]:
# Work on independent copies so the baseline experiment cannot alter the
# frames used by the exploratory cells.
baseline_train_df, baseline_test_df, baseline_df = (
    frame.copy() for frame in (train_df, test_df, df)
)

### Drop Missing Value Features

In [84]:
# Remove, in place, every column that contains at least one missing value.
baseline_df.dropna(axis='columns', inplace=True)
baseline_train_df.dropna(axis='columns', inplace=True)
baseline_test_df.dropna(axis='columns', inplace=True)

In [85]:
# Columns of the combined frame that survived the missing-value drop.
baseline_df.columns

In [86]:
# Do the train and test baseline frames hold exactly the same column names?
set(baseline_train_df.columns) == set(baseline_test_df.columns)

In [87]:
# Columns present in the train baseline frame but absent from the test one.
set(baseline_train_df.columns).difference(baseline_test_df.columns)

In [88]:
# Columns available in both the train and the test baseline frames.
# Iterating a set of strings yields an order that can change between kernel
# restarts (string hashing is randomized per process), which made the feature
# order, and hence the fitted design matrix, non-reproducible. Walking the
# training columns keeps the original, deterministic column order instead.
test_columns = set(baseline_test_df.columns)
commmon_features = [col for col in baseline_train_df.columns if col in test_columns]

commmon_features

In [89]:
# Preview the first five rows of the train baseline frame.
baseline_train_df.head(n=5)

In [90]:
# Preview the first five rows of the test baseline frame.
baseline_test_df.head(n=5)

In [91]:
# Restrict every baseline frame to the shared feature columns.
# The explicit .copy() matters: later cells assign encoded columns to these
# frames and drop columns in place, and doing that on the result of a
# column-subset selection can raise SettingWithCopyWarning in pandas.
baseline_train_df = baseline_train_df[commmon_features].copy()
baseline_test_df = baseline_test_df[commmon_features].copy()
baseline_df = baseline_df[commmon_features].copy()

In [92]:
# Names of the object-dtype columns in the combined baseline frame.
common_categorical = baseline_df.select_dtypes(include='object').columns

In [93]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column on the combined frame, then apply
# the same fitted mapping to the train and test frames.
for col in common_categorical:
    le = LabelEncoder().fit(baseline_df[col])
    baseline_train_df[col], baseline_test_df[col] = (
        le.transform(baseline_train_df[col]),
        le.transform(baseline_test_df[col]),
    )

In [94]:
# Object-dtype columns still present in the train baseline frame.
baseline_train_df.select_dtypes(include=['object']).columns

In [95]:
# Object-dtype columns still present in the test baseline frame.
baseline_test_df.select_dtypes(include=['object']).columns

In [96]:
# Columns excluded from the model inputs.
features_to_ignore = ['Id']

# Remove them in place from both baseline frames.
baseline_train_df.drop(columns=features_to_ignore, inplace=True)
baseline_test_df.drop(columns=features_to_ignore, inplace=True)

In [97]:
# (rows, columns) of the train baseline frame after dropping ignored features.
baseline_train_df.shape

In [98]:
# (rows, columns) of the test baseline frame after dropping ignored features.
baseline_test_df.shape

In [99]:
# Add the target column from train_df to the train baseline frame
# (pandas aligns the assigned Series on the row index).
baseline_train_df[target_feature] = train_df[target_feature]

In [100]:
from sklearn.model_selection import train_test_split

# Separate inputs from the target, then hold out 10% of the rows with a
# fixed seed so the split is repeatable.
baseline_features = baseline_train_df.drop(columns=target_feature)
baseline_target = baseline_train_df[target_feature]
X_train, X_test, y_train, y_test = train_test_split(
    baseline_features,
    baseline_target,
    test_size=0.1,
    random_state=42,
)

In [101]:
# Shape of the training inputs.
X_train.shape

In [102]:
# Shape of the training target.
y_train.shape

In [103]:
# Shape of the held-out inputs.
X_test.shape

In [104]:
# Shape of the held-out target.
y_test.shape

## Baseline Model

In [105]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training part of the split.
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)

## Eval on Validation Set 

In [106]:
from sklearn.metrics import mean_squared_error

preds = baseline_model.predict(X_test)

# Score with RMSE between log(actual) and log(predicted) prices, which is the
# metric this Kaggle competition is scored on (confirm against the
# competition's evaluation page). The previous log(MSE of raw prices) is not
# comparable to the leaderboard. Converting back to the price scale first
# keeps the score correct even if target_boxcox_val is changed away from 0.
actual_prices = inv_boxcox(y_test, target_boxcox_val)
predicted_prices = inv_boxcox(preds, target_boxcox_val)
np.sqrt(mean_squared_error(np.log(actual_prices), np.log(predicted_prices)))

## Predict on the Test Set (Kaggle submission)

In [107]:
# Columns of the train baseline frame (includes the target at this point).
baseline_train_df.columns

In [108]:
# Columns of the test baseline frame that will be fed to the model.
baseline_test_df.columns

In [109]:
# Columns the train baseline frame has that the test baseline frame lacks.
set(baseline_train_df.columns).difference(baseline_test_df.columns)

In [110]:
# Predict on the test features, undo the target transform and pair each
# prediction with its Id.
baseline_test_submission_df = pd.DataFrame(
    {
        'Id': test_df['Id'],
        'SalePrice': inv_boxcox(
            baseline_model.predict(baseline_test_df), target_boxcox_val
        ),
    }
)

# Write the submission file without the index column.
baseline_test_submission_df.to_csv('final_submission.csv', index=False)

baseline_test_submission_df.head(n=5)