# Import libraries

In [1]:
#import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import IPython
from IPython import display
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

#import machine learning model


#import stats
from scipy import stats

# Let wide frames show up to 500 columns before pandas truncates the display.
pd.options.display.max_columns = 500

# read dataset

In [2]:
# Load the raw train/test files; the first CSV column becomes the row index.
train_df = pd.read_csv('./data/train.csv', index_col=0)
test_df = pd.read_csv('data/test.csv', index_col=0)

# Work on copies so the frames as read from disk stay untouched.
ctrain, ctest = train_df.copy(), test_df.copy()

# Stack train on top of test so both go through the same preprocessing.
combine = pd.concat([ctrain, ctest])

In [3]:
# Partition column names by dtype: object columns count as categorical,
# everything else as numeric. (Recomputed further down after the dtype cast.)
cat_features = combine.dtypes.loc[lambda dtype: dtype == 'object'].index
num_features = combine.dtypes.loc[lambda dtype: dtype != 'object'].index

In [4]:
# Columns to handle as categories even though they are not stored as strings
# (presumably numeric codes/ratings — confirm against the data description).
cat_extra = ['MSSubClass', 'OverallCond', 'OverallQual']
combine = combine.astype(dict.fromkeys(cat_extra, 'object'))

In [5]:
# Refresh both column groups now that the `cat_extra` columns are object-typed.
cat_features = combine.dtypes.loc[lambda dtype: dtype == 'object'].index
num_features = combine.dtypes.loc[lambda dtype: dtype != 'object'].index

# Impute missing values

In [6]:
# define function to impute missing values
def impute_missing(dfName, num_cols=None, cat_cols=None):
    """Fill every missing value of ``dfName`` in place.

    Steps, applied in this order:

    1. ``GarageYrBlt`` gaps take the smallest observed value of that column.
    2. ``LotFrontage`` gaps take the median ``LotFrontage`` of the row's
       ``Neighborhood``.
    3. Any remaining gap in a numeric column becomes ``0``.
    4. ``Functional`` gaps become ``'Typ'``.
    5. ``Electrical``, ``MSZoning``, ``Exterior1st``, ``Exterior2nd``,
       ``KitchenQual`` and ``SaleType`` gaps take the column's mode.
    6. Any remaining gap in a categorical column becomes the string ``'None'``.

    Parameters
    ----------
    dfName : pandas.DataFrame
        Frame to clean. It is modified in place and must contain the columns
        named above.
    num_cols : iterable of str, optional
        Columns treated as numeric in step 3. Defaults to the columns of
        ``dfName`` that have a numeric dtype.
    cat_cols : iterable of str, optional
        Columns treated as categorical in step 6. Defaults to the columns of
        ``dfName`` that do not have a numeric dtype.

    Returns
    -------
    None
    """
    # Derive the column groups from the frame itself rather than from
    # notebook-level variables, so the function works on a fresh kernel.
    if num_cols is None:
        num_cols = dfName.select_dtypes(include='number').columns
    if cat_cols is None:
        cat_cols = dfName.select_dtypes(exclude='number').columns

    # impute continuous numeric variables
    # Series.min() skips NaN; the builtin min() returns NaN when the first
    # element is NaN, which would leave the column unfilled.
    dfName['GarageYrBlt'] = dfName['GarageYrBlt'].fillna(dfName['GarageYrBlt'].min())
    dfName['LotFrontage'] = dfName.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )

    # impute 0
    # Assign the result back instead of `dfName[col].fillna(..., inplace=True)`:
    # the chained in-place form does not update the frame under Copy-on-Write.
    # NOTE: this also zero-fills a numeric target column; the notebook later
    # separates train from test rows with `SalePrice == 0`.
    for col in num_cols:
        dfName[col] = dfName[col].fillna(0)

    dfName['Functional'] = dfName['Functional'].fillna('Typ')

    # impute categorical variables whose gaps are treated as ordinary missing
    # data with the most frequent value of the column
    for col in ('Electrical', 'MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType'):
        dfName[col] = dfName[col].fillna(dfName[col].mode()[0])

    # impute None
    for col in cat_cols:
        dfName[col] = dfName[col].fillna('None')

In [7]:
# Fill the missing values of the combined train+test frame.
# impute_missing returns nothing; it works on `combine` directly.
impute_missing(combine)

In [8]:
# Per-column count of remaining missing values, showing only columns that
# still have any. DataFrame.sum() sums per column explicitly; np.sum(frame)
# forwards axis=None, which newer pandas deprecates in favour of a scalar.
combine.isnull().sum().loc[lambda counts: counts > 0]

Series([], dtype: int64)

# Create dummy variables

In [9]:
# One-hot encode the categorical columns of the combined frame.
# drop_first=True leaves out the first level of each encoded column.
combine = pd.get_dummies(combine, drop_first=True)

# split dataframe

In [129]:
# Test rows have no SalePrice in the raw data; impute_missing zero-filled the
# gap, so SalePrice == 0 identifies them (assumes no real sale price is 0).
# .copy() makes `train` an independent frame that later cells can modify.
train = combine[combine.SalePrice != 0].copy()
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
test = combine[combine.SalePrice == 0].drop(columns='SalePrice')

# outliers

In [11]:
# helper that flags rows which are outliers in several columns at once
def detect_outliers(df, n, cols):
    """Return the index labels of rows that are IQR outliers in more than ``n`` columns.

    For each column in ``cols`` a value is an outlier when it lies strictly
    below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1/Q3
    are the 25th/75th percentiles of that column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    n : int
        A row is reported only if it is an outlier in more than ``n`` columns.
    cols : iterable of str
        Column names to check.

    Returns
    -------
    list
        Index labels, ordered by first appearance while scanning ``cols``.
    """
    hits_per_row = Counter()
    for name in cols:
        values = df[name]
        q1, q3 = (np.percentile(values, q) for q in (25, 75))
        # Tukey fence: 1.5 interquartile ranges beyond each quartile.
        margin = 1.5 * (q3 - q1)
        too_low = values < q1 - margin
        too_high = values > q3 + margin
        # One hit per row label for every column in which the row is an outlier.
        hits_per_row.update(df[too_low | too_high].index)
    return [label for label, hits in hits_per_row.items() if hits > n]

In [12]:
# Collect the labels of training rows that are IQR outliers in MORE than 8 of
# the numeric columns (detect_outliers keeps rows whose hit count is > n).
outlier_index = detect_outliers(train, 8, num_features)

In [13]:
# drop outliers
# Rebind instead of dropping in place, and ignore labels that are already
# gone, so re-running this cell is harmless instead of raising KeyError.
train = train.drop(index=outlier_index, errors='ignore')

In [14]:
# Display the row labels that were flagged as outliers.
outlier_index

[524, 1299]

In [15]:
# Sanity check: (rows, columns) of the train and test frames, one per line.
print(train.shape, test.shape, sep='\n')

(1458, 261)
(1459, 261)


# Feature engineering

# Machine Learning

In [49]:


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline, Pipeline

In [17]:
# Default-configured linear models; hyper-parameters are set later.
lasso, ridge, net = Lasso(), Ridge(), ElasticNet()

In [26]:
# split train data into the feature matrix and the log-transformed target
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = train.drop(columns='SalePrice')
y = np.log(train.SalePrice)

In [124]:
# grid search over the regularisation strength
# Ten evenly spaced alpha candidates from 1e-10 to 100.
param_grid = [{'alpha': list(np.linspace(1e-10, 100, 10))}]

# NOTE: despite the `cv_lasso` name, the estimator being tuned is `ridge`.
cv_lasso = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=10, n_jobs=-1)
cv_lasso.fit(X, y)

# Best alpha found, then its cross-validated score.
print(cv_lasso.best_params_, cv_lasso.best_score_, sep='\n')

{'alpha': 11.1111111112}
0.9174958867387569


In [127]:
# Take alpha straight from the grid search rather than from a hand-copied,
# truncated constant, so the model follows the search if the grid or data change.
ridge.set_params(**cv_lasso.best_params_)
ridge.fit(X, y)

Ridge(alpha=11.11111111, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

# Output

In [132]:
# The model was fitted on log(SalePrice), so np.exp maps predictions back to prices.
# The result is a one-column frame keyed by the test frame's own index
# (that index must be named 'Id' — the same requirement the original form had).
x = pd.Series(np.exp(ridge.predict(test)), index=test.index, name='SalePrice').to_frame()
x.to_csv('third_try.csv')