# Import Libraries

In [6]:
#import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import IPython
from IPython import display
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

#import machine learning model


#import stats
from scipy import stats

pd.set_option('display.max_columns', 500)

# Read Files

In [7]:
# Load the train and test CSV files, using the first column of each as the index.
train_df = pd.read_csv('./data/train.csv', index_col = 0)
test_df = pd.read_csv('data/test.csv', index_col = 0)

# Keep both frames in one list and stack them row-wise, so the cleaning and
# encoding steps below are applied to train and test together.
full_data = [train_df, test_df]

combine = pd.concat(full_data)
# Work on a copy; `combine` itself stays exactly as loaded.
combine_c = combine.copy()

# Data Cleaning

In [8]:
# Columns handled as numeric variables in this notebook.
cont_col = ['LotFrontage', 'LotArea',  'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
          'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
          'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
           'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
           'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
           'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
           'MoSold', 'GarageYrBlt', 'YrSold', 'YearBuilt','YearRemodAdd', 'OverallQual',
           'OverallCond']

# Numeric columns that contain at least one missing value.
# DataFrame.sum() is used instead of np.sum(DataFrame): the NumPy call forwards
# axis=None, whose per-column meaning is deprecated in recent pandas.
cont_na_counts = combine[cont_col].isnull().sum()
check_cont_missing = cont_na_counts[cont_na_counts > 0]

# GarageYrBlt and LotFrontage get their own rules in impute_missing; every
# other numeric column with gaps is filled with 0.
cont_impute_0 = [col for col in check_cont_missing.index
                 if col not in ('GarageYrBlt', 'LotFrontage')]

# Everything that is neither numeric nor the target is treated as categorical.
# The list is built in DataFrame column order rather than via set difference,
# so the order (and the dummy-column order derived from it) is reproducible.
numeric_or_target = set(cont_col) | {'SalePrice'}
cat_col = [col for col in combine.columns if col not in numeric_or_target]

# Categorical columns that contain at least one missing value.
cat_na_counts = combine[cat_col].isnull().sum()
cat_missing = list(cat_na_counts[cat_na_counts > 0].index)

# define function to impute missing value
def impute_missing(dfName, zero_cols=None, none_cols=None):
    """Fill missing values of ``dfName`` in place.

    Rules applied:
      * ``GarageYrBlt``  -> smallest observed value of the column.
      * ``LotFrontage``  -> median of the row's ``Neighborhood``; falls back to
        the overall median when a neighborhood has no observed value.
      * ``zero_cols``    -> 0.
      * ``Electrical``   -> most frequent value of the column.
      * ``none_cols``    -> the string ``'None'`` (``Electrical`` is skipped).

    Parameters
    ----------
    dfName : pandas.DataFrame
        Frame to impute; its columns are reassigned, so the caller's object
        is modified.
    zero_cols : iterable of str, optional
        Numeric columns to fill with 0. Defaults to the notebook-level
        ``cont_impute_0`` list.
    none_cols : iterable of str, optional
        Categorical columns to fill with ``'None'``. Defaults to the
        notebook-level ``cat_missing`` list. The list is never modified, so
        the function can be called more than once.

    Returns
    -------
    None
    """
    if zero_cols is None:
        zero_cols = cont_impute_0
    if none_cols is None:
        none_cols = cat_missing

    # Series.min() skips NaN. The builtin min() returns NaN whenever the first
    # element is NaN, which would turn this fill into a no-op.
    dfName['GarageYrBlt'] = dfName['GarageYrBlt'].fillna(dfName['GarageYrBlt'].min())

    # Per-neighborhood median, aligned to the original rows.
    neighborhood_median = dfName.groupby("Neighborhood")["LotFrontage"].transform("median")
    lot_frontage = dfName['LotFrontage'].fillna(neighborhood_median)
    # Neighborhoods without any observed value fall back to the overall median.
    dfName['LotFrontage'] = lot_frontage.fillna(dfName['LotFrontage'].median())

    # Assign the filled column back instead of a chained inplace fillna, which
    # may operate on a temporary and leave the frame untouched.
    for col in zero_cols:
        dfName[col] = dfName[col].fillna(0)

    # Electrical is filled with its most frequent value rather than 'None'.
    electrical_mode = dfName['Electrical'].mode()
    if not electrical_mode.empty:
        dfName['Electrical'] = dfName['Electrical'].fillna(electrical_mode[0])

    for col in none_cols:
        if col == 'Electrical':
            continue
        dfName[col] = dfName[col].fillna('None')

In [9]:
# Apply the imputation rules to the working copy; impute_missing reassigns
# columns on the frame it receives, so combine_c itself is updated.
impute_missing(combine_c)

In [10]:
# Re-check missing values after imputation. DataFrame.sum() is used instead of
# np.sum(DataFrame), whose axis=None per-column behaviour is deprecated in
# recent pandas. Per the recorded output, only SalePrice still has gaps.
remaining_na = combine_c.isnull().sum()
remaining_na[remaining_na > 0]

SalePrice    1459
dtype: int64

# Outliers

In [11]:
#define function to detect outliers
def detect_outliers(df,n,cols):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    A value is an outlier for a column when it lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR`` of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    n : int
        A row is reported only when it is an outlier in strictly more than
        ``n`` of the columns.
    cols : iterable of str
        Numeric columns to check.

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in cols:
        # nanpercentile ignores missing values. np.percentile returns NaN as
        # soon as one value is missing, which makes both comparisons below
        # False and silently disables detection for that column.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        bound = 1.5 * (q3 - q1)
        is_outlier = (df[col] < q1 - bound) | (df[col] > q3 + bound)
        # Index the labels directly instead of copying the filtered frame.
        outlier_counts.update(df.index[is_outlier.to_numpy()])
    return [label for label, hits in outlier_counts.items() if hits > n]

In [12]:
# Scan only the rows that have a SalePrice and collect those that are IQR
# outliers in more than 7 of the numeric columns.
outliers_index = detect_outliers(combine_c[combine_c.SalePrice.notnull()], 7, cont_col)
# The flagged rows are removed from combine_c in place, so re-running this
# cell operates on the already-filtered frame.
combine_c.drop(outliers_index, inplace=True)

# Dummify

In [13]:
#define get dummy function for all dummy variables
def get_dummy(df, cols):
    """Return a copy of ``df`` with each column in ``cols`` one-hot encoded.

    The encoded columns are removed and their indicator columns, prefixed
    with the source column name, are appended after the remaining columns in
    the order given by ``cols``. Missing values get no indicator column.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    cols : iterable of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
    """
    cols = list(cols)
    encoded = [pd.get_dummies(df[col], prefix=col, dummy_na=False) for col in cols]
    # `columns=` replaces the positional axis argument of drop(), which
    # pandas 2.x no longer accepts. A single concat avoids re-copying the
    # growing frame once per encoded column.
    return pd.concat([df.drop(columns=cols)] + encoded, axis=1)

In [14]:
# One-hot encode every categorical column of the cleaned frame.
combine_d = get_dummy(combine_c, cat_col)

In [15]:
print(combine_d.shape)

(2916, 324)


In [16]:
# Rows with a SalePrice form the training set; rows without one form the test set.
dtrain = combine_d[combine_d.SalePrice.notnull()]
# SalePrice is entirely missing for these rows, so it is dropped from the test features.
dtest = combine_d[combine_d.SalePrice.isnull()].drop(columns = 'SalePrice')

In [17]:
print(dtrain.shape)
print(dtest.shape)

(1457, 324)
(1459, 323)


# Model

In [18]:
import sklearn.model_selection as ms
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from math import sqrt
import sklearn.metrics

In [19]:
# Linear models created without arguments, i.e. with the library's default settings.
lasso = Lasso()
ridge = Ridge()
net = ElasticNet()

In [20]:
# Separate the predictor columns from the target for model fitting.
features = dtrain.drop('SalePrice', axis = 1)
price = dtrain.SalePrice

In [21]:
# Switch on the estimator's built-in normalization of the regressors.
# NOTE(review): the `normalize` option was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call fails on newer versions — confirm the target
# version before re-running.
lasso.set_params(normalize = True)
# Fit on all training rows; the returned estimator is the cell's displayed output.
lasso.fit(features, price)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

# Write Data

In [240]:
# Predict a sale price for every test row and key the predictions by the
# test frame's index, then write the two-column table to disk.
x = pd.DataFrame({'SalePrice': lasso.predict(dtest)}, index=dtest.index).rename_axis('Id')
x.to_csv('second_try.csv')

In [33]:
dtrain.shape

(1457, 324)

In [34]:
# Imported in this cell because the notebook's own import of train_test_split
# sits in a later cell; without it this line raises NameError when the
# notebook is run top to bottom on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out 33% of the training rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features,price, test_size=0.33, random_state=42)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

from sklearn.preprocessing import Imputer

In [26]:
from sklearn.model_selection import cross_val_score,GridSearchCV, RandomizedSearchCV

In [48]:
ridge = Ridge()

In [55]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [60]:
# Fresh default estimators. These rebind `ridge` and `lasso`, so the Lasso
# fitted earlier in the notebook is no longer reachable under that name.
ridge = Ridge()
lasso = Lasso()

# Both estimators collected in a list (not referenced by the visible cells).
lis_ = [ridge, lasso]

In [61]:
# This call fails with the TypeError recorded below: every pipeline step except
# the last must be a transformer (implement fit and transform), and Ridge,
# placed first here, does not.
pipe = make_pipeline(ridge, lasso)

TypeError: All intermediate steps should be transformers and implement fit and transform. 'Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)' (type <class 'sklearn.linear_model.ridge.Ridge'>) doesn't

In [39]:
# `my_pipe` was not defined anywhere in the notebook, so this cell only worked
# with leftover kernel state. The definition below is reconstructed from the
# recorded output: a default Imputer (mean strategy) followed by a default
# RandomForestRegressor. The forest is unseeded, so predictions vary between runs.
my_pipe = make_pipeline(Imputer(), RandomForestRegressor())
my_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_s...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [40]:
my_pipe.predict(X_test)

array([129100. , 184615.5, 235191. ,  91780. , 316458.5, 146475.2,
       176354. , 142075. , 138265. , 129575. , 136000. , 122687.5,
       247695. , 281756.1,  98520. ,  89300. , 201350. , 171573.5,
       184813.2, 209020. ,  78200. , 174200. , 175853.2, 138560. ,
       191150. , 164710. , 220750. , 384776. , 286528. , 144650. ,
       123286.4, 378358.7, 118560. , 111210. , 138560. , 142395. ,
       208480. , 146090. , 167006.5, 188640. , 130900. , 218388. ,
       101467.6, 208471. , 140930. , 117535. , 133050. , 125990. ,
       157545. , 132900. , 138190. , 238150. , 214000. , 182545. ,
       171052.1, 233808. , 199940. , 151690. , 147220. , 216057. ,
        79150. , 356225.6, 311621.9, 309478.9, 265935.4, 220800. ,
       251480. ,  80855. , 152190. , 174339. , 132440. , 141000. ,
       188190. , 143390. , 145150. , 172027.5, 248557.2, 170252.2,
       141240. , 121945. , 181790. , 127340. , 233090. , 209104. ,
       293150. , 393125.1, 186669.5, 181163.4, 128297.5, 14294