# 4. Modeling
## 4.1 Select Modeling Techniques
### Outputs:
- Modeling Technique
- Modeling Assumptions



In [320]:
from sklearn.linear_model import LogisticRegression
import nltk
import pandas as pd
import math
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import gridspec

from sklearn import datasets, linear_model
import numpy as np
from numbers import Number

from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.metrics as metrics

from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor, VotingClassifier

import warnings
warnings.filterwarnings('ignore')


In [321]:
train_dataframe= pd.read_csv("../data/train_dummied.csv")
train_dataframe=train_dataframe.set_index('Id')
y=np.array(train_dataframe.pop('SalePrice'))
X=np.array(train_dataframe)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Logistic Regression

In [322]:
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train, y_train)
y_pred_lr=lr.predict(X_test)

In [323]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_lr,y_test)))
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 43524.2943244
Accuracy: 0.70349 (+/- 0.19)


###  Decision Tree Regression 

In [324]:
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, y_train)
y_pred_clf=clf.predict(X_test)

In [325]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_clf,y_test)))
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 40798.2651954
Accuracy: 0.72007 (+/- 0.16)


## Ridge Linear Regression

In [326]:
ridge = linear_model.Ridge (alpha = .5)
ridge.fit (X_train,y_train)
y_pred_ridge = ridge.predict(X_test)

In [327]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_ridge,y_test)))
scores = cross_val_score(ridge, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 23140.9730422
Accuracy: 0.89108 (+/- 0.03)


## Lasso Linear Regression

In [328]:
lasso = linear_model.Lasso(alpha = 1.5)
lasso = lasso.fit(X_train,y_train)
y_pred_lasso = lasso.predict(X_test)

In [329]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_lasso,y_test)))
scores = cross_val_score(lasso, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 23863.1624051
Accuracy: 0.87599 (+/- 0.03)


## Elastic Net

In [330]:
ENST = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], 
                                    l1_ratio=[.01, .1, .5, .9, .99], 
                                    max_iter=5000)
ENST = ENST.fit(X_train, y_train)
y_pred_enst = ENST.predict(X_test)

In [331]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_enst,y_test)))
scores = cross_val_score(ENST, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 21897.8597481
Accuracy: 0.90089 (+/- 0.04)


## Gradient Tree Boosting

In [332]:
gtb = GradientBoostingRegressor(n_estimators=100, 
                                learning_rate=0.1,
                                max_depth=1,
                                random_state=0, 
                                loss='ls')
gtb = gtb.fit(X_train,y_train)
y_pred_gtb = gtb.predict(X_test)

In [333]:
print ('Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_pred_gtb,y_test)))
scores = cross_val_score(gtb, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean Squared Error: 27132.0037243
Accuracy: 0.86205 (+/- 0.06)


## Voting Classifier

In [334]:
results=pd.DataFrame({'Ridge':y_pred_ridge,
                     'Lasso':y_pred_lasso,
                     'ENST':y_pred_enst,
                     'test':y_test})

In [335]:
stacker= linear_model.LinearRegression()
stacker.fit(results[['Ridge', 'Lasso', 'ENST']], results['test'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [336]:
scores = cross_val_score(stacker, results[['Ridge', 'Lasso', 'ENST']], results['test'], cv=5)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.90575 (+/- 0.13)


In [337]:
results['stacked']=stacker.predict(results[['Ridge', 'Lasso', 'ENST']])
results

Unnamed: 0,ENST,Lasso,Ridge,test,stacked
0,100824.810580,111435.968467,113765.913283,123000.0,107937.551568
1,265448.437574,270733.034297,271701.759842,275500.0,258636.086312
2,199444.072393,196495.658876,197255.883912,197900.0,194558.368837
3,152775.537635,156695.161925,155817.650843,145250.0,153011.726246
4,293375.581376,303238.683082,304919.050384,290000.0,286295.345893
5,153472.475914,172514.297367,171614.232599,155000.0,158659.036858
6,319958.119971,329360.604511,331192.575206,274000.0,310871.664607
7,234820.871338,223836.280750,214424.694296,235128.0,221733.381963
8,126377.899904,144728.336475,145065.800916,166000.0,133637.489339
9,95485.964372,102413.321145,102349.688877,86000.0,101051.041003


In [369]:
sub_dataframe= pd.read_csv("../data/test_dummied.csv")
sub_dataframe=sub_dataframe.set_index('Id')
del sub_dataframe['SalePrice']

X_sub=np.array(sub_dataframe)
ridge.predict(X_sub)

submission_dataset=pd.DataFrame({'lasso':lasso.predict(X_sub),
                                'ridge':ridge.predict(X_sub),
                                'ENST':ENST.predict(X_sub)})

predictions=stacker.predict(submission_dataset)


In [374]:
pred_df=pd.DataFrame({'SalePrice':predictions,
                      'Id':sub_dataframe.index})
pred_df=pred_df.set_index('Id')
pred_df.to_csv('../data/submission.csv')

In [376]:
sub_dataframe

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,-0.657390,-0.786163,-0.099206,-1.040209,0.108207,0.634970,-0.795568,-0.243216,-0.691967,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1462,0.483570,-0.786163,-0.099206,0.190439,1.136688,-0.290049,-0.795568,-0.243216,-0.385662,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1463,-0.573069,0.892848,-0.099206,0.190439,0.838316,-0.290049,-0.795568,-0.243216,-0.991516,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1464,-0.578339,0.837759,-0.099206,0.190439,0.411100,-0.290049,-0.795568,-0.243216,-0.570346,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1465,0.354455,-0.786163,-0.099206,-1.040209,-0.355175,-0.290049,-0.795568,-0.243216,0.990459,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1466,-1.007846,1.350325,-0.099206,0.190439,-0.949659,-0.290049,-0.795568,-0.243216,0.418389,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1468,-0.939336,0.832969,-0.099206,0.190439,-0.949659,-0.290049,-0.795568,-0.243216,0.476947,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1469,0.515191,-0.786163,-0.099206,-1.040209,0.490214,-0.290049,1.128931,-0.243216,0.193165,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1470,-0.694280,-0.786163,-0.099206,-1.040209,0.867701,0.211003,1.128931,-0.243216,-1.300073,-0.386120,...,0,0,0,0,0,0,0,1,0,1
1471,0.504650,-0.786163,-0.099206,-1.040209,1.426019,-0.290049,1.128931,-0.243216,-0.502779,-0.386120,...,0,0,0,0,0,0,0,1,0,1


## 4.2 Generate Test Design
### Outputs:
- Test Design

## 4.3 Build Model
### Outputs:
- Parameter Settings
- Models
- Model Descriptions

## 4.4 Assess Model
### Outputs:

- Model Assessment
- Revised Parameter
- Settings