In [1]:
# improves quality of graphic outputs if you're on a Macbook with Retina display
%config InlineBackend.figure_format = 'retina'

# can display multiple things without call to print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# interactive plots
# %matplotlib notebook
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import utils

  from pandas.core import datetools


In [14]:
train = pd.read_csv('data/train.csv', index_col='ID')
constcols_train = list(train.loc[:,train.apply(lambda i: len(i.unique()) == 1)].columns)
train = train[list(set(train.columns) - set(constcols_train))]
test = pd.read_csv('data/test.csv', index_col='ID')
test = test[list(set(test.columns) - set(constcols_train))]
tot = pd.concat([train, test]).sort_index()
enumcols = list(train.select_dtypes(include=['object']).columns)
bincols = list(train.select_dtypes(include=['int64']).columns)

In [27]:
from statsmodels.tsa.stattools import adfuller
tmp = train.loc[train['y']<train['y'].max(),'y']
print('WITH OUTLIER \n ADF Statistic: {r[0]}, p-val: {r[1]}'.format(r=adfuller(tmp)))
tmp.agg(['mean','std','skew','kurt'])
sm.OLS(tmp, sm.add_constant(tmp.index)).fit().summary()

tmp2 = tmp.pct_change().dropna()
print('WITHOUT OUTLIER \n ADF Statistic: {r[0]}, p-val: {r[1]}'.format(r=adfuller(tmp2)))
tmp2.agg(['mean','std','skew','kurt'])
sm.OLS(tmp2, sm.add_constant(tmp2.index)).fit().summary()

del tmp, temp2

WITH OUTLIER 
 ADF Statistic: -44.8751711040339, p-val: 0.0


mean    100.630190
std      12.424146
skew      0.738671
kurt      1.505205
Name: y, dtype: float64

0,1,2,3
Dep. Variable:,y,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,11.89
Date:,"Wed, 31 May 2017",Prob (F-statistic):,0.000568
Time:,22:50:40,Log-Likelihood:,-16567.0
No. Observations:,4208,AIC:,33140.0
Df Residuals:,4206,BIC:,33150.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,101.7688,0.382,266.733,0.000,101.021,102.517
x1,-0.0003,7.85e-05,-3.449,0.001,-0.000,-0.000

0,1,2,3
Omnibus:,446.373,Durbin-Watson:,2.194
Prob(Omnibus):,0.0,Jarque-Bera (JB):,765.206
Skew:,0.734,Prob(JB):,6.88e-167
Kurtosis:,4.486,Cond. No.,9700.0


WITHOUT OUTLIER 
 ADF Statistic: -19.452668872736226, p-val: 0.0


mean    0.016098
std     0.183344
skew    0.559753
kurt    0.792236
Name: y, dtype: float64

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.1012
Date:,"Wed, 31 May 2017",Prob (F-statistic):,0.75
Time:,22:50:40,Log-Likelihood:,1167.8
No. Observations:,4207,AIC:,-2332.0
Df Residuals:,4205,BIC:,-2319.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0177,0.006,3.129,0.002,0.007,0.029
x1,-3.69e-07,1.16e-06,-0.318,0.750,-2.64e-06,1.91e-06

0,1,2,3
Omnibus:,251.061,Durbin-Watson:,3.12
Prob(Omnibus):,0.0,Jarque-Bera (JB):,327.691
Skew:,0.559,Prob(JB):,6.96e-72
Kurtosis:,3.788,Cond. No.,9700.0


NameError: name 'temp2' is not defined

In [29]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

RS = 42

x1, x2, y1, y2 = train_test_split(pd.concat([pd.get_dummies(train[enumcols]), train[bincols]], axis=1, verify_integrity=True), train['y'], test_size=0.2, random_state=RS)

clfs = {
    'RFR': [RandomForestRegressor(random_state=RS), {
        'n_estimators': [10, 100, 1000],
        'min_samples_split': [2, 20, 200],
    }],
    'ABR': [AdaBoostRegressor(random_state=RS), {
        'n_estimators': [5, 50, 500],
        'learning_rate': [0.1, 0.5, 1.],
    }],
    'GBR': [GradientBoostingRegressor(random_state=RS), {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'n_estimators': [10, 100, 1000],
        'learning_rate': [0.1, 0.5, 1.],
    }],
    'MLP': [MLPRegressor(random_state=RS), {
        'hidden_layer_sizes': [(5,5,2), (350, 250, 100)]
    }]
}

In [None]:
results = {name: utils.classify(params,x1=x1,y1=y1,x2=x2,y2=y2) for (name, params) in clfs.items()}

In [30]:
from sklearn.svm import SVR
svr = SVR(kernel='linear').fit(x1, y1)

In [31]:
pred = svr.predict(x2)

In [32]:
from sklearn.metrics import r2_score
r2_score(y2, pred)

0.55676837895382736