In [42]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import cross_val_score,  train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

In [26]:
def prepare_data():
    train = (pd.read_csv('../data/train.csv', na_values=-1)
           .fillna(value=999))
            
    test  = (pd.read_csv('../data/test.csv', na_values=-1)
               .fillna(value=999)) 
    test_id = test.id
    y = train.target.values
    train = train.drop(['id','target'], axis=1)
    
    
    features = train.columns
    features_bin = [i for i in features if set(i)&set('bin')==set('bin')]
    features_cat = [i for i in features if set(i)&set('cat')==set('cat')]
    features_con = list(set(features) - (set(features_bin)|set(features_cat)))
    
    
    train_cat = np.array(train[features_cat])
    train_cat[:,:] = np.add(train_cat[:,:],np.ones((train_cat.shape[0],train_cat.shape[1])))
    test_cat  = np.array(test[features_cat])
    test_cat[:,:] =  np.add(test_cat[:,:],np.ones((test_cat.shape[0],train_cat.shape[1])))

    OH = OneHotEncoder()
    OH.fit(np.array(list(train_cat) + list(test_cat)))
    train_cat = OH.transform(train_cat).toarray()
    test_cat = OH.transform(test_cat).toarray()

    train_con = np.array(train[features_con])
    test_con  = np.array(test[features_con])
    RS = StandardScaler()
    RS.fit(list(train_con) + list(test_con))
    train_con = RS.transform(train_con)
    train_con = RS.transform(train_con)

    train_bin = np.array(train[features_bin])
    test_bin = np.array(test[features_bin])


    X = np.hstack((train_cat,train_con,train_bin))

    X_test = np.hstack((test_cat,test_con,test_bin))
    
    return X, y, X_test, test_id

In [27]:
X, y, _, _ = prepare_data()

In [28]:
lr = LinearRegression()

In [29]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
scores = cross_val_score(estimator=lr, X=X, y=y, cv=5, n_jobs=-2, scoring='neg_mean_squared_error')
print('CV accuracy scores: %s' % scores); print()
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [-0.03512252 -0.03444842 -0.04563677 -0.03464486 -0.03491672]

CV accuracy: -0.037 +/- 0.004


In [33]:
lasso = Lasso(alpha=.5)
lasso.fit(X, y)

Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [34]:
scores = cross_val_score(estimator=lasso, X=X, y=y, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
print('CV accuracy scores: %s' % scores); print()
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [-0.03543986 -0.03474675 -0.03521423 -0.03496502 -0.03522981]

CV accuracy: -0.035 +/- 0.000


In [37]:
ridge = Ridge(alpha=10)
scores = cross_val_score(estimator=lasso, X=X, y=y, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')


In [38]:
print('CV accuracy scores: %s' % scores); print()
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [-0.03543986 -0.03474675 -0.03521423 -0.03496502 -0.03522981]

CV accuracy: -0.035 +/- 0.000


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)