In [32]:
#read the data
import pandas as ps
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import math
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import wordpunct_tokenize
import re
import numpy as np

# nltk.download()


In [33]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'

train = ps.read_csv(train_path)
test = ps.read_csv(test_path)

test.head()

Unnamed: 0,id,tweet,state,location
0,4,Edinburgh peeps is it sunny?? #weather,,birmingham
1,5,"SEEVERE T’STORM WARNING FOR TROUSDALE, NORTHW...",,Nashville
2,7,@Agilis1 sport or traditional climbing? Thats ...,,Midwest
3,8,#WEATHER: 10:53 am : 63.0F. Feels 61F. 30.07% ...,tennessee,"Nashville, TN, USA"
4,12,We used to use umbrellas to face the bad weath...,,Houston


In [34]:
### use twokenized data
twokenized_train = ps.read_fwf('train_tokenized.txt',header=None)
train['tweet'] = twokenized_train[0]
twokenized_test = []
twokenized_test = ps.read_fwf('ark-tweet-nlp-0.3.2/test_tokenized.txt',header=None)
test['tweet'] = twokenized_test[0]

#initialize stemmer
stemmer = SnowballStemmer('english')
### remove stopwords
### don't know if this is useful when using ngrams
ps.options.mode.chained_assignment = None
stop_words = set(stopwords.words('english'))
stop_words.add("rt")
stop_words.add("{")
stop_words.add("}")
stop_words.add("link")
stop_words.add("google")
stop_words.add("facebook")
stop_words.add("twitter")
for i in range(len(train['tweet'])):
    train['tweet'][i] = [x.lower() for x in train['tweet'][i].split() if not x in stop_words] # to lower case
    train['tweet'][i] = " ".join(train['tweet'][i])
    train['tweet'][i] = re.sub(r'(\w)\1{2,}', r'\1\1',train['tweet'][i])
    train['tweet'][i] = stemmer.stem(train['tweet'][i])

In [40]:
print(train['tweet'][100:110])
print(stemmer.stem(train['tweet'][99]))

100              freakin killin audio ustream ! :/ freez
101    it actually feel like memphis weather may ... ...
102    and storms adore spring . need storm chasing @...
103                          currently 73 degrees 7:45 .
104    @mention good morning hope ur great day beauti...
105    houston weather retire to texas one of the che...
106    lool r u serious ?? confused chick rt @mention...
107    ... cooler temperatures and unsettled weather ...
108    i've checked weather forecast & still says bla...
109    #weather : 4:50 pm : 55.0 f . feels 52f . 29.6...
Name: tweet, dtype: object
how hell weather change like ?


In [35]:
tfidf = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 7))
tfidf.fit(train['tweet'])

# tfidf.get_feature_names()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [36]:
X = tfidf.transform(train['tweet'])
y = np.array(train.iloc[:,4:])
X_test = tfidf.transform(test['tweet'].values.astype('U')) 

X_train, X_val, y_train, y_val = train_test_split(X, y)

print(y_train)

[[0.    0.    0.    ... 0.164 0.    0.   ]
 [0.195 0.    0.398 ... 0.401 0.    0.   ]
 [0.    0.805 0.195 ... 0.    0.    0.   ]
 ...
 [0.221 0.    0.    ... 0.    0.    0.   ]
 [0.    0.403 0.    ... 0.    0.    0.   ]
 [0.    1.    0.    ... 0.385 0.    0.   ]]


# Random Forest

In [47]:
clf = RandomForestRegressor()

param_grid = {'max_depth': [None], 'min_samples_split' :[3, 10], 'min_samples_leaf' : [3, 10],
              'criterion':['mse']}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=4)

In [48]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.510582705260425, total=  40.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.6s remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.514413202634155, total=  40.1s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5191648494986347, total=  35.7s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5204157201319333, total=  36.5s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.49900043369778674, total=  19.6s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.5025388783991691, total=  19.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5004914024755611, total=  19.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5029997648783718, total=  18.9s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None], 'min_samples_split': [3, 10], 'min_samples_leaf': [3, 10], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [49]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [50]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.16860184158151342

# XGBoost

In [None]:
from sklearn.multioutput import MultiOutputRegressor
!pip install xgboost
from xgboost import XGBRegressor

In [None]:
multioutputregressor = MultiOutputRegressor(XGBRegressor(objective='reg:linear')).fit(X_train, y_train)

In [None]:
mor_pred = multioutputregressor.predict(X_val)
# Truncate
mor_pred[mor_pred > 1] = 1
mor_pred[mor_pred < 0] = 0

In [None]:
rmse = math.sqrt(mean_squared_error(y_val, mor_pred))
rmse # 0.16388598799139095

# Ridge Regression

In [51]:
clf = Ridge()

param_grid = {'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=1)

In [52]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  8.4min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [53]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [54]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.15631172322618572

# Ensembling

In [41]:
clf_ridge = Ridge()
param_grid = {'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]}
clf_ridge = GridSearchCV(clf_ridge, param_grid=param_grid, cv=2, verbose=1)

clf_RF = RandomForestRegressor()
param_grid = {'max_depth': [None], 'min_samples_split' :[3, 10], 'min_samples_leaf' : [3, 10],
              'criterion':['mse']}
clf_RF = GridSearchCV(clf_RF, param_grid=param_grid, cv=2, verbose=4)


In [87]:
# Split dataset for S, W and K classifications.
y_train_S = y_train[:,0:5]
y_train_W = y_train[:, 5:9]
y_train_K = y_train[:, 9:23]
print(y_train[-1])

[0.    1.    0.    0.    0.    0.594 0.    0.406 0.    0.419 0.    0.
 0.    0.    0.    0.196 0.    0.    0.    0.    0.    0.385 0.    0.   ]


In [140]:
y_test_pred_ridge = np.zeros((19487,24))
y_test_pred_RF = np.zeros((19487,24))
print(y_test_pred_ridge[0,9:24])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [141]:
# Predict Sentiment
clf_ridge.fit(X_train, y_train_S)
print("ridge fitted")
clf_RF.fit(X_train, y_train_S)
print("RF fitted")
y_pred_S_ridge = clf_ridge.predict(X_val)
y_pred_S_RF = clf_RF.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_S_ridge[y_pred_S_ridge < 0] = 0
y_pred_S_ridge[y_pred_S_ridge > 1] = 1
y_pred_S_RF[y_pred_S_RF < 0] = 0
y_pred_S_RF[y_pred_S_RF > 1] = 1
print("done")
y_test_pred_S_ridge = clf_ridge.predict(X_test)
y_test_pred_S_RF = clf_RF.predict(X_test)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  2.0min finished


ridge fitted
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.4059736857908517, total=  40.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.5s remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.40371817204256816, total=  40.6s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.4131521410958855, total=  36.5s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.40752861539955976, total=  36.2s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.4041527695166736, total=  19.9s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.3983572113004251, total=  20.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.39770622017789775, total=  20.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.39855598214238186, total=  20.1s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min finished


RF fitted
done


In [142]:
# Predict When
clf_ridge.fit(X_train, y_train_W)
print("ridge fitted")
clf_RF.fit(X_train, y_train_W)
print("RF fitted")
y_pred_W_ridge = clf_ridge.predict(X_val)
y_pred_W_RF = clf_RF.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_W_ridge[y_pred_W_ridge < 0] = 0
y_pred_W_ridge[y_pred_W_ridge > 1] = 1
y_pred_W_RF[y_pred_W_RF < 0] = 0
y_pred_W_RF[y_pred_W_RF > 1] = 1
print("done")
y_test_pred_W_ridge = clf_ridge.predict(X_test)
y_test_pred_W_RF = clf_RF.predict(X_test)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  1.8min finished


ridge fitted
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.1843111848323681, total=  58.1s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   58.2s remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.18953630222562487, total=  56.8s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.19723814668748896, total=  53.1s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.8min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.1931316261868162, total=  52.3s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.2066584019314611, total=  22.6s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.20928723746345243, total=  20.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.2044910598429621, total=  20.8s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.20796112356711063, total=  20.1s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.1min finished


RF fitted
done


In [143]:
# Predict Kind
clf_ridge.fit(X_train, y_train_K)
print("ridge fitted")
clf_RF.fit(X_train, y_train_K)
print("RF fitted")
y_pred_K_ridge = clf_ridge.predict(X_val)
y_pred_K_RF = clf_RF.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_K_ridge[y_pred_K_ridge < 0] = 0
y_pred_K_ridge[y_pred_K_ridge > 1] = 1
y_pred_K_RF[y_pred_K_RF < 0] = 0
y_pred_K_RF[y_pred_K_RF > 1] = 1
print("done")
y_test_pred_K_ridge = clf_ridge.predict(X_test)
y_test_pred_K_RF = clf_RF.predict(X_test)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  5.9min finished


ridge fitted
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.7518378598511988, total= 1.1min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.7537868818743102, total= 1.2min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.4min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.7540695115553611, total= 1.1min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.4min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.7549869580651929, total= 1.1min
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.7343576615901192, total=  26.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.7321948030353026, total=  27.2s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.7343894128169036, total=  29.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.7320210712161411, total=  27.5s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.4min finished


RF fitted
done


In [147]:
# Ensemble
y_pred_ridge = np.zeros((19487,24))
y_pred_RF = np.zeros((19487,24))
y_test_pred_ridge = np.zeros((42157,24))
y_test_pred_RF = np.zeros((42157,24))

y_pred_ridge[:,0:5] = y_pred_S_ridge[:]
y_pred_ridge[:,5:9] = y_pred_W_ridge[:]
y_pred_ridge[:,9:23] = y_pred_K_ridge[:]

y_pred_RF[:,0:5] = y_pred_S_RF[:]
y_pred_RF[:,5:9] = y_pred_W_RF[:]
y_pred_RF[:,9:23] = y_pred_K_RF[:]

y_test_pred_ridge[:,0:5] = y_test_pred_S_ridge
y_test_pred_ridge[:,0:5] = y_test_pred_S_RF
y_test_pred_ridge[:,5:9] = y_test_pred_W_ridge
y_test_pred_RF[:,5:9] = y_test_pred_W_RF
y_test_pred_RF[:,9:23] =  y_test_pred_K_ridge
y_test_pred_RF[:,9:23] = y_test_pred_K_RF

In [138]:
y_pred_ridge[:,23:24]

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [148]:
rmse_ridge = math.sqrt(mean_squared_error(y_val, y_pred_ridge))
rmse_RF= math.sqrt(mean_squared_error(y_val, y_pred_RF))

rmse_ridge_S = math.sqrt(mean_squared_error(y_val[:,0:5], y_pred_S_ridge))
rmse_RF_S = math.sqrt(mean_squared_error(y_val[:,0:5], y_pred_S_RF))
rmse_ridge_W = math.sqrt(mean_squared_error(y_val[:,5:9], y_pred_W_ridge))
rmse_RF_W = math.sqrt(mean_squared_error(y_val[:,5:9], y_pred_W_RF))
rmse_ridge_K = math.sqrt(mean_squared_error(y_val[:,9:23], y_pred_K_ridge))
rmse_RF_K = math.sqrt(mean_squared_error(y_val[:,9:23], y_pred_K_RF))

print(rmse_ridge_S)
print(rmse_RF_S)
print(rmse_ridge_W)
print(rmse_RF_W)
print(rmse_ridge_K)
print(rmse_RF_K)
print(rmse_ridge)
print(rmse_RF)

0.20409017681571628
0.22804290305238648
0.21265023445598658
0.22818285555257786
0.11707918030017124
0.11087441911914263
0.16100137725380315
0.16850520227523094


In [150]:
#Test trunc
y_test_pred_RF[y_test_pred_RF < 0] = 0
y_test_pred_RF[y_test_pred_RF > 1] = 1
y_test_pred_ridge[y_test_pred_ridge < 0] = 0
y_test_pred_ridge[y_test_pred_ridge > 1] = 1
# Ensemble best test
ensemble_test_pred = np.zeros((42157,24))
ensemble_test_pred[:,0:9] = y_test_pred_ridge[:,0:9]
ensemble_test_pred[:,9:24] = y_test_pred_RF[:,9:24]



In [125]:
#don't run, override
#y_test_pred_ridge = np.zeros((19487,24))
#y_test_pred_RF = np.zeros((19487,24))

#y_test_pred_ridge[:,0:5] = y_pred_S_ridge[:]
#y_test_pred_ridge[:,5:9] = y_pred_W_ridge[:]
#y_test_pred_ridge[:,9:23] = y_pred_K_ridge[:]

#y_test_pred_RF[:,0:5] = y_pred_S_RF[:]
#y_test_pred_RF[:,5:9] = y_pred_W_RF[:]
#y_test_pred_RF[:,9:23] = y_pred_K_RF[:]

In [151]:
#ridge
prediction = np.array(np.hstack([np.matrix(test['id']).T, y_test_pred_ridge])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output_ridge.txt', prediction,col, delimiter=',')
#RF
prediction = np.array(np.hstack([np.matrix(test['id']).T, y_test_pred_RF])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output_RF.txt', prediction,col, delimiter=',')
#Ensemble
prediction = np.array(np.hstack([np.matrix(test['id']).T, ensemble_test_pred])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output_ensemble.txt', prediction,col, delimiter=',')

# Test Submission

In [122]:
test_prediction = clf.predict(X_test)
# Truncate predictions between 0 and 1
test_prediction[test_prediction < 0] = 0
test_prediction[test_prediction > 1] = 1

NameError: name 'clf' is not defined

In [16]:
prediction = np.array(np.hstack([np.matrix(test['id']).T, test_prediction])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output.txt', prediction,col, delimiter=',')

In [17]:
print('Train error: {0}'.format(np.sqrt(np.sum(np.array(np.array(clf.predict(X))-y)**2)/ (X.shape[0]*24.0))))



Train error: 0.14432603441695394
