In [29]:
#read the data
import pandas as ps
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
import math



In [23]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'

train = ps.read_csv(train_path)
test = ps.read_csv(test_path)

test.head()

Unnamed: 0,id,tweet,state,location
0,4,Edinburgh peeps is it sunny?? #weather,,birmingham
1,5,"SEEVERE T’STORM WARNING FOR TROUSDALE, NORTHW...",,Nashville
2,7,@Agilis1 sport or traditional climbing? Thats ...,,Midwest
3,8,#WEATHER: 10:53 am : 63.0F. Feels 61F. 30.07% ...,tennessee,"Nashville, TN, USA"
4,12,We used to use umbrellas to face the bad weath...,,Houston


In [24]:
### use twokenized data
twokenized_train = ps.read_fwf('train_tokenized.txt',header=None)
train['tweet'] = twokenized_train[0]
twokenized_test = []
twokenized_test = ps.read_fwf('ark-tweet-nlp-0.3.2/test_tokenized.txt',header=None)
test['tweet'] = twokenized_test[0]

In [34]:
svd = TruncatedSVD(n_components=24, n_iter=7, random_state=42)
svd.fit(X)
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)


[0.00239629 0.00607721 0.00585773 0.00516153 0.00530749 0.00455613
 0.00403561 0.00372277 0.00365302 0.0035049  0.00322836 0.00309209
 0.00304916 0.00288514 0.00286447 0.0027701  0.00268783 0.00258244
 0.00252332 0.00245024 0.00230378 0.0022814  0.00225264 0.00217269]
0.0814163503053813
[36.37564981 21.60272289 21.25680199 20.34973173 20.2001371  18.70758617
 17.67412347 16.92983807 16.75980384 16.40649894 15.74423752 15.41353904
 15.30417657 14.89687142 14.83510839 14.583515   14.37185378 14.08101022
 13.91968934 13.7169637  13.29977222 13.25120283 13.15284549 12.93629008]


In [26]:
tfidf = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 3))
tfidf.fit(train['tweet'])

# tfidf.get_feature_names()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [32]:
X = tfidf.transform(train['tweet'])
y = np.array(train.iloc[:,4:])
X_test = tfidf.transform(test['tweet'].values.astype('U')) 

X_train, X_val, y_train, y_val = train_test_split(X, y)

print(X_test)

  (0, 9226)	0.12284467665531285
  (0, 7717)	0.49272607970403354
  (0, 7664)	0.23197017089862015
  (0, 6387)	0.5550614714805855
  (0, 4435)	0.41818562136462034
  (0, 4297)	0.17465513595134888
  (0, 4194)	0.37509243740383125
  (0, 4139)	0.18479402441684495
  (1, 9071)	0.2875785522726091
  (1, 9070)	0.24349207781313464
  (1, 7785)	0.44802610636675827
  (1, 7421)	0.4593065751948661
  (1, 5895)	0.4533523459358235
  (1, 3783)	0.35288594297213616
  (1, 2907)	0.14590223115220577
  (1, 1495)	0.3081817044271085
  (2, 9491)	0.23392016992769885
  (2, 9226)	0.06712521262014724
  (2, 8624)	0.183937243742933
  (2, 8469)	0.09034159681793655
  (2, 8040)	0.29738261923793863
  (2, 7909)	0.0732765577740799
  (2, 7908)	0.2610024177995806
  (2, 7901)	0.28344099758955604
  (2, 7869)	0.13890727335792646
  :	:
  (42154, 1074)	0.204861108145556
  (42154, 764)	0.18570914113525172
  (42154, 195)	0.127478883775513
  (42155, 9226)	0.17466838700620282
  (42155, 8748)	0.24919660397411442
  (42155, 7100)	0.33929116230

# Random Forest

In [6]:
clf = RandomForestRegressor()

param_grid = {'max_depth': [None], 'min_samples_split' :[3, 10], 'min_samples_leaf' : [3, 10],
              'criterion':['mse']}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=4)

In [7]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.5293397477735104, total= 1.2min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.5254830720937576, total= 1.3min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5342758554858101, total= 1.1min
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.6min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5307195305996409, total= 1.0min
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.5137308011872684, total=  33.5s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.5140803302643739, total=  33.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5169491409014524, total=  33.8s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5124272792454402, total=  33.3s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.9min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None], 'min_samples_split': [3, 10], 'min_samples_leaf': [3, 10], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [8]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [9]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.16728293590703416

# XGBoost

In [10]:
from sklearn.multioutput import MultiOutputRegressor
!pip install xgboost
from xgboost import XGBRegressor

Collecting xgboost
  Using cached https://files.pythonhosted.org/packages/83/3a/8570f4e8e19acd3a5a75abc920964182a4b64db2ee0f041fb77b48447c6b/xgboost-0.72.tar.gz


No files/directories in C:\Users\Jesse\AppData\Local\Temp\pip-install-jnybtsz_\xgboost\pip-egg-info (from PKG-INFO)


ModuleNotFoundError: No module named 'xgboost'

In [None]:
multioutputregressor = MultiOutputRegressor(XGBRegressor(objective='reg:linear')).fit(X_train, y_train)

In [None]:
mor_pred = multioutputregressor.predict(X_val)
# Truncate
mor_pred[mor_pred > 1] = 1
mor_pred[mor_pred < 0] = 0

In [None]:
rmse = math.sqrt(mean_squared_error(y_val, mor_pred))
rmse # 0.16388598799139095

# Ridge Regression

In [13]:
clf = Ridge()

param_grid = {'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=1)

In [14]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed: 19.2min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [15]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [16]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.15363337043645406

# Ensembling

In [17]:
# Split dataset for S, W and K classifications.
y_train_S = y_train[:,0:5]
y_train_W = y_train[:, 5:9]
y_train_K = y_train[:, 9:23]

In [18]:
# Predict Sentiment
clf.fit(X_train, y_train_S)
y_pred_S = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_S[y_pred_S < 0] = 0
y_pred_S[y_pred_S > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,0:5], y_pred_S))
# rmse

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  4.4min finished


In [19]:
# Predict When
clf.fit(X_train, y_train_W)
y_pred_W = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_W[y_pred_W < 0] = 0
y_pred_W[y_pred_W > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,5:9], y_pred_W))
# rmse

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  4.2min finished


In [20]:
# Predict Kind
clf.fit(X_train, y_train_K)
y_pred_K = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_K[y_pred_K < 0] = 0
y_pred_K[y_pred_K > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,9:23], y_pred_K))
# rmse

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed: 24.9min finished


In [None]:
# Ensemble


Ensemble of 24 classifiers

In [12]:
# Split dataset for S, W and K classifications.
#for sentiment:https://www.ravikiranj.net/posts/2012/code/how-build-twitter-sentiment-analyzer/
#tf-idf and 
y_train_S1 = y_train[:,0]
y_train_S2 = y_train[:,1]
y_train_S3 = y_train[:,2]
y_train_S4 = y_train[:,3]
y_train_S5 = y_train[:,4]
y_train_W1 = y_train[:, 5]
y_train_W2 = y_train[:, 6]
y_train_W3 = y_train[:, 7]
y_train_W4 = y_train[:, 8]
y_train_K1 = y_train[:, 9]
y_train_K2 = y_train[:, 10]
y_train_K3 = y_train[:, 11]
y_train_K4 = y_train[:, 12]
y_train_K5 = y_train[:, 13]
y_train_K6 = y_train[:, 14]
y_train_K7 = y_train[:, 15]
y_train_K8 = y_train[:, 16]
y_train_K9 = y_train[:, 17]
y_train_K10 = y_train[:, 18]
y_train_K11 = y_train[:, 19]
y_train_K12 = y_train[:, 20]
y_train_K13 = y_train[:, 21]
y_train_K14 = y_train[:, 22]
y_train_K15 = y_train[:, 23]

In [None]:
#classifiers
clf1 = 
clf2 =
clf3 = 
clf4 = 
clf5 = 
clf6 = 
clf7 = 
clf8 = 
clf9 = 
clf10 = 
clf11 = 
clf12 = 
clf13 = 
clf14 = 
clf15 = 
clf16 = 
clf17 = 
clf18 = 
clf19 = 
clf20 = 
clf21 = 
clf22 = 
clf23 = 
clf24 = 
classifiers = [clf1,clf2,clf3,clf4,clf5,clf5,clf6,clf7,clf8,clf9,clf10,clf11,clf12,clf13,clf14,clf15,clf16,clf17,clf18,clf19,]classifiers = [clf1,clf2,clf3,clf4,clf5,clf5,clf6,clf7,clf8,clf9,clf10,clf11,clf12,clf13,clf14,clf15,clf16,clf17,clf18,clf19,]classifiers = [clf1,clf2,clf3,clf4,clf5,clf5,clf6,clf7,clf8,clf9,clf10,clf11,clf12,clf13,clf14,clf15,clf16,clf17,clf18,clf19,clf20,clf21,clf22,clf23,clf24]

# Test Submission

In [15]:
test_prediction = clf.predict(X_test)
# Truncate predictions between 0 and 1
test_prediction[test_prediction < 0] = 0
test_prediction[test_prediction > 1] = 1

In [16]:
prediction = np.array(np.hstack([np.matrix(test['id']).T, test_prediction])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output.txt', prediction,col, delimiter=',')

In [17]:
print('Train error: {0}'.format(np.sqrt(np.sum(np.array(np.array(clf.predict(X))-y)**2)/ (X.shape[0]*24.0))))



Train error: 0.14432603441695394
