In [33]:
#read the data
import pandas as ps
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import math
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import wordpunct_tokenize
import re

# nltk.download()


In [2]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'

train = ps.read_csv(train_path)
test = ps.read_csv(test_path)

test.head()

Unnamed: 0,id,tweet,state,location
0,4,Edinburgh peeps is it sunny?? #weather,,birmingham
1,5,"SEEVERE T’STORM WARNING FOR TROUSDALE, NORTHW...",,Nashville
2,7,@Agilis1 sport or traditional climbing? Thats ...,,Midwest
3,8,#WEATHER: 10:53 am : 63.0F. Feels 61F. 30.07% ...,tennessee,"Nashville, TN, USA"
4,12,We used to use umbrellas to face the bad weath...,,Houston


In [42]:
### use twokenized data
twokenized_train = ps.read_fwf('train_tokenized.txt',header=None)
train['tweet'] = twokenized_train[0]
twokenized_test = []
twokenized_test = ps.read_fwf('ark-tweet-nlp-0.3.2/test_tokenized.txt',header=None)
test['tweet'] = twokenized_test[0]

#initialize stemmer
stemmer = SnowballStemmer('english')
### remove stopwords
### don't know if this is useful when using ngrams
ps.options.mode.chained_assignment = None
stop_words = set(stopwords.words('english'))
stop_words.add("rt")
stop_words.add("{")
stop_words.add("}")
stop_words.add("link")
stop_words.add("google")
stop_words.add("facebook")
stop_words.add("twitter")
for i in range(len(train['tweet'])):
    train['tweet'][i] = [x.lower() for x in train['tweet'][i].split() if not x in stop_words] # to lower case
    train['tweet'][i] = " ".join(train['tweet'][i])
    train['tweet'][i] = re.sub(r'(\w)\1{2,}', r'\1\1',test)
    train['tweet'][i] = stemmer.stem(train['tweet'][i])

'disfunct'

In [43]:
print(train['tweet'][100])
print(stemmer.stem(train['tweet'][100]))

freakin killin audio ustream ! :/ freez
freakin killin audio ustream ! :/ freez


In [45]:
tfidf = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 7))
tfidf.fit(train['tweet'])

# tfidf.get_feature_names()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 7), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [46]:
X = tfidf.transform(train['tweet'])
y = np.array(train.iloc[:,4:])
X_test = tfidf.transform(test['tweet'].values.astype('U')) 

X_train, X_val, y_train, y_val = train_test_split(X, y)

print(y_train)

[[0.    0.    0.201 ... 0.    0.    0.   ]
 [0.    0.    0.205 ... 0.    0.    0.   ]
 [0.    0.2   0.    ... 1.    0.    0.   ]
 ...
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.327 0.172 0.    ... 0.    0.    0.   ]
 [0.222 0.385 0.207 ... 0.273 0.    0.   ]]


# LSA

In [None]:
# Apply LSA to training, validation and test data generated by TF-IDF to generate the new datasets.

def LSA(X_train, X_val, X_test):
    lsa = TruncatedSVD(n_components=100)
    lsa.fit(X_train, y_train)
    X_train = lsa.transform(X_train)
    X_val = lsa.transform(X_val)
    X_test = lsa.transform(X_test)

    return X_train, X_val, X_test

# LDA

In [None]:
# Apply LDA directly to the tweet data to generate training, validation and test data.

def LDA():
    tf_vectorizer = CountVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 3))
    tf = tf_vectorizer.fit_transform(train['tweet'])
    lda = LatentDirichletAllocation(n_components = 200, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)   
    tf = lda.transform(tf)

    X_test = tf_vectorizer.transform(test['tweet'].values.astype('U'))
    X_test = lda.transform(X_test)

    X_train, X_val, y_train, y_val = train_test_split(tf, y)
    
    return X_train, X_val, y_train, y_val, X_test

# Random Forest

In [47]:
clf = RandomForestRegressor()

param_grid = {'max_depth': [None], 'min_samples_split' :[3, 10], 'min_samples_leaf' : [3, 10],
              'criterion':['mse']}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=4)

In [48]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.510582705260425, total=  40.4s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.6s remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=3, score=0.514413202634155, total=  40.1s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5191648494986347, total=  35.7s
[CV] criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  criterion=mse, max_depth=None, min_samples_leaf=3, min_samples_split=10, score=0.5204157201319333, total=  36.5s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.49900043369778674, total=  19.6s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=3, score=0.5025388783991691, total=  19.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5004914024755611, total=  19.0s
[CV] criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10 
[CV]  criterion=mse, max_depth=None, min_samples_leaf=10, min_samples_split=10, score=0.5029997648783718, total=  18.9s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None], 'min_samples_split': [3, 10], 'min_samples_leaf': [3, 10], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [49]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [50]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.16860184158151342

# XGBoost

In [None]:
from sklearn.multioutput import MultiOutputRegressor
!pip install xgboost
from xgboost import XGBRegressor

In [None]:
multioutputregressor = MultiOutputRegressor(XGBRegressor(objective='reg:linear')).fit(X_train, y_train)

In [None]:
mor_pred = multioutputregressor.predict(X_val)
# Truncate
mor_pred[mor_pred > 1] = 1
mor_pred[mor_pred < 0] = 0

In [None]:
rmse = math.sqrt(mean_squared_error(y_val, mor_pred))
rmse # 0.16388598799139095

# Ridge Regression

In [51]:
clf = Ridge()

param_grid = {'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]}

clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=1)

In [52]:
clf.fit(X_train, y_train)

Fitting 2 folds for each of 13 candidates, totalling 26 fits


[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  8.4min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [53]:
y_pred = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred[y_pred < 0] = 0
y_pred[y_pred > 1] = 1

In [54]:
rmse = math.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.15631172322618572

# Ensembling

In [None]:
# Split dataset for S, W and K classifications.
y_train_S = y_train[:,0:5]
y_train_W = y_train[:, 5:9]
y_train_K = y_train[:, 9:23]

In [None]:
# Predict Sentiment
clf.fit(X_train, y_train_S)
y_pred_S = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_S[y_pred_S < 0] = 0
y_pred_S[y_pred_S > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,0:5], y_pred_S))
# rmse

In [None]:
# Predict When
clf.fit(X_train, y_train_W)
y_pred_W = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_W[y_pred_W < 0] = 0
y_pred_W[y_pred_W > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,5:9], y_pred_W))
# rmse

In [None]:
# Predict Kind
clf.fit(X_train, y_train_K)
y_pred_K = clf.predict(X_val)
# Truncate predictions between 0 and 1
y_pred_K[y_pred_K < 0] = 0
y_pred_K[y_pred_K > 1] = 1
# rmse = math.sqrt(mean_squared_error(y_val[:,9:23], y_pred_K))
# rmse

In [None]:
# Ensemble


# Test Submission

In [15]:
test_prediction = clf.predict(X_test)
# Truncate predictions between 0 and 1
test_prediction[test_prediction < 0] = 0
test_prediction[test_prediction > 1] = 1

In [16]:
prediction = np.array(np.hstack([np.matrix(test['id']).T, test_prediction])) 
col = '%i,' + '%f,'*23 + '%f'
np.savetxt('data/output.txt', prediction,col, delimiter=',')

In [17]:
print('Train error: {0}'.format(np.sqrt(np.sum(np.array(np.array(clf.predict(X))-y)**2)/ (X.shape[0]*24.0))))



Train error: 0.14432603441695394
