In [1]:
# Imports: data handling, text features, models and evaluation helpers
import math

import numpy as np
import pandas as ps
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV



In [2]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_path, test_path = 'data/train.csv', 'data/test.csv'

# Read both splits into DataFrames.
train, test = (ps.read_csv(csv_path) for csv_path in (train_path, test_path))

# Show the first rows of the test split.
test.head()

Unnamed: 0,id,tweet,state,location
0,4,Edinburgh peeps is it sunny?? #weather,,birmingham
1,5,"SEEVERE T’STORM WARNING FOR TROUSDALE, NORTHW...",,Nashville
2,7,@Agilis1 sport or traditional climbing? Thats ...,,Midwest
3,8,#WEATHER: 10:53 am : 63.0F. Feels 61F. 30.07% ...,tennessee,"Nashville, TN, USA"
4,12,We used to use umbrellas to face the bad weath...,,Houston


In [3]:
### use twokenized data
# Overwrite the raw 'tweet' column of both frames with the first column parsed from the
# pre-tokenized text files.
# NOTE(review): read_fwf infers fixed-width column boundaries from the file contents; this
# presumably relies on each file parsing into a single column that holds one whole tweet
# per line -- TODO confirm against the tokenizer output.
twokenized_train = ps.read_fwf('train_tokenized.txt', header=None)
train['tweet'] = twokenized_train[0]
twokenized_test = ps.read_fwf('ark-tweet-nlp-0.3.2/test_tokenized.txt', header=None)
test['tweet'] = twokenized_test[0]

In [11]:
# Vectorizer settings: accent-stripped word n-grams of length 1 to 3, vocabulary capped
# at 10,000 features.
tfidf_options = dict(
    max_features=10000,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
)
tfidf = TfidfVectorizer(**tfidf_options)
# Learn the vocabulary and IDF weights from the training tweets.
tfidf.fit(train['tweet'])

# tfidf.get_feature_names()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
# Training features: TF-IDF weights from the vectorizer fitted above.
X = tfidf.transform(train['tweet'])
# Targets: every column of the training frame from position 4 onwards.
y = np.array(train.iloc[:, 4:])
# Test features; the values are cast to unicode strings before vectorizing
# (presumably to cope with non-string entries in the test tweets -- TODO confirm).
X_test = tfidf.transform(test['tweet'].values.astype('U'))

# Hold out a validation split. The fixed seed keeps the split, and therefore every
# validation score computed below, identical across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

print(y_train)

[[0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    1.    ... 0.    0.    0.209]
 [0.    0.    0.6   ... 0.4   0.    0.   ]
 ...
 [0.    0.    1.    ... 0.    0.193 0.   ]
 [0.    0.193 0.404 ... 0.    0.    0.   ]
 [0.    1.    0.    ... 0.    0.    0.   ]]


# Random Forest

In [None]:
# Base estimator; the seed makes the forest's random sampling repeatable between runs.
clf = RandomForestRegressor(random_state=42)

# 'criterion' is intentionally not part of the grid: squared error is already the default
# split criterion, and its name differs between scikit-learn releases ('mse' in older
# ones, 'squared_error' in newer ones), so spelling it out ties the notebook to one release.
param_grid = {'max_depth': [None], 'min_samples_split': [3, 10], 'min_samples_leaf': [3, 10]}

# 2-fold grid search over the split/leaf size settings.
clf = GridSearchCV(clf, param_grid=param_grid, cv=2, verbose=4)

In [None]:
# Run the grid search on the training split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and clamp every value into the [0, 1] range.
y_pred = np.clip(clf.predict(X_val), 0, 1)

In [None]:
# Root-mean-squared error of the clamped predictions on the validation split.
val_mse = mean_squared_error(y_val, y_pred)
rmse = math.sqrt(val_mse)
rmse

# XGBoost

In [None]:
from sklearn.multioutput import MultiOutputRegressor
!pip install xgboost
from xgboost import XGBRegressor

In [None]:
# Fit one independent XGBoost regressor per target column.
# 'reg:squarederror' is the current name of the squared-error objective; the previously
# used alias 'reg:linear' is deprecated in XGBoost.
multioutputregressor = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror')).fit(X_train, y_train)

In [None]:
# Predict on the held-out split and clamp every value into the [0, 1] range.
mor_pred = np.clip(multioutputregressor.predict(X_val), 0, 1)

In [None]:
# Root-mean-squared error of the clamped XGBoost predictions on the validation split.
val_mse = mean_squared_error(y_val, mor_pred)
rmse = math.sqrt(val_mse)
rmse # 0.16388598799139095

# Ridge Regression

In [13]:
# Candidate regularisation strengths, listed from strongest to weakest.
alphas = [25, 10, 5, 2.5, 1, 0.75, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.001]
param_grid = {'alpha': alphas}

# 2-fold grid search wrapped around a default Ridge model.
clf = GridSearchCV(Ridge(), param_grid=param_grid, cv=2, verbose=1)

In [14]:
# Run the alpha grid search on the training split.
clf.fit(X=X_train, y=y_train)

0.16302219221329986

In [None]:
# Predict on the held-out split and clamp every value into the [0, 1] range.
y_pred = np.clip(clf.predict(X_val), 0, 1)

In [None]:
# Root-mean-squared error of the clamped predictions on the validation split.
val_mse = mean_squared_error(y_val, y_pred)
rmse = math.sqrt(val_mse)
rmse

# Ensembling

In [None]:
# Split the training targets into the S (sentiment), W (when) and K (kind) label groups.
# The notebook works with 24 target columns (see the submission format and the train-error
# cell): 5 S + 4 W + 15 K. The K slice therefore has to run to column 24; the previous
# upper bound of 23 silently dropped the last K label.
y_train_S = y_train[:, 0:5]
y_train_W = y_train[:, 5:9]
y_train_K = y_train[:, 9:24]

In [None]:
# Predict Sentiment
# Fit an unfitted copy of the search instead of `clf` itself: refitting `clf` on a subset
# of the targets would replace the all-target model that the test-submission cell
# predicts with.
clf_S = clone(clf)
clf_S.fit(X_train, y_train_S)
# Clamp the validation predictions into the [0, 1] range.
y_pred_S = np.clip(clf_S.predict(X_val), 0, 1)
# rmse = math.sqrt(mean_squared_error(y_val[:,0:5], y_pred_S))
# rmse

In [None]:
# Predict When
# Fit an unfitted copy of the search instead of `clf` itself: refitting `clf` on a subset
# of the targets would replace the all-target model that the test-submission cell
# predicts with.
clf_W = clone(clf)
clf_W.fit(X_train, y_train_W)
# Clamp the validation predictions into the [0, 1] range.
y_pred_W = np.clip(clf_W.predict(X_val), 0, 1)
# rmse = math.sqrt(mean_squared_error(y_val[:,5:9], y_pred_W))
# rmse

In [None]:
# Predict Kind
# Fit an unfitted copy of the search instead of `clf` itself: refitting `clf` on a subset
# of the targets would replace the all-target model that the test-submission cell
# predicts with.
clf_K = clone(clf)
clf_K.fit(X_train, y_train_K)
# Clamp the validation predictions into the [0, 1] range.
y_pred_K = np.clip(clf_K.predict(X_val), 0, 1)
# The slice width follows the prediction so both arrays always have the same columns.
# rmse = math.sqrt(mean_squared_error(y_val[:, 9:9 + y_pred_K.shape[1]], y_pred_K))
# rmse

In [None]:
# Ensemble


# Test Submission

In [15]:
# Predict the test set and clamp every value into the [0, 1] range.
test_prediction = np.clip(clf.predict(X_test), 0, 1)

In [16]:
# Submission rows: the tweet id followed by one column per predicted target.
# column_stack builds the same 2-D float array as the earlier np.matrix/hstack detour
# without going through the discouraged np.matrix class.
prediction = np.column_stack([test['id'].values, test_prediction])
# Row format: one integer field for the id, then one float field per prediction column.
# The count is taken from the array rather than hard-coded, so the format always matches
# the data being written (for 24 targets it is the same string as before).
col = ','.join(['%i'] + ['%f'] * test_prediction.shape[1])
np.savetxt('data/output.txt', prediction, col, delimiter=',')

In [17]:
# RMSE of the raw (unclamped) predictions over the full training matrix,
# averaged over rows x 24 target columns.
residuals = np.asarray(clf.predict(X)) - y
train_rmse = np.sqrt(np.sum(residuals ** 2) / (X.shape[0] * 24.0))
print('Train error: {0}'.format(train_rmse))



Train error: 0.14432603441695394
