In [97]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB

from sklearn import preprocessing,decomposition,metrics
from sklearn.decomposition import TruncatedSVD
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

#### Data Loading And Preperation

    -- Removing hashtags and converting to normal words, appended at end of sentence.
    -- Adding a seperate  method for calculating multi class logloss [multiclass_logloss]

In [2]:
input_data=pd.read_csv("dataset/train.csv")
test_data=pd.read_csv("dataset/test.csv")

In [3]:
input_data.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [4]:
accuracy_metrics={}

In [5]:
input_data['lang'].unique()

array(['en', '-0.0138325017', '-0.9677309496', '-0.3876905537',
       '0.5309553602', '-0.045423609', '0.1210638815', '&gt',
       ' have them delivered!', '-0.7860764746',
       ' very much loved\U0001f970️ …', '0.7885519508', '0.4310598662',
       '0.6034925894', '0.8837056921', '-0.4757848717', '0.1262837865',
       '0.8296402421', '0.2203775303', '-0.0320226838', '-0.0272467108',
       '-0.9022044897', 'I was ten weeks...', '0.4754834129', ' ️',
       '0.7120802873', '0.7493660991', '0.3716244571', '0.4616286043',
       '0.4479350131', ' PROSPERITY KING ZAY I MET YOU WHEN YOU WAS &amp',
       '0.6747864639', '-0.7798220898', '-0.6668237899', '-0.948781497',
       '-0.4189029043', '0.001710524', '0.0903948317', '-0.6936114103',
       '0.3087589587', " here's to !", '-0.7182082972', '0.195401415',
       '-0.0064143617', ' pink Peruvian opal! via', '0.8077853046',
       '0.5129957209', '-0.1651444775', '0.4611910293',
       ' look really confused 🤷\u200d️ i realize I’m A

In [6]:
input_data['lang'].value_counts()

en                            2994
 pink Peruvian opal! via         4
 Find More                       2
&gt                              2
WORLDS OKAYEST MOTHER! &lt       2
                              ... 
0.3716244571                     1
-0.3410990252                    1
-0.4757848717                    1
0.3352327468                     1
0.8644551076                     1
Name: lang, Length: 232, dtype: int64

In [7]:
len(input_data)

3235

In [8]:
input_data['original_text'].isnull().sum()

0

In [9]:
#getting hashtags in a seperate column
def get_hashtags(input_str):
    #print(input_str)
    hash_ls=re.findall(r"#(\w+)", input_str)
    #print(hash_ls)
    hash_replaced_str=input_str.replace("#"," ")
    #print(hash_replaced_str)
    #return [hash_replaced_str,(hash_ls)]
    hash_ls_str=""
    if len(hash_ls)>0:
        hash_ls_str=" ".join(hash_ls).strip()
    
    #return pd.Series([hash_replaced_str,hash_ls_str], index=['comment_wo_hash','hashtags'])
    return hash_replaced_str+" : "+hash_ls_str

#vectorizing 
sample_str="Happy #MothersDay #love#mother to all you amazing mothers "
#new_str,hash_ls=get_hashtags(sample_str)
#new_str,hash_ls
new_str=get_hashtags(sample_str)
new_str

'Happy  MothersDay  love mother to all you amazing mothers  : MothersDay love mother'

In [10]:
input_data['original_text']

0       Happy #MothersDay to all you amazing mothers o...
1       Happy Mothers Day Mum - I'm sorry I can't be t...
2       Happy mothers day To all This doing a mothers ...
3       Happy mothers day to this beautiful woman...ro...
4       Remembering the 3 most amazing ladies who made...
                              ...                        
3230    To all my sisters ,my sisters -in -law and als...
3231    Happy Mother’s Day to all the Mums, Step Mums,...
3232    Happy Mothers Day to the craziest woman I know...
3233    Happy Mother's Day to my amazing wife! We both...
3234    Wishing you all a safe & happy Mothers Day #mo...
Name: original_text, Length: 3235, dtype: object

In [11]:
#input_data[['comment_wo_hash','hashtags']]=input_data.apply(lambda row: get_hashtags(row['original_text']),axis=1)
input_data['comment_wo_hash_n_hashtags']=input_data.apply(lambda row: get_hashtags(row['original_text']),axis=1)

In [12]:
input_data['comment_wo_hash_n_hashtags']

0       Happy  MothersDay to all you amazing mothers o...
1       Happy Mothers Day Mum - I'm sorry I can't be t...
2       Happy mothers day To all This doing a mothers ...
3       Happy mothers day to this beautiful woman...ro...
4       Remembering the 3 most amazing ladies who made...
                              ...                        
3230    To all my sisters ,my sisters -in -law and als...
3231    Happy Mother’s Day to all the Mums, Step Mums,...
3232    Happy Mothers Day to the craziest woman I know...
3233    Happy Mother's Day to my amazing wife! We both...
3234    Wishing you all a safe & happy Mothers Day  mo...
Name: comment_wo_hash_n_hashtags, Length: 3235, dtype: object

In [13]:
y_t=input_data['sentiment_class']

In [14]:
#Train test split==
xtrain, xvalid, ytrain, yvalid = train_test_split(input_data['comment_wo_hash_n_hashtags'].values, y_t, 
                                                  stratify=y_t, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [15]:
print(xtrain.shape)
xvalid.shape

(2911,)


(324,)

In [16]:
import numpy as np
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

# Tf Idf with Logictic regressor

In [17]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [18]:
tvf_dict={}

In [19]:
%%time
tfv.fit(xtrain)

Wall time: 704 ms


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=3, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=1, stop_words='english', strip_accents='unicode',
                sublinear_tf=1, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=1, vocabulary=None)

In [20]:
x_train_tvf_comments_hashtags=tfv.transform(xtrain)
x_val_tvf_comments_hashtags=tfv.transform(xvalid)

In [21]:
#x_train_tvf_comments_hashtags=tfv.transform(input_data['comment_wo_hash_n_hashtags'])
#x_train_tvf_hashtags=tfv.transform(input_data['hashtags'])
x_train_tvf_comments_hashtags

<2911x4383 sparse matrix of type '<class 'numpy.float64'>'
	with 88630 stored elements in Compressed Sparse Row format>

In [22]:
x_val_tvf_comments_hashtags

<324x4383 sparse matrix of type '<class 'numpy.float64'>'
	with 9440 stored elements in Compressed Sparse Row format>

In [23]:
#train a classifier ==> logistic regressor
#simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(x_train_tvf_comments_hashtags, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
y_val_preds=clf.predict(x_val_tvf_comments_hashtags)
#y_val_preds
y_predictions=clf.predict_proba(x_val_tvf_comments_hashtags)

In [25]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, y_predictions))
zx_1=multiclass_logloss(yvalid, y_predictions)

logloss: 1.273 


In [26]:

zx=log_loss(yvalid, y_predictions)
zx

1.0459777482733583

In [27]:
tvf_dict['multiclass_logloss']=zx_1
tvf_dict['logloss']=zx
accuracy_metrics['Tf_Idf_Linear_regressor']=tvf_dict

In [28]:
test_data.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author
0,1.246628e+18,"3. Yeah, I once cooked potatoes when I was 3 y...",en,0,LToddWood
1,1.245898e+18,"Happy Mother's Day to all the mums, step-mums,...",en,0,iiarushii
2,1.244717e+18,"I love the people from the UK, however, when I...",en,0,andreaanderegg
3,1.24573e+18,Happy 81st Birthday Happy Mother’s Day to my m...,en,1,TheBookTweeters
4,1.244636e+18,Happy Mothers day to all those wonderful mothe...,en,0,andreaanderegg


In [29]:
test_data['comment_wo_hash_n_hashtags']=test_data.apply(lambda row: get_hashtags(row['original_text']),axis=1)
x_test_tvf_comments_hashtags=tfv.transform(test_data['comment_wo_hash_n_hashtags'])
#make predictions here
y_test=clf.predict_proba(x_test_tvf_comments_hashtags)
y_test_pred=clf.predict(x_test_tvf_comments_hashtags)

In [30]:
y_test

array([[0.39789397, 0.37594803, 0.226158  ],
       [0.25979267, 0.48841151, 0.25179581],
       [0.21448495, 0.39803834, 0.38747671],
       ...,
       [0.26814156, 0.60116594, 0.1306925 ],
       [0.2658526 , 0.49144289, 0.2427045 ],
       [0.28245283, 0.4514692 , 0.26607797]])

In [31]:
y_test_pred

array([-1,  0,  0, ...,  0,  0,  0], dtype=int64)

In [32]:
accuracy_metrics

{'Tf_Idf_Linear_regressor': {'multiclass_logloss': 1.2727885150962162,
  'logloss': 1.0459777482733583}}

# count Vectorizer with Logistic Rgression (Linear regressor) 

In [33]:
#ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 3), stop_words = 'english')
count_vector = CountVectorizer(analyzer='word',ngram_range=(1, 3), stop_words = 'english')

In [34]:
count_vector.fit(xtrain)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:
x_train_countV=count_vector.transform(xtrain)
x_val_countV=count_vector.transform(xvalid)

In [36]:
clf = LogisticRegression(C=1.0)
clf.fit(x_train_countV, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
y_val_preds=clf.predict(x_val_countV)
#y_val_preds
y_predictions=clf.predict_proba(x_val_countV)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, y_predictions))
zx_1=multiclass_logloss(yvalid, y_predictions)
zx=log_loss(yvalid, y_predictions)
zx

logloss: 1.695 


1.2357009321133792

In [38]:
count_vec_dict={}
count_vec_dict['multiclass_logloss']=zx_1
count_vec_dict['logloss']=zx
accuracy_metrics['Count_vectorizer_Linear_regressor']=count_vec_dict

In [39]:
accuracy_metrics

{'Tf_Idf_Linear_regressor': {'multiclass_logloss': 1.2727885150962162,
  'logloss': 1.0459777482733583},
 'Count_vectorizer_Linear_regressor': {'multiclass_logloss': 1.695275339627342,
  'logloss': 1.2357009321133792}}

In [40]:
x_test_count_vector=count_vector.transform(test_data['comment_wo_hash_n_hashtags'])
#make predictions here
y_test=clf.predict_proba(x_test_count_vector)
y_test_pred=clf.predict(x_test_count_vector)
y_test_pred

array([-1,  0,  1, ...,  0,  0,  0], dtype=int64)

In [41]:
y_test

array([[0.39837861, 0.32432741, 0.27729398],
       [0.24237964, 0.6456318 , 0.11198855],
       [0.12238613, 0.23259882, 0.64501505],
       ...,
       [0.15579101, 0.81238607, 0.03182291],
       [0.26619615, 0.62176289, 0.11204096],
       [0.32444617, 0.4688647 , 0.20668913]])

# Tf-Idf Naive Bayes

In [42]:
nm_clf=MultinomialNB()
nm_clf.fit(x_train_tvf_comments_hashtags, ytrain)
predictions = nm_clf.predict_proba(x_val_tvf_comments_hashtags)

In [43]:
zx_1=multiclass_logloss(yvalid, predictions)
print("logloss: %0.3f " % zx_1)
zx=log_loss(yvalid, predictions)
zx

logloss: 1.510 


1.1096154051858753

In [44]:
tfidf_NB={}
tfidf_NB['multiclass_logloss']=zx_1
tfidf_NB['logloss']=zx
accuracy_metrics['Tf_Idf_Naive_Bayes']=tfidf_NB

In [45]:
accuracy_metrics

{'Tf_Idf_Linear_regressor': {'multiclass_logloss': 1.2727885150962162,
  'logloss': 1.0459777482733583},
 'Count_vectorizer_Linear_regressor': {'multiclass_logloss': 1.695275339627342,
  'logloss': 1.2357009321133792},
 'Tf_Idf_Naive_Bayes': {'multiclass_logloss': 1.5102103818934163,
  'logloss': 1.1096154051858753}}

# Count Vectorizer Naive Bayes

In [46]:
nb_cv_clf=MultinomialNB()
#x_train_countV
#x_val_countV
nb_cv_clf.fit(x_train_countV, ytrain)

y_val_preds=nb_cv_clf.predict(x_val_countV)
#y_val_preds
y_predictions=nb_cv_clf.predict_proba(x_val_countV)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, y_predictions))
zx_1=multiclass_logloss(yvalid, y_predictions)
zx=log_loss(yvalid, y_predictions)
zx


logloss: 9.809 


6.226935517438707

In [47]:
type(y_predictions)


numpy.ndarray

In [48]:
CV_NB={}
CV_NB['multiclass_logloss']=zx_1
CV_NB['logloss']=zx
accuracy_metrics['Count_vector_Naive_Bayes']=CV_NB

# SVM model - simple, without Truncated singular value decomposition and latent semantic analysis

Truncated singular value decomposition and latent semantic analysis : https://scikit-learn.org/stable/modules/decomposition.html#lsa

When truncated SVD is applied to term-document matrices (as returned by CountVectorizer or TfidfVectorizer), this transformation is known as latent semantic analysis (LSA), because it transforms such matrices to a “semantic” space of low dimensionality. In particular, LSA is known to combat the effects of synonymy and polysemy (both of which roughly mean there are multiple meanings per word), which cause term-document matrices to be overly sparse and exhibit poor similarity under measures such as cosine similarity.


Note Most treatments of LSA in the natural language processing (NLP) and information retrieval (IR) literature swap the axes of the matrix  so that it has shape n_features × n_samples. We present LSA in a different way that matches the scikit-learn API better, but the singular values found are the same.

In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

### TF IDF vectorizer 

In [50]:
%%time
print(ytrain.shape)
print(x_train_tvf_comments_hashtags.todense().shape)
#Standard Scaler to standardize data, before applying SVM.
svm_clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
svm_clf.fit(x_train_tvf_comments_hashtags.todense(), ytrain)

(2911,)
(2911, 4383)
Wall time: 10min 59s


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='auto',
                     kernel='rbf', max_iter=-1, probability=True,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [51]:
y_predictions_proba=svm_clf.predict_proba(x_val_tvf_comments_hashtags.todense())
zx_1=multiclass_logloss(yvalid, y_predictions_proba)
zx=log_loss(yvalid, y_predictions_proba)
print(zx_1,zx)

1.2493673873149678 1.019894264732681


In [52]:
Tf_Idf_SVM_basic={}
Tf_Idf_SVM_basic['multiclass_logloss']=zx_1
Tf_Idf_SVM_basic['logloss']=zx
accuracy_metrics['Tf_Idf_SVM_basic']=Tf_Idf_SVM_basic

In [53]:
accuracy_metrics

{'Tf_Idf_Linear_regressor': {'multiclass_logloss': 1.2727885150962162,
  'logloss': 1.0459777482733583},
 'Count_vectorizer_Linear_regressor': {'multiclass_logloss': 1.695275339627342,
  'logloss': 1.2357009321133792},
 'Tf_Idf_Naive_Bayes': {'multiclass_logloss': 1.5102103818934163,
  'logloss': 1.1096154051858753},
 'Count_vector_Naive_Bayes': {'multiclass_logloss': 9.809364848044815,
  'logloss': 6.226935517438707},
 'Tf_Idf_SVM_basic': {'multiclass_logloss': 1.2493673873149678,
  'logloss': 1.019894264732681}}

### Count vectorizer 

In [54]:
x_train_countV.shape , ytrain.shape # x_train_countV, ytrain
#x_val_countV yvalid

((2911, 100708), (2911,))

%%time
'''taking too much time to run. Will add max_iter and verbose as algo may not be converging '''

svm_clf_cv = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
svm_clf_cv.fit(x_train_countV.todense(), ytrain)
#nb_cv_clf.fit(x_train_countV, ytrain)

%%time
y_val_preds=svm_clf_cv.predict_proba(x_val_countV.todense())
#y_val_preds
#y_predictions=nb_cv_clf.predict_proba(x_val_countV)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, y_val_preds))
zx_1=multiclass_logloss(yvalid, y_val_preds)
zx=log_loss(yvalid, y_val_preds)
zx

# TruncatedSVM model - simple Truncated singular value decomposition and latent semantic analysis + Strandadized Data


Truncating svm does not cause a significant decrease in model performance, but a very high improvement in execution time.

### Tf-IDF

In [55]:
%%time
#print(ytrain.shape)
#print(x_train_tvf_comments_hashtags.todense().shape)

#x_train_tvf_comments_hashtags=tfv.transform(xtrain)
#x_val_tvf_comments_hashtags=tfv.transform(xvalid)

#Standard Scaler to standardize data, before applying SVM.
#svm_clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
#svm_clf.fit(x_train_tvf_comments_hashtags.todense(), ytrain)


# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=150)
svd.fit(x_train_tvf_comments_hashtags)

#transforming the Tf-IDF vectors to svd vectors.
xtrain_svd = svd.transform(x_train_tvf_comments_hashtags)
xvalid_svd = svd.transform(x_val_tvf_comments_hashtags)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

Wall time: 939 ms


In [56]:
%%time
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.248 
Wall time: 26.3 s


In [57]:
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

Tf_Idf_trunc_SVM_LSA={}
Tf_Idf_trunc_SVM_LSA['multiclass_logloss']=zx_1
Tf_Idf_trunc_SVM_LSA['logloss']=zx
accuracy_metrics['Tf_Idf_trunc_SVM_LSA']=Tf_Idf_trunc_SVM_LSA

1.2475833991381615 1.0201701849581735


### Count Vectorizer

In [58]:
%%time
svd_cv = decomposition.TruncatedSVD(n_components=150)
svd_cv.fit(x_train_countV)


#x_train_countV=count_vector.transform(xtrain)
#x_val_countV=count_vector.transform(xvalid)

#transforming the Tf-IDF vectors to svd vectors.
xtrain_svd_cv = svd_cv.transform(x_train_countV)
xvalid_svd_cv = svd_cv.transform(x_val_countV)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl_cv = preprocessing.StandardScaler()
scl_cv.fit(xtrain_svd_cv)
xtrain_svd_scl_cv = scl_cv.transform(xtrain_svd_cv)
xvalid_svd_scl_cv = scl_cv.transform(xvalid_svd_cv)

Wall time: 9.67 s


In [59]:
%%time
clf_svm_cv = SVC(C=1.0, probability=True) # since we need probabilities
clf_svm_cv.fit(xtrain_svd_scl_cv, ytrain)
predictions = clf_svm_cv.predict_proba(xvalid_svd_scl_cv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.249 
Wall time: 25.8 s


In [60]:
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

cv_trunc_SVM_LSA={}
cv_trunc_SVM_LSA['multiclass_logloss']=zx_1
cv_trunc_SVM_LSA['logloss']=zx
accuracy_metrics['CV_trunc_SVM_LSA']=cv_trunc_SVM_LSA

1.249167774079908 1.0216142932120083


In [61]:
accuracy_metrics

{'Tf_Idf_Linear_regressor': {'multiclass_logloss': 1.2727885150962162,
  'logloss': 1.0459777482733583},
 'Count_vectorizer_Linear_regressor': {'multiclass_logloss': 1.695275339627342,
  'logloss': 1.2357009321133792},
 'Tf_Idf_Naive_Bayes': {'multiclass_logloss': 1.5102103818934163,
  'logloss': 1.1096154051858753},
 'Count_vector_Naive_Bayes': {'multiclass_logloss': 9.809364848044815,
  'logloss': 6.226935517438707},
 'Tf_Idf_SVM_basic': {'multiclass_logloss': 1.2493673873149678,
  'logloss': 1.019894264732681},
 'Tf_Idf_trunc_SVM_LSA': {'multiclass_logloss': 1.2475833991381615,
  'logloss': 1.0201701849581735},
 'CV_trunc_SVM_LSA': {'multiclass_logloss': 1.249167774079908,
  'logloss': 1.0216142932120083}}

# XGBoost 

### Tf-IDF

In [67]:
import xgboost as xgb
# Fitting a simple xgboost on tf-idf
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf.fit(x_train_tvf_comments_hashtags.tocsc(), ytrain)
#tocsc() :: Compressed Sparse Column format
predictions = xgb_clf.predict_proba(x_val_tvf_comments_hashtags.tocsc())

In [64]:
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

Tf_Idf_XGB={}
Tf_Idf_XGB['multiclass_logloss']=zx_1
Tf_Idf_XGB['logloss']=zx
accuracy_metrics['Tf_Idf_XGB']=Tf_Idf_XGB

1.3760362689233856 1.0965642127909778


#### # Fitting a simple xgboost on tf-idf svd features 

In [72]:
xgb_clf_svd = xgb.XGBClassifier(nthread=10)
#Unscaled
#xtrain_svd
xgb_clf_svd.fit(xtrain_svd, ytrain)
predictions = xgb_clf_svd.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

xgb_svd_tfidf={}
xgb_svd_tfidf['multiclass_logloss']=zx_1
xgb_svd_tfidf['logloss']=zx
accuracy_metrics['xgb_svd_tfidf']=xgb_svd_tfidf

logloss: 1.884 
1.8838025765424524 1.3058248953606335


#### # Fitting a simple xgboost on tf-idf svd features _Scaled features

In [74]:
xgb_clf_svd_scaled = xgb.XGBClassifier(nthread=10)
#Scaled
#xtrain_svd
xgb_clf_svd_scaled.fit(xtrain_svd_scl, ytrain)
predictions = xgb_clf_svd_scaled.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

xgb_svd_tfidf_scaled={}
xgb_svd_tfidf_scaled['multiclass_logloss']=zx_1
xgb_svd_tfidf_scaled['logloss']=zx
accuracy_metrics['xgb_svd_tfidf_scaled']=xgb_svd_tfidf_scaled


logloss: 1.884 
1.8838025765424524 1.3058248953606335


### Count Vectorizer

In [66]:
xgb_clf_cv = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8, nthread=10, learning_rate=0.1)
xgb_clf_cv.fit(x_train_countV.tocsc(), ytrain)
#tocsc() :: Compressed Sparse Column format
predictions_cv = xgb_clf_cv.predict_proba(x_val_countV.tocsc())

In [68]:
zx_1=multiclass_logloss(yvalid, predictions_cv)
zx=log_loss(yvalid, predictions_cv)
print(zx_1,zx)

cv_XGB={}
cv_XGB['multiclass_logloss']=zx_1
cv_XGB['logloss']=zx
accuracy_metrics['Tf_Idf_XGB']=cv_XGB

1.3683314268235807 1.0621958973783034


#### # Fitting a simple xgboost on cv svd features unscaled features

In [75]:
#xtrain_svd_cv
#xvalid_svd_cv

xgb_clf_svd_cv = xgb.XGBClassifier(nthread=10)
#Un-scaled
#xtrain_svd
xgb_clf_svd_cv.fit(xtrain_svd_cv, ytrain)
predictions = xgb_clf_svd_cv.predict_proba(xvalid_svd_cv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

xgb_svd_cv={}
xgb_svd_cv['multiclass_logloss']=zx_1
xgb_svd_cv['logloss']=zx
accuracy_metrics['xgb_svd_cv']=xgb_svd_cv


logloss: 1.869 
1.868590195858736 1.3226814595407541


#### # Fitting a simple xgboost on cv svd features unscaled features

In [76]:
xgb_clf_svd_cv_scaled = xgb.XGBClassifier(nthread=10)
#Scaled
#xtrain_svd
xgb_clf_svd_cv_scaled.fit(xtrain_svd_scl_cv, ytrain)
predictions = xgb_clf_svd_cv_scaled.predict_proba(xvalid_svd_scl_cv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
zx_1=multiclass_logloss(yvalid, predictions)
zx=log_loss(yvalid, predictions)
print(zx_1,zx)

xgb_svd_cv_scaled={}
xgb_svd_cv_scaled['multiclass_logloss']=zx_1
xgb_svd_cv_scaled['logloss']=zx
accuracy_metrics['xgb_svd_cv_scaled']=xgb_svd_cv_scaled

logloss: 1.894 
1.893945956022053 1.3612558731153883


# hyperparameter optimizations on XGBoost

In [80]:
##SCORING FUNCTION
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [86]:
# Initialize SVD
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])

In [87]:
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}

#### Tf-Idf grid search

In [90]:
%%time
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(x_train_tvf_comments_hashtags, ytrain)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   4 out of  24 | elapsed:    5.8s remaining:   29.3s
[Parallel(n_jobs=-1)]: Done   7 out of  24 | elapsed:    5.9s remaining:   14.4s
[Parallel(n_jobs=-1)]: Done  10 out of  24 | elapsed:    6.0s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  13 out of  24 | elapsed:    6.9s remaining:    5.8s
[Parallel(n_jobs=-1)]: Done  16 out of  24 | elapsed:    7.2s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done  19 out of  24 | elapsed:    7.4s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed:    7.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    7.9s finished


Best score: -1.259
Best parameters set:
	lr__C: 0.1
	lr__penalty: 'l1'
	svd__n_components: 120
Wall time: 8.94 s




In [96]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100,250]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=6)

# Fit Grid Search Model
model.fit(x_train_tvf_comments_hashtags, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 6 folds for each of 7 candidates, totalling 42 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  24 out of  42 | elapsed:   20.1s remaining:   15.0s
[Parallel(n_jobs=-1)]: Done  29 out of  42 | elapsed:   20.2s remaining:    9.0s


Best score: -1.283
Best parameters set:
	nb__alpha: 250


[Parallel(n_jobs=-1)]: Done  34 out of  42 | elapsed:   20.3s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  39 out of  42 | elapsed:   20.4s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:   20.5s finished


# Word Vectors 

In [99]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('G:\Projects\Glove Wectors\glove.6B.300d.txt',encoding="utf-8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

313555it [01:33, 3363.62it/s]


KeyboardInterrupt: 

In [None]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())