In [1]:
import pandas as pd 
import string
import numpy as np
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk import precision,recall
import itertools

Read the files and put them in a pandas dataset.There are three different types: the train dataset(in order to train the classifier),the development set(in order to tune the hyperparameters of the desired algorithm),the test set(the set we will test the algorithm).

In [2]:
cols = ['id','sentiment','tweet']
tr_files=['twitter-2013train-A.tsv','twitter-2015train-A.tsv','twitter-2016train-A.tsv']
dev_files_te='twitter-2016dev-A.tsv'
dev_files_tr='twitter-2016devtest-A.tsv'
te_files='twitter-2016test-A.tsv'
list_=[]
for file_ in tr_files:
    df = pd.read_csv(file_,names=cols, header=None,sep='\t')
    list_.append(df)
df_train = pd.concat(list_)

df = pd.read_csv(te_files,names=cols, header=None,sep='\t')
df_test=df
    
df = pd.read_csv(dev_files_tr,names=cols, header=None,sep='\t')
df_dev_tr = df

df = pd.read_csv(dev_files_te,names=cols, header=None,sep='\t')
df_dev_te = df

print 'Total train tweets read ',len(df_train)
print 'Total test tweets read ',len(df_test)
print 'Total dev train tweets read ',len(df_dev_tr)
print 'Total dev test tweets read ',len(df_dev_te)

Total train tweets read  16045
Total test tweets read  20342
Total dev train tweets read  2000
Total dev test tweets read  489


Keep only the datasets that their tweets are available.Remove the ones that their tweet text says 'Not available'.

In [3]:
df_train=df_train[df_train.tweet!='Not Available']
df_test=df_test[df_test.tweet!='Not Available']
df_dev_te=df_dev_te[df_dev_te.tweet!='Not Available']
df_dev_tr =df_dev_tr[df_dev_tr.tweet!='Not Available']
print 'Sample of train data:',df_train.head()
print 'Sample of test data:',df_test.head()

Sample of train data:                    id sentiment  \
0  264183816548130816  positive   
3  264249301910310912  negative   
6  264105751826538497  positive   
7  264094586689953794  negative   
9  254941790757601280  negative   

                                               tweet  
0  Gas by my house hit $3.39!!!! I'm going to Cha...  
3  Iranian general says Israel's Iron Dome can't ...  
6  with J Davlar 11th. Main rivals are team Polan...  
7  Talking about ACT's &amp;&amp; SAT's, deciding...  
9  They may have a SuperBowl in Dallas, but Dalla...  
Sample of test data:                    id sentiment  \
0  619950566786113536   neutral   
1  619969366986235905   neutral   
3  619974445185302528   neutral   
4  619987808317407232  positive   
5  619994586182619136  positive   

                                               tweet  
0  Picturehouse's, Pink Floyd's, 'Roger Waters: T...  
1  Order Go Set a Watchman in store or through ou...  
3  If you could ask an onstage interview

Print the number of sentimens (positive,negative,neutral) that exist in each dataset.

In [4]:
print 'Sentiments in train:',df_train.sentiment.value_counts()
print 'Sentiments in test:',df_test.sentiment.value_counts()
print 'Sentiments in dev train test:',df_dev_tr.sentiment.value_counts()
print 'Sentiments in dev test:',df_dev_te.sentiment.value_counts()

Sentiments in train: neutral     5143
positive    5117
negative    1658
Name: sentiment, dtype: int64
Sentiments in test: neutral     7727
positive    5439
negative    2271
Name: sentiment, dtype: int64
Sentiments in dev train test: positive    777
neutral     539
negative    246
Name: sentiment, dtype: int64
Sentiments in dev test: neutral     199
positive    116
negative     41
Name: sentiment, dtype: int64


Keep only the neccessary columns and drop id column.

In [5]:
d_tr=df_train[['tweet','sentiment']]
d_te=df_test[['tweet','sentiment']]
dev_tr=df_dev_tr[['tweet','sentiment']]
dev_te=df_dev_te[['tweet','sentiment']]

Remove tweets that their text has any useless (for sentiment analysis purposes) mentions to other users,html links,hashtag symbol from hashtags and RT (retweets).

In [6]:
stopwords_set = set(stopwords.words("english"))
def clean_text(dataset):
    tweets=[]
    for index, row in dataset.iterrows():
        words_filtered = [e.lower() for e in row.tweet.split() if len(e) >= 3]
        words_cleaned = [word.translate(None, string.punctuation) for word in words_filtered
            if 'http' not in word
            and not word.startswith('@')
            and not word.startswith('#')
            and word != 'RT'
            and word not in stopwords_set]
        words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
        tweets.append((words_cleaned,row.sentiment))
    return tweets
        
train=clean_text(d_tr)
dev_tr=clean_text(dev_tr)
dev_te=clean_text(dev_te)
test=clean_text(d_te)

Train the classifier with the development set to find the best hyperparameters of the algorithm. Pipeline will execute the functions sequentially.We fit the dev train data to MultinomialNB classifier.The hyperparameters to tune the algorithm are :

ngram_range: The vocabulary of the algorithm will contain (1,1)->unigrams,(1,2)-> unigram and bigrams,(1,3)->unigrams,bigrams and trigrams.It is an argument pass to CountVectorizer function that converts a collection of texts to a matrix of token counts.

use_idf:If this is true,it uses the result of CountVectorizer into TfidfTransformer that transforms a count matrix to a normalized tf or tf-idf representation.Otherwise,this is skipped.

alpha:Additive (Laplace/Lidstone) smoothing parameter.

fit_prior: Whether to learn class prior probabilities or not. If false, a uniform prior will be used.


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

text=[' '.join(x[0]) for x in dev_tr]
sent=[x[1] for x in dev_tr]
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
clf.fit(text,sent)
parameters = {'vect__ngram_range': [(1, 1), (1, 2),(1,3),(2,3),(2,4)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (0.1,0.01,0.001),
              'clf__fit_prior':(True,False),
}
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(text,sent)
print 'MultinomialNB has best score of ',gs_clf.best_score_,' and best parameters ',gs_clf.best_params_
test_text=[' '.join(x[0]) for x in dev_te]
target=[x[1] for x in dev_te]
result=clf.predict(test_text)

print(metrics.classification_report(target, result,['positive','negative','neutral']))
metrics.confusion_matrix(target,result)
print 'Accuracy ',accuracy_score(result,target)

MultinomialNB has best score of  0.446862996159  and best parameters  {'vect__ngram_range': (2, 3), 'clf__fit_prior': True, 'tfidf__use_idf': True, 'clf__alpha': 0.1}
             precision    recall  f1-score   support

   positive       0.34      0.98      0.50       116
   negative       0.00      0.00      0.00        41
    neutral       0.69      0.06      0.10       199

avg / total       0.49      0.35      0.22       356

Accuracy  0.351123595506


  'precision', 'predicted', average, warn_for)


Use the optimal parameters in order to fit the training data to the MultinomialNB and then apply the test set and measure the results.

In [8]:
clf=[]
if gs_clf.best_params_['tfidf__use_idf']==True:
    clf = Pipeline([('vect', CountVectorizer(gs_clf.best_params_['vect__ngram_range'])),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(gs_clf.best_params_['clf__alpha'])),
])
else:
    clf = Pipeline([('vect', CountVectorizer(gs_clf.best_params_['vect__ngram_range'])),
                     ('clf', MultinomialNB(gs_clf.best_params_['clf__alpha'])),
])
text=[' '.join(x[0]) for x in train]
sent=[x[1] for x in train]

clf.fit(text,sent)
print 'Finished the fitting.'

Finished the fitting.


In [9]:
test_text=[' '.join(x[0]) for x in test]
target=[x[1] for x in test]
result=clf.predict(test_text)

Calculate various metric for the algorithm:

precision: counts the true positives(how useful the results are).If the algorithm return 10 positive tweets and 5 of them are positive,the precision is 5/10=0.5.

recall: counts the false positives(how complete the results are).If the algorithm return 10 positive tweets and 5 of them are indeed positive,while all of the positive tweets are 20 then the recall is 10/20.

f1-score:The F1 score is the harmonic average of the precision and recall. Measures test accuracy of the algorithm.

In [10]:
print(metrics.classification_report(target, result,['positive','negative','neutral']))
metrics.confusion_matrix(target,result)
print 'Accuracy ',accuracy_score(result,target)

             precision    recall  f1-score   support

   positive       0.48      0.67      0.56      5439
   negative       0.50      0.19      0.28      2271
    neutral       0.60      0.54      0.57      7727

avg / total       0.54      0.54      0.52     15437

Accuracy  0.536891883138


<h4>I decided to use the MultinomialNB algorithm as it is the one the is generally most appropriate for text sentiment analysis as it produced slightly better results without requiring excessive amount of time to be run. The features are selected based on the GridSearch and in most occassions using bigram and trigram helped improved the overal performance of the algorithm.</h4>