## Reading the file with pandas

In [2]:
import pandas as pd

In [3]:
colnames = ["sentiment", "tweet"]

In [4]:
tweets = pd.read_csv('twitter-2013train-A.tsv', sep='\t', header=None, usecols=[1,2], names=colnames)

In [5]:
tweets = tweets.drop(tweets[tweets["tweet"] == "Not Available"].index)

In [6]:
len(tweets)

7049

In [7]:
mapping = {'positive': 1, 'negative': -1, 'neutral': 0 }

In [8]:
tweets = tweets.replace({'sentiment': mapping})

In [9]:
len(tweets)

7049

In [10]:
X = tweets["tweet"]

In [11]:
y = tweets["sentiment"]

## Read all files in one df

In [12]:
tweets2015 = pd.read_csv('twitter-2015train-A.tsv', sep='\t', header=None, usecols=[1,2], names=colnames)

In [13]:
tweets2015 = tweets2015.drop(tweets2015[tweets2015["tweet"] == "Not Available"].index)

In [14]:
tweets2015 = tweets2015.replace({'sentiment': mapping})

In [15]:
len(tweets2015)

356

In [16]:
tweets2016 = pd.read_csv('twitter-2016train-A.tsv', sep='\t', header=None, usecols=[1,2], names=colnames)

In [17]:
tweets2016 = tweets2016.drop(tweets2016[tweets2016["tweet"] == "Not Available"].index)

In [18]:
tweets2016 = tweets2016.replace({'sentiment': mapping})

In [19]:
len(tweets2016)

4513

In [20]:
type(tweets2016)

pandas.core.frame.DataFrame

In [21]:
all_tweets = pd.concat([tweets, tweets2015, tweets2016])

In [22]:
len(all_tweets)

11918

In [23]:
#all_tweets

In [24]:
X = all_tweets["tweet"]

In [25]:
y = all_tweets["sentiment"]

## Create vector from text

### Bow

In [26]:
from nltk.stem.snowball import SnowballStemmer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
stemmer = SnowballStemmer('english')

In [29]:
from nltk.corpus import stopwords
import string

In [30]:
sw = set(stopwords.words('english')) 
punctuation = set(string.punctuation)

In [31]:
def isStopWord(word): 
    return word in sw or word in punctuation 

In [32]:
def word_tokenize(text):
    words = text.split(' ')
    return [stemmer.stem(word) for word in words if not isStopWord(word.lower())]

In [33]:
vect1 = CountVectorizer(stop_words='english', analyzer=word_tokenize)

In [34]:
res = vect1.fit_transform(X)

In [35]:
res.shape

(11918, 37472)

In [36]:
res[0,:].mean()

0.000266865926558497

## Tf-idf

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
vect_tfidf = TfidfVectorizer(stop_words='english', analyzer=word_tokenize)

In [39]:
res = vect_tfidf.fit_transform(X)

In [40]:
res.shape

(11918, 37472)

In [41]:
type(res)

scipy.sparse.csr.csr_matrix

In [42]:
res[0.:].mean()

9.0301423945613995e-05

## Machine learning algorithm

#### using bow

In [43]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
y.value_counts(normalize=True)

 0    0.431532
 1    0.429351
-1    0.139117
Name: sentiment, dtype: float64

In [45]:
params = {'vect__ngram_range':[(1,1), (1,2)], \
          'vect__max_features': [1000, 10000],\
          'vect__stop_words': [None, 'english']}

In [46]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer()), ('classify', MultinomialNB())])

In [47]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipeline, params)

In [48]:
gridsearch.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproces...=None, vocabulary=None)), ('classify', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'vect__stop_words': [None, 'english'], 'vect__max_features': [1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
print gridsearch.best_score_, gridsearch.best_params_

0.5217318342 {'vect__ngram_range': (1, 1), 'vect__stop_words': 'english', 'vect__max_features': 1000}


#### using tf-idf

In [50]:
pipeline_tfidf = Pipeline([('vect', TfidfVectorizer()), ('classify', MultinomialNB())])

In [51]:
gridsearch_tfidf = GridSearchCV(pipeline_tfidf, params)

In [52]:
gridsearch_tfidf.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True...        vocabulary=None)), ('classify', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'vect__stop_words': [None, 'english'], 'vect__max_features': [1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
print gridsearch_tfidf.best_score_, gridsearch_tfidf.best_params_

0.526178889075 {'vect__ngram_range': (1, 2), 'vect__stop_words': None, 'vect__max_features': 1000}


#### Using bow and tf-idf together

In [54]:
from sklearn.pipeline import FeatureUnion

In [55]:
featurizer = FeatureUnion( [('tfidf_vect', TfidfVectorizer()), ('count_vect', CountVectorizer())])

In [56]:
res = featurizer.fit_transform(X)

In [57]:
res.shape

(11918, 55268)

In [58]:
res[0,:].mean()

0.00027500518009579319

In [59]:
params = {'featurizer__count_vect__ngram_range':[(1,1), (1,2)], \
          'featurizer__count_vect__max_features': [1000, 10000],\
          'featurizer__count_vect__stop_words': [None, 'english'],\
          'featurizer__tfidf_vect__ngram_range':[(1,1), (1,2)], \
          'featurizer__tfidf_vect__max_features': [1000, 10000],\
          'featurizer__tfidf_vect__stop_words': [None, 'english']}

In [60]:
pipeline = Pipeline([('featurizer', featurizer), ('clasify', MultinomialNB())])

In [61]:
gridsearch = GridSearchCV(pipeline, params)

In [62]:
gridsearch.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('featurizer', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf_vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
     ...transformer_weights=None)), ('clasify', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'featurizer__tfidf_vect__max_features': [1000, 10000], 'featurizer__tfidf_vect__stop_words': [None, 'english'], 'featurizer__count_vect__stop_words': [None, 'english'], 'featurizer__count_vect__ngram_range': [(1, 1), (1, 2)], 'featurizer__count_vect__max_features': [1000, 10000], 'featurizer__tfidf_vect__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [63]:
print gridsearch.best_score_, gridsearch.best_params_

0.529283436818 {'featurizer__tfidf_vect__max_features': 10000, 'featurizer__tfidf_vect__stop_words': None, 'featurizer__count_vect__stop_words': None, 'featurizer__count_vect__ngram_range': (1, 2), 'featurizer__count_vect__max_features': 1000, 'featurizer__tfidf_vect__ngram_range': (1, 2)}


## Testing with test set

In [64]:
tweets2016_test = pd.read_csv('twitter-2016test-A.tsv', sep='\t', header=None, usecols=[1,2], names=colnames)

In [65]:
tweets2016_test = tweets2016_test.drop(tweets2016_test[tweets2016_test["tweet"] == "Not Available"].index)

In [66]:
tweets2016_test = tweets2016_test.replace({'sentiment': mapping})

In [67]:
len(tweets2016_test)

6353

In [68]:
X = tweets2016_test["tweet"]

In [69]:
len(X)

6353

In [70]:
y = tweets2016_test["sentiment"]

In [71]:
len(y)

6353

In [72]:
y_pred = gridsearch.predict(X)
gridsearch.predict(["flying with @united is not a great experience"])


array([1])

In [73]:
from sklearn.metrics import classification_report
print classification_report(y, y_pred)

             precision    recall  f1-score   support

         -1       0.37      0.33      0.35       987
          0       0.65      0.60      0.62      3292
          1       0.54      0.63      0.58      2074

avg / total       0.57      0.57      0.57      6353



## Briefly justification for the choices made:
#### It is used the naive bayes classifier due to the fact we have three categories (positive, neutral, negative) and this classifier can support this. Also due to the fact that we have a huge number of features, naive bayes is a good choice.
#### We used a combination of bow and tfidf features. This choice was made in order to have many more features that classifier can use.

## Using word embedings (TODO)

## Question B: Applying an existing application for sentiment analysis on twitter data.
### (only test examples).

In [74]:
import pattern

In [75]:
from pattern.en import sentiment

In [76]:
print(X)

0       Is Caitlyn Jenner finally ready to put on a ba...
1       WWE Raw results from Seattle, Washington (Augu...
2       Just got done watching wwe Monday night raw I'...
3       Just got done watching wwe Monday night raw I'...
5       Moto G (3rd Gen) Review: The new Moto G is one...
7       Mets: David Wright 1-for-3 with walk and run s...
9       Forever jealous of those who are going to the ...
10      New York Mets 3B David Wright went 1-for-3 wit...
13      Coach Quesada at 1st getting a little air time...
14      I may be sleep deprived but I promise you that...
15      I'm going to Dunkin tomorrow and ordering 10 i...
16      Didn't wake up with Frank Ocean's voice, imma ...
17      #RAW looks like Rollins got himself a date w/ ...
19      @soyeounnie in sorry but my aunt listens to al...
20      I finished first at TimePlay when I saw Ant-Ma...
21      Wife is trying to sell her Sam Smith tickets f...
22      #Mets 3B David Wright went 1 for 3 with a walk...
23      oh my 

In [77]:
y_pred = []
for tweet in X:
    polarity, subjectivity = sentiment(tweet)
    if polarity >= 0.30:
        y_pred.append(1)
    elif polarity >= -0.70:
        y_pred.append(0)
    else:
        y_pred.append(-1)

In [78]:
len(y_pred), len(X),len(y)

(6353, 6353, 6353)

In [79]:
print classification_report(y, y_pred)

             precision    recall  f1-score   support

         -1       0.54      0.03      0.06       987
          0       0.57      0.84      0.68      3292
          1       0.56      0.38      0.45      2074

avg / total       0.56      0.56      0.51      6353



#### We observe that our custome classifier performs slightly better.