In [1]:
## Loading up all essential libraries
import numpy as np
import pandas as pd
import spacy
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set()

  import pandas.util.testing as tm


In [249]:
nlp = spacy.load('en_core_web_lg') ## Spacy's large vocabulary library

In [446]:
df = pd.read_csv('train.csv') ## loading up the dataset

## Pipeline

1. Run base model

### Base Model

In [6]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

## from sklearn, we import everything we will need
## for modelling, model evaluation and feature extraction

In [7]:
## more models to test being imported
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

In [8]:
from collections import Counter ## will be used to count the number of occurrances of each Part of speech
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer ## shorten words to its root according to the word's part of speech
from nltk.corpus import wordnet ## this library will allow us to find most similar word of a particular word

normalizer = WordNetLemmatizer()
def get_part_of_speech(word):
    """Return the WordNet part-of-speech tag that most synsets of ``word`` carry.

    Every synset returned by ``wordnet.synsets(word)`` is tallied under its
    ``pos()`` tag when that tag is one of "n" (noun), "v" (verb),
    "a" (adjective) or "r" (adverb); any other tag is ignored.

    Returns:
        The tag with the highest tally. Ties are resolved in the order
        "n", "v", "a", "r", so a word with no counted synsets yields "n".
    """
    # Pre-seed the tags in tie-break order: most_common() keeps first-seen order for equal counts.
    tallies = Counter({"n": 0, "v": 0, "a": 0, "r": 0})
    for synset in wordnet.synsets(word):
        tag = synset.pos()
        if tag in tallies:
            tallies[tag] += 1
    best_tag, _ = tallies.most_common(1)[0]
    return best_tag

#def preprocess_text(text):
   # cleaned = re.sub(r'\W+', '', text).lower()
   # tokenized = word_tokenize(cleaned)
   # normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized if len(token) > 1]
   # return normalized

In [9]:
from nltk.tokenize import TweetTokenizer ## we will use this custome TweetTokenizer from nltk to tokenize our text
import string ## from the string class, we will use its built in methods to find punctuations and numeric tokens
punctuation= list(string.punctuation)

In [251]:
## negation
negations = ['aint', 'ain\'t', 'cannot', 'cant', 'can\'t', 'darent', 'didnt', 'didn\'t', 'doesn\'t',
'doesnt', 'dont', 'don\'t', 'hadnt', 'hadn\'t','hardly', 'hasnt', 'hasn\'t', 'havent', 'haven\'t', 'havnt', 'havn\'t',
 'isnt', 'isn\'t', 'lack', 'lacking', 'lacks', 'neither', 'never', 'no', 'nobody', 'none', 'nor', 'not', 
 'nothing', 'nowhere', 'mightnt', 'mustnt' , 'neednt', 'oughtnt', 'shant', 'shouldnt', 'wasnt', 'wasn\'t', 'shouldn\'t',
 'without', 'wouldnt', 'wouldn\'t', '*n’t', 'doesn’t', 'hasn’t', 'wouldn’t', 'shouldn’t', 'wasn’t', 'can’t',
 'don’t', 'didn’t', 'ain’t', 'don\'t', 'won\'t']

# One alternation over every negation term, compiled once.
#  * terms are escaped, so an entry such as '*n’t' is matched literally instead of being
#    parsed as (invalid) regex syntax
#    NOTE(review): '*n’t' looks like an intended wildcard; it is only matched literally here.
#  * longer terms are listed first so the most specific term wins
#  * the lookarounds stop a term from matching inside a longer word ('no' inside 'know')
_NEGATION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(term) for term in sorted(set(negations), key=lambda term: (-len(term), term)))
    + r")(?!\w)"
)


def negation(df):
    """Replace every negation word in a piece of text with 'not'.

    Despite its name, ``df`` is a single text value (the function is meant to
    be used with ``Series.apply``), not a DataFrame.

    Args:
        df: the text to normalise. Non-string values (e.g. NaN) are returned
            unchanged.

    Returns:
        The text with every whole-word occurrence of a term from
        ``negations`` replaced by 'not'. Matching is case-sensitive.
    """
    if not isinstance(df, str):
        return df
    # A single pass replaces all terms at once, so no replacement is lost.
    return _NEGATION_PATTERN.sub('not', df)
            
### Some words that exist in the stopword list contribute to the semantic meaning of a sentence. We were going to use this list of
## negation words to give our tweets a more consistent meaning by turning each negation word into "not". This slightly decreased
## model performance, so we will not use it.

In [447]:
#df['message'] = df['message'].apply(negation) ## neglect using negation
def sentiment_changer(df):
    """Swap topic keywords in ``df['message']`` for sentiment words and return that column.

    The replacements are literal, case-sensitive substring swaps applied in
    order: 'global' -> 'negative', 'climate' -> 'positive', 'MAGA' -> 'negative'.
    The goal is to let bigrams carry more keyword meaning, e.g.
    'global warming' becomes 'negative warming'.

    Note that ``df['message']`` is overwritten on the frame that is passed in.
    """
    keyword_swaps = (
        ('global', 'negative'),
        ('climate', 'positive'),
        ('MAGA', 'negative'),
    )
    for keyword, stand_in in keyword_swaps:
        # Bind the loop values as defaults so each lambda uses its own pair.
        df['message'] = df['message'].apply(
            lambda tweet, keyword=keyword, stand_in=stand_in: tweet.replace(keyword, stand_in)
        )
    return df['message']

In [448]:
df['message'] = sentiment_changer(df)

In [449]:
## create a function that cleans up tweets
def clean(df):
    """Tokenize, filter and lemmatize ``df['message']``; return the resulting column.

    Intermediate results are stored on ``df`` as new columns:
      * 'token' - tokens produced by nltk's TweetTokenizer
      * 'punc'  - 'token' without entries found in ``punctuation``
      * 'dig'   - 'punc' without tokens that are a single digit character
      * 'final' - 'dig' without one-character tokens, each remaining token
                  lemmatized using the part of speech from ``get_part_of_speech``

    Returns:
        ``df['final']``, a column whose cells are lists of tokens.
    """
    tweet_tokenizer = TweetTokenizer()
    single_digits = list(string.digits)

    def drop_punctuation(tokens):
        return [tok for tok in tokens if tok not in punctuation]

    def drop_single_digits(tokens):
        return [tok for tok in tokens if tok not in single_digits]

    def drop_one_char_tokens(tokens):
        # Stopwords are kept, so only one-character tokens are discarded here.
        return [tok for tok in tokens if len(tok) > 1]

    def lemmatize_tokens(tokens):
        return [normalizer.lemmatize(tok, get_part_of_speech(tok)) for tok in tokens]

    df['token'] = df['message'].apply(tweet_tokenizer.tokenize)
    df['punc'] = df['token'].apply(drop_punctuation)
    df['dig'] = df['punc'].apply(drop_single_digits)
    df['final'] = df['dig'].apply(drop_one_char_tokens)
    df['final'] = df['final'].apply(lemmatize_tokens)
    return df['final']

In [450]:
df['final'] = clean(df)

In [451]:
## Attempt to strip digits, hyphens and ellipses from the cleaned tweets.
## NOTE(review): after clean(), every cell of df['final'] is a list of tokens, not a string.
## These Series.replace calls do not appear to change list-valued cells: the X_train preview
## printed further down still contains digits and hyphens (e.g. 'man-made'). Confirm whether
## token-level stripping was intended; if so it belongs inside clean() so that the train and
## test data are processed the same way.
df['final'] = df['final'].replace(to_replace = r"[\d-]", value = '', regex = True)
df['final'] = df['final'].replace(to_replace = "...", value = '')
df['final'] = df['final'].replace(to_replace = "..", value = '')

In [452]:
X = df['final'] ## split dataframe into features and ground truth
y = df['sentiment']

In [453]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
## train test split. we find that a split size of 10% test data gives us the best performance

In [454]:
X_train = list(X_train.apply(' '.join)) ## create a single list of all tweets for both train and test features

In [455]:
X_test = list(X_test.apply(' '.join))

In [456]:
## what our training features look like (first few joined tweets only; the full list is too long to display)
X_train[:5]

['Why do some people still doubt negative warming https://t.co/bv7K6gFrgS https://t.co/CHxym9oAoD',
 'RT @Trial_Watcher1 @DrMartyFox @SheriffClarke @LouDobbs @seanhannity the negative warming scam to tax the air we breathe in order to fund',
 'RT @NatGeoChannel Watch #BeforeTheFlood right here a @LeoDiCaprio travel the world to tackle positive change https://t.co/LkDehj3tNn httÃ',
 'RT @Greenpeace What if positive change be just really big #AprilFools prank https://t.co/RKcZo4nNtf https://t.co/pUtpm0Dp4f',
 "RT @emorwee In which casually speculate that Steve Bannon know positive change will cause chaos and that's sorta what he want https://t…",
 'RT @JasonLastname After war and positive change have kill u all just hope alien visit earth and ride our rollercoasters',
 "don't believe Bella Thorne's and Tyler Posey's relationship exist just a much a Donald Trump doesn't believe negative warming do",
 'RT @theecoheroes Fantastic Beasts Our secret weapon in combat man-made positive change a

In [476]:

#linear svc  ...
#text_clf = Pipeline([('tfidf', TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, token_pattern = r'\w{1,}', strip_accents = 'ascii', ngram_range = (1, 5))), ('clf', LinearSVC(C=0.8) )])
#Logistic
#text_clf = Pipeline([('tfidf', TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, token_pattern = r'\w{1,}', strip_accents = 'unicode', ngram_range = (1, 5))), ('clf', LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1) )])
## active model: TF-IDF features (unigrams + bigrams) fed into an RBF-kernel SVM
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        sublinear_tf=True,
        smooth_idf=True,
        max_df=0.3,
        min_df=5,
        ngram_range=(1, 2),
    )),
    ('clf', svm.SVC(gamma=0.8, C=10)),
])

## The fun part

## we built a model pipeline that uses a TF-IDF vectorizer to weight each word token by its importance and vectorize it, and
## a radial basis function SVM to train our model.
## after a bit of hyperparameter tuning, we found these parameters to work well. An aggressive max_df removes the most commonly occurring words
## in the corpus. This improves our model because, looking at the bag of words in our EDA, the most frequent words carry very little semantic meaning.
## a token pattern of alphanumeric words works out best and stripping all unicode accents also improves the model
## (note: token_pattern and strip_accents are only set in the commented-out pipelines above, not in the active one)
## since our average tweet is 17 words long, an ngram range of 1 to 2 performs best in capturing semantic meaning

## the SVM parameters were chosen because the radial basis function does a better job than a linear SVC at separating
## the regions where the different sentiments lie. The reason could be that our classification is not binary.

In [477]:
text_clf.fit(X_train, y_train) ## fit and train data

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.3, max_features=None,
                                 min_df=5, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=0.8,
  

In [478]:
predictions = text_clf.predict(X_test) ## make predictions on test set

In [479]:
print(confusion_matrix(y_test, predictions)) ## confusion matrix showing magnitude of true positives and false negatives

[[ 64  17  43   2]
 [ 13  96 105  10]
 [  4  35 813  43]
 [  1   6  59 271]]


In [480]:
##after
print(classification_report(y_test, predictions)) ## here we are intrsted in the weighted f1 score

              precision    recall  f1-score   support

          -1       0.78      0.51      0.62       126
           0       0.62      0.43      0.51       224
           1       0.80      0.91      0.85       895
           2       0.83      0.80      0.82       337

    accuracy                           0.79      1582
   macro avg       0.76      0.66      0.70      1582
weighted avg       0.78      0.79      0.78      1582



In [407]:
##before
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          -1       0.78      0.51      0.62       126
           0       0.62      0.43      0.51       224
           1       0.80      0.91      0.85       895
           2       0.83      0.80      0.82       337

    accuracy                           0.79      1582
   macro avg       0.76      0.66      0.70      1582
weighted avg       0.78      0.79      0.78      1582



###### Notes

`strip_accents='ascii'` might work better than `'unicode'` for stripping accents.

### Test

#### clean test data

In [432]:
df_test = pd.read_csv('test.csv') ## load unseen test set

In [433]:
df_test['message'] = sentiment_changer(df_test)

In [434]:
#df['final'] = df['final'].replace(to_replace = r"[\d-]", value = '', regex = True)

In [435]:
df_test['final'] = clean(df_test) ## apply same feature cleaning as with the training set

In [436]:
## apply the same post-clean() replace step as the training cell, targeting the unseen test frame
## (this cell previously re-processed the training frame `df`).
## NOTE(review): df_test['final'] holds token lists at this point, and these Series.replace calls
## do not appear to change list-valued cells; see the matching training cell before relying on them.
df_test['final'] = df_test['final'].replace(to_replace = r"[\d-]", value = '', regex = True)
df_test['final'] = df_test['final'].replace(to_replace = "...", value = '')
df_test['final'] = df_test['final'].replace(to_replace = "..", value = '')

In [437]:
df_test['final'] = list(df_test['final'].apply(' '.join)) ## create a list of our test set

##### make predictions

In [157]:
df

Unnamed: 0,sentiment,message,tweetid,token,punc,dig,final
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car...","[PolySciMajor, EPA, chief, doesn't, think, car..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth...","[It's, not, like, we, lack, evidence, of, anth..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[RT, @RawStory, :, Researchers, say, we, have,...","[RT, @RawStory, Researchers, say, we, have, th...","[RT, @RawStory, Researchers, say, we, have, th...","[RT, @RawStory, Researchers, say, we, have, th..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#TodayinMaker, #, WIRED, :, 2016, was, a, piv...","[#TodayinMaker, WIRED, 2016, was, a, pivotal, ...","[#TodayinMaker, WIRED, 2016, was, a, pivotal, ...","[#TodayinMaker, WIRED, 2016, be, pivotal, year..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[RT, @SoyNovioDeTodas, :, It's, 2016, ,, and, ...","[RT, @SoyNovioDeTodas, It's, 2016, and, a, rac...","[RT, @SoyNovioDeTodas, It's, 2016, and, a, rac...","[RT, @SoyNovioDeTodas, It's, 2016, and, racist..."
...,...,...,...,...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001,"[RT, @ezlusztig, :, They, took, down, the, mat...","[RT, @ezlusztig, They, took, down, the, materi...","[RT, @ezlusztig, They, took, down, the, materi...","[RT, @ezlusztig, They, take, down, the, materi..."
15815,2,RT @washingtonpost: How positive change could ...,17856,"[RT, @washingtonpost, :, How, positive, change...","[RT, @washingtonpost, How, positive, change, c...","[RT, @washingtonpost, How, positive, change, c...","[RT, @washingtonpost, How, positive, change, c..."
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248,"[notiven, :, RT, :, nytimesworld, :, What, doe...","[notiven, RT, nytimesworld, What, does, Trump,...","[notiven, RT, nytimesworld, What, does, Trump,...","[notiven, RT, nytimesworld, What, do, Trump, a..."
15817,-1,RT @sara8smiles: Hey liberals the positive cha...,819732,"[RT, @sara8smiles, :, Hey, liberals, the, posi...","[RT, @sara8smiles, Hey, liberals, the, positiv...","[RT, @sara8smiles, Hey, liberals, the, positiv...","[RT, @sara8smiles, Hey, liberal, the, positive..."


In [438]:
test_predictions = text_clf.predict(df_test['final'])

In [439]:
test_predictions

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [440]:
df_test['sentiment'] = test_predictions

In [441]:
df_test['message'][28]

"RT @insidepositive: Gov. Jerry Brown warns Trump that California won't back down on positive change https://t.co/CO0p9PU3Fd"

In [442]:
df_test.drop(['message', 'punc', 'token', 'dig', 'final'], axis = 1, inplace=True)

In [443]:
df_test.to_csv('base.csv', index=False)

In [444]:
d = pd.read_csv('base.csv')

In [445]:
d['sentiment'].value_counts()

 1    6540
 2    2443
 0    1048
-1     515
Name: sentiment, dtype: int64