# Sentiment Analysis

## Imports

In [19]:
import sys
import os
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (precision_score, 
                             recall_score, 
                             accuracy_score, 
                             classification_report, 
                             confusion_matrix)

We need some code that lives in the parent directory of this notebook. In order for the imports to work, I will add the
parent directory to the system path.

In [20]:
# Display the current working directory; the next cell derives the project root from it.
os.getcwd()

'/home/hat/dev-env/10Acadamy/week_0/Twitter-Data-Analysis/notebooks'

In [21]:
# Put the project root on sys.path so the project modules imported in the
# following cell can be resolved from inside the notebooks folder.
cwd = os.getcwd()
# Only step up one level when we really are inside a 'notebooks' folder.
# str.replace('/notebooks', '') would strip *every* '/notebooks' substring
# (e.g. '/home/notebooks_old/proj/notebooks') and never matches '\\' separators.
parent = os.path.dirname(cwd) if os.path.basename(cwd) == 'notebooks' else cwd
# Guard the insert so re-running this cell does not stack duplicate entries.
if parent not in sys.path:
    sys.path.insert(0, parent)

In [22]:
from extract_dataframe import read_json
from extract_dataframe import TweetDfExtractor
from clean_tweets_dataframe import CleanTweets
from tweets_preprocess import SADataPreparation
from utils import DataLoader

## Data Preparation

Here, what we have are tweets. I will be considering each tweet as a document.
Let's follow the following steps:
- Read the cached CSV
- Use the `SADataPreparation` class to prepare the data

In [23]:
# Load the cached tweet CSV through the project's DataLoader helper.
# NOTE(review): '../' is presumably the directory that contains the file
# (the project root) — confirm against utils.DataLoader.
filename = 'processed_tweet_data.csv'
loader = DataLoader('../', filename)
df = loader.read_csv()
# Uncomment for a quick look at the loaded frame:
# print(df.shape)
# print(df.columns)
# print(df.info())

## Data Cleaner

We have imported the `CleanTweets` class for this task. We will use the `run_pipeline` method to automate the cleaning

In [24]:
# Run the whole cleaning pipeline on the loaded frame in a single call.
# NOTE(review): save_csv=True presumably also writes the cleaned frame to
# disk as a side effect — confirm in CleanTweets.run_pipeline.
cleaner = CleanTweets()
cleaned_df = cleaner.run_pipeline(df, save_csv=True)
# Uncomment to inspect the cleaned frame:
# cleaned_df.info()

Automation in Action...!!!


## Data Preparation

The data needs to be in an X ---> Y format.
The X would be the clean_text and Y would be a label calculated 
from polarity and subjectivity

This is exactly what the `SADataPreparation` class will do for us.


In [25]:
# Get the train/test split of the labeled data.
# drop_neutral=False is passed explicitly; presumably this keeps neutral
# tweets as their own class — confirm in SADataPreparation.prepare_features.
X_train, X_test, y_train, y_test = SADataPreparation().prepare_features(cleaned_df, drop_neutral=False)
# Display the size of each part; features and labels should match pairwise.
len(X_train), len(y_train), len(X_test), len(y_test)

Pre-Processing the Tweets


(8531, 8531, 2133, 2133)

After getting the labeled data, we will vectorize it into a form a model can understand. Here I will use a count vector and a term-frequency inverse-document-frequency (TF-IDF) representation of our data and see which is better for the task. I will also be using both a unigram and a bigram vectorization of the labeled data.

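In total I will have six data formats: raw counts, term frequency (TF), and TF-IDF, each built from unigrams only and from unigrams plus bigrams.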

But first we need a base model to test with; I will use an SGDClassifier.
I am also using the methods specified in this blog: https://machinelearningmastery.com/grid-search-data-preparation-techniques/

In [26]:
def get_pipelines(base_model):
    """Build one candidate pipeline per text-vectorization scheme.

    Six schemes are produced, in this order: raw counts, term frequency (TF)
    and TF-IDF, each with unigrams only and with unigrams plus bigrams.

    Parameters
    ----------
    base_model : scikit-learn estimator
        Template classifier. It is never fitted itself: every pipeline gets
        its own unfitted clone, so fitting one pipeline cannot overwrite the
        classifier inside another (a shared instance would be refitted in
        place by each ``Pipeline.fit`` call and end up matching only the
        last vectorizer).

    Returns
    -------
    list of (str, Pipeline)
        ``(name, pipeline)`` pairs; the classifier step is always named
        ``'model'`` and the vectorizer step ``'count_vectorize'``.
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    # (name fragment, ngram_range handed to CountVectorizer)
    ngram_options = (
        ('unigram', (1, 1)),
        ('bigram', (1, 2)),
    )
    # (name fragment, use_idf for TfidfTransformer; None = keep raw counts)
    weighting_options = (
        ('count', None),
        ('tf', False),
        ('tfidf', True),
    )

    pipelines = []
    for weighting_name, use_idf in weighting_options:
        for ngram_name, ngram_range in ngram_options:
            steps = [('count_vectorize', CountVectorizer(ngram_range=ngram_range))]
            if use_idf is not None:
                steps.append(('tf_vectorize', TfidfTransformer(use_idf=use_idf)))
            steps.append(('model', clone(base_model)))
            pipelines.append((f'{ngram_name}_{weighting_name}_vectorizer', Pipeline(steps)))
    return pipelines

In [27]:
# Base estimator handed to every candidate pipeline; swap in one of the
# commented alternatives to compare a different model family.
# base_model = MultinomialNB()
base_model = SGDClassifier()
# base_model = Perceptron()
# base_model = DecisionTreeClassifier()

# get the modeling pipelines
pipelines = get_pipelines(base_model)

# Fit each pipeline on the training split and keep the one with the highest
# accuracy on the test split (the first one wins on ties).
best_pipeline = None
# Start below any reachable accuracy so that even a pipeline scoring 0.0 is
# selected; starting at 0 would leave best_pipeline as None in that case and
# make the final unpacking fail.
max_acc = float('-inf')
for name, pipeline in pipelines:
    # train
    pipeline.fit(X_train, y_train)
    # predict
    predictions = pipeline.predict(X_test)
    # measure: fraction of test samples whose predicted label is correct
    test_accuracy = mean(predictions == y_test)
    if test_accuracy > max_acc:
        max_acc = test_accuracy
        best_pipeline = (name, pipeline)
    print(f"### {name} Test mean accuracy:\t{test_accuracy}")

print(f"\n>>> The best pipeline based on the above results is\n{best_pipeline}")

# Keep only the Pipeline object; the name has already been reported above.
best_pipeline = best_pipeline[1]

### unigram_count_vectorizer Test mean accuracy:	0.8096577590248476
### bigram_count_vectorizer Test mean accuracy:	0.8166901078293484
### unigram_tf_vectorizer Test mean accuracy:	0.8054383497421472
### bigram_tf_vectorizer Test mean accuracy:	0.8049695264885138
### unigram_tfidf_vectorizer Test mean accuracy:	0.802625410220347
### bigram_tfidf_vectorizer Test mean accuracy:	0.7984060009376465

>>> The best pipeline based on the above results is
('bigram_count_vectorizer', Pipeline(steps=[('count_vectorize', CountVectorizer(ngram_range=(1, 2))),
                ('model', SGDClassifier())]))


The best pipeline on the test split is the bigram count vectorizer (accuracy ≈ 0.817).

In [29]:
# confusion_matrix(y_test, predicted)

In [30]:
# Force a full garbage-collection pass (presumably to free memory before the
# grid search below); the cell output is the value returned by gc.collect().
import gc
gc.collect()

1225

# Parameter tuning using grid search


In [48]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

# Fixed seed for the SGD estimator so that the cross-validation scores, and
# therefore the selected hyperparameters, are reproducible between runs.
RANDOM_STATE = 42

# Pipeline under tuning: unigram counts feeding a linear SGD classifier.
sentiment_clf = Pipeline([
    ('count_vectorize', CountVectorizer(ngram_range=(1, 1))),
    ('model', SGDClassifier(random_state=RANDOM_STATE)),
])

# Search space; keys use the '<step name>__<parameter>' convention so that
# they are routed to the 'model' step of the pipeline.
parameters = {
    # 'model__alpha': (1e-2, 1e-3),
    'model__penalty': ['l1', 'l2', 'elasticnet'],
    # NOTE(review): scikit-learn 1.1 renamed the 'log' loss to 'log_loss' and
    # newer releases reject 'log' — switch the name when upgrading.
    'model__loss': ['hinge', 'log', 'perceptron'],
    # 'model__learning_rate': ['optimal', 'invscaling', 'adaptive']
}

# Exhaustive search over the grid with 5-fold cross-validation on all cores.
grid_search_cv = GridSearchCV(
    sentiment_clf,
    parameters,
    cv=5,
    n_jobs=-1
)

# Tune on the training split only; the test split stays untouched.
grid_search_cv.fit(X_train, y_train)
print(f'Best params: {grid_search_cv.best_params_}')
print(f'Best score: {grid_search_cv.best_score_}')


Best params: {'model__loss': 'log', 'model__penalty': 'l1'}
Best score: 0.8166697915142874


In [50]:
# Report the selected value of every tuned hyperparameter, sorted by name.
best_params = grid_search_cv.best_params_
for key in sorted(parameters):
    print("%s: %r" % (key, best_params[key]))

model__loss: 'log'
model__penalty: 'l1'


In [72]:
# Get vectorized features together with the vectorizer that produced them.
# NOTE: this rebinds y_train / y_test from the earlier prepare_features cell.
# NOTE(review): no drop_neutral argument is passed here, unlike the earlier
# prepare_features call — confirm both calls label and split the data the same way.
X_train_count, X_test_count, y_train, y_test, count_vect = SADataPreparation().vectorize_features(cleaned_df)

Pre-Processing the Tweets


## Modeling

In [73]:
class Modeling:
    """Train and evaluate an SGD sentiment classifier on vectorized features.

    Parameters
    ----------
    X_train, X_test
        Feature matrices that are already vectorized; they are passed to the
        classifier as they are.
    y_train, y_test
        Labels matching ``X_train`` / ``X_test``.
    vectorizor
        Object with a ``transform`` method; used only to vectorize raw input
        handed to :meth:`predict`.
    loss, penalty : optional
        ``SGDClassifier`` hyperparameters. When left as ``None`` they are
        read from the notebook-level ``grid_search_cv.best_params_``
        (keys ``'model__loss'`` / ``'model__penalty'``).
    """

    def __init__(self, X_train, X_test, y_train, y_test, vectorizor, loss=None, penalty=None):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.vectorizor = vectorizor
        self.loss = loss
        self.penalty = penalty
        # Fitted classifier; stays None until model() has run.
        self.trained_model = None

    def _ensure_trained(self):
        """Train the classifier on first use; later calls reuse the fitted model."""
        if self.trained_model is None:
            self.model()

    def model(self):
        """(Re)train the classifier on the training split and store it in ``trained_model``."""
        loss = self.loss
        if loss is None:
            loss = grid_search_cv.best_params_['model__loss']
        penalty = self.penalty
        if penalty is None:
            penalty = grid_search_cv.best_params_['model__penalty']

        classifier = SGDClassifier(loss=loss,
                                   learning_rate="optimal",
                                   penalty=penalty)
        classifier.fit(self.X_train, self.y_train)
        # Publish the estimator only once fitting has succeeded.
        self.trained_model = classifier

    def predict(self, x_test=None):
        """Predict labels for the stored test split or for raw input.

        Parameters
        ----------
        x_test : optional
            Raw (un-vectorized) input; it is passed through
            ``vectorizor.transform`` first. When ``None``, the stored
            ``X_test`` is used as it is.

        Returns
        -------
        The value returned by the classifier's ``predict``.
        """
        # Train once and reuse: retraining a stochastic model on every call
        # would make consecutive predictions come from different models.
        self._ensure_trained()
        if x_test is None:
            return self.trained_model.predict(self.X_test)
        return self.trained_model.predict(self.vectorizor.transform(x_test))

    def display_model_info(self):
        """Print the hyperparameters of the trained classifier."""
        self._ensure_trained()
        # SGDClassifier has no show_topics(); get_params() is the estimator's
        # own description of its configuration.
        print(self.trained_model.get_params())

    def score(self):
        """Print the classifier's score on the training and the test split."""
        self._ensure_trained()
        # score() expects (features, labels), not (predictions, labels).
        train_score = self.trained_model.score(self.X_train, self.y_train)
        test_score = self.trained_model.score(self.X_test, self.y_test)
        print("Train score: "+str(round(train_score, 2))+" ; Validation score: "+str(round(test_score, 2)))

    def precision_recall(self):
        """Print weighted-average precision and recall on the test split."""
        # Predict once so both metrics describe the same set of predictions.
        predictions = self.predict()
        precision = precision_score(self.y_test, predictions, average='weighted')
        recall = recall_score(self.y_test, predictions, average='weighted')
        print(f'The precision score is {precision} and the recall score is {recall}')

There are lots of missing values in the sensitivity column, so I will drop it.

In [75]:
# NOTE(review): prepro_pipe is built here but is not referenced by any other
# visible cell.
prepro_pipe = Pipeline([
        ('count_vectorize', CountVectorizer(ngram_range=(1,1)))
])
# Wrap the vectorized split and the vectorizer returned by vectorize_features
# in the Modeling helper.
model_output = Modeling(X_train_count, X_test_count, y_train, y_test, count_vect)
# Train the classifier on the training split.
model_output.model()
# Show the predicted labels for the first 100 test samples.
model_output.predict()[:100]

array([-1, -1,  1,  1,  0,  1,  1,  1,  0,  1,  1,  1,  0, -1,  0, -1,  1,
        1,  1,  1,  0, -1,  0,  1, -1,  0,  0,  1,  0,  1,  0,  0,  1,  0,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  1,
        0,  1,  0,  1,  1,  0, -1,  0,  0,  0,  0,  1,  0,  1,  1, -1, -1,
        1, -1,  0,  0,  1,  1,  0,  1, -1,  0,  1,  1,  1,  0,  1,  0,  1,
        0,  0,  1,  1, -1,  0,  1,  0,  0,  0,  1,  0,  0,  1,  0])

In [78]:
# Classify a raw sentence; predict() vectorizes explicit input with the
# stored vectorizer before handing it to the classifier.
model_output.predict(["I love this lunch"])

array([1])

In [79]:
# Print weighted-average precision and recall on the test split.
model_output.precision_recall()

The precision score is 0.8287983320099397 and the recall score is 0.829817158931083


In [None]:
# Rebuild the labeled frame, keep only the text and its label, and draw the
# share of each label as a pie chart.
labled_df = SADataPreparation().preprocess_data(cleaned_df, False)[['clean_text', 'score']]
score_counts = labled_df['score'].value_counts()
score_counts.plot.pie(figsize=(7,7), fontsize=14)

Next up: Read this blog and use some of its techniques to find the best parameters and get the best results.
blog link: https://towardsdatascience.com/building-a-sentiment-classifier-using-scikit-learn-54c8e7c5d2f0