In [None]:
# Write your names in this cell
Student_1 = "Malek Essam Mahmoud"
Student_2 = "Mohamed Tarek Abdelmohsen"

# Sentiment Analysis
In this requirement, you will implement a sentiment analyser using Twitter data. We will do it using two classifiers: Logistic Regression and Naive Bayes. Your goal is to learn how to extract features from tweets and how to use sklearn to train and test your classifiers.
Let's get started:

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from utils import process_tweet, build_freqs

In [None]:
# Download the twitter_samples corpus (the dataset) through nltk
nltk.download('twitter_samples')

# Stop words are common words that we don't want to include in our features
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Prepare the train and test sets

In [None]:
# Load the raw positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# The first 4000 tweets of each class are used for training; everything after
# that is held out for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# In both splits the positive tweets come first, followed by the negative ones
train_x = train_pos + train_neg
test_x = test_pos + test_neg

print(len(train_x), len(test_x), sep='\n')

8000
2000


In [None]:
# Build the label column vectors: a 1 for every positive tweet followed by
# a 0 for every negative tweet
train_y = np.concatenate((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)

# Print the shapes of the train and test label arrays
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


# Utility Functions
You are given a utils.py file that contains two functions.
The first one takes a tweet and preprocesses it by cleaning, tokenizing and stemming it.
The second one builds a dictionary whose keys are (word, label) tuples and whose values are the number of times each tuple occurs in the dataset.

It is preferred that you open this file and understand these functions as we will use them next.

In [None]:
# Build the (word, label) -> count dictionary from the training tweets and labels
freqs = build_freqs(train_x, train_y)

# Check the output
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 11337


In [None]:
# Show the first training tweet next to the output of process_tweet for it
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


# Requirement 1: Logistic Regression
To predict the sentiment using logistic regression, we need a way to transform each tweet into numeric features so that we can perform the matrix multiplication of logistic regression.

## Feature Extraction
We will extract two features from the tweets:
1. The first feature is the number of positive words in a tweet
2. The second feature is the number of negative words in a tweet

### extract_features function
This function takes a tweet and preprocesses it to get its words; it should then use the freqs dictionary to calculate the positive feature and the negative feature. For example, if a word is positive and its count in the freqs dictionary (the value stored for (word, 1)) is 50, then the tweet's positive feature should be increased by 50. If a word doesn't exist in the freqs dictionary, then you can consider its count to be zero.

In [None]:
def extract_features(tweet, freqs):
    '''
    Build the two-element sentiment feature vector of a single tweet.

    Input:
        tweet: the text of a tweet
        freqs: a dictionary mapping each tuple (word, label) to its frequency
    Output:
        x: a numpy array of shape (1, 2); x[0, 0] is the sum of the
           (word, 1) frequencies and x[0, 1] the sum of the (word, 0)
           frequencies over the processed words of the tweet
    '''
    # One row, two columns: [positive total, negative total]
    x = np.zeros((1, 2))

    # process_tweet turns the raw text into the list of words to look up
    for token in process_tweet(tweet):
        # A (word, label) pair that is absent from freqs counts as zero
        x[0, 0] += freqs.get((token, 1), 0)
        x[0, 1] += freqs.get((token, 0), 0)

    assert x.shape == (1, 2)
    return x

In [None]:
# Test your function: the features of the first training tweet must equal
# the known reference values.
tmp1 = extract_features(train_x[0], freqs)

# Compare element-wise (and by shape). Summing the differences instead would
# let a +k error in one feature cancel a -k error in the other and still pass.
assert np.array_equal(tmp1, np.array([[3020, 61]])), "Feature Extraction Error"

In [None]:
def input_tweets_to_features(tweets, freqs):
    """
    Stack the feature vectors of a list of tweets into a single matrix.

    Input:
    - tweets: list of strings (tweets)
    - freqs: a dictionary mapping each tuple (word, label) to its frequency

    Returns:
    - X: numpy array of shape (len(tweets), 2); row i holds the features
         extracted from tweets[i]
    """
    X = np.zeros((len(tweets), 2))

    # Each call to extract_features fills one row of X
    for row, tweet in enumerate(tweets):
        X[row] = extract_features(tweet, freqs)

    return X

In [None]:
# Feature matrix and labels of the training set
X = input_tweets_to_features(train_x, freqs)
Y = train_y

print(X.shape, Y.shape, sep='\n')

(8000, 2)
(8000, 1)


## Logistic Regression
In this part, we will use sklearn's LogisticRegression class to train and test our logistic regression model.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def train_lr(X, Y):
    """
    Fit a logistic regression classifier on the given data.

    Inputs:
    - X: training data
    - Y: labels

    Returns:
    - the fitted LogisticRegression model
    """
    # random_state is pinned so that repeated runs give consistent results.
    # fit() learns the parameters from (X, Y) and returns the fitted estimator.
    return LogisticRegression(random_state=5).fit(X, Y)

In [None]:
# Train on the feature matrix; the labels are flattened from (n, 1) to (n,)
lr = train_lr(X, Y.reshape(len(Y),))

# Compare against the reference parameters using absolute, element-wise
# differences. A signed difference (or a sum of signed differences) would
# accept any value that is too small and would let opposite errors cancel.
assert np.allclose(lr.coef_, np.array([[0.00903432, -0.01027023]]), rtol=0, atol=1e-6), "Training Error"
assert abs(lr.intercept_[0] - 0.99980174) < 1e-7, "Training Error"

In [None]:
def predict(clf, X):
    """
    Predict the labels of the input features with a trained classifier.

    Inputs:
    - clf: classifier trained by sklearn
    - X: input matrix of shape (TweetsDataCount, NFeatures)

    Returns:
    - Y_pred: prediction matrix of shape(TweetsDataCount,)
    """
    # Delegate to the classifier's own predict method
    return clf.predict(X)

In [None]:
# Feature matrix and labels of the test set (using the training frequencies)
X_test = input_tweets_to_features(test_x, freqs)
Y_test = test_y

print(X_test.shape, Y_test.shape, sep='\n')

(2000, 2)
(2000, 1)


In [None]:
# Predict the test labels with the trained logistic regression model
Y_pred = predict(lr, X_test)
print(Y_pred.shape)

(2000,)


## Let's test our model

In [None]:
# Per-class precision / recall / f1 of the logistic regression predictions
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1000
         1.0       0.99      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



# Requirement 2: Naive Bayes
As you know, Naive Bayes is based on word frequencies. To train a Naive Bayes classifier, we need to do the following:

## 1. Bag of Words:
We need to represent each tweet with a vector of size V, where V is the size of the vocabulary and each entry holds the count of the word at that index. For this, we will use sklearn's CountVectorizer.

## 2. Train Naive Bayes:
Training the Multinomial Naive Bayes needs the matrix that represents the documents as bag of words.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def preprocess_for_CountVectorizer(corpus):
    """
    Turn raw documents into the whitespace-joined token strings that
    CountVectorizer expects.

    Inputs:
    - corpus: List of strings

    Returns:
    - processed_corpus: List of strings, one per input document, made of the
      tokens returned by process_tweet joined with single spaces
    """
    # process_tweet returns a list of words; CountVectorizer needs one string
    # per document, so the words are glued back together with spaces.
    return [' '.join(process_tweet(document)) for document in corpus]

In [None]:
# Convert both splits into whitespace-joined token strings
NB_processed_train_x = preprocess_for_CountVectorizer(train_x)
NB_processed_test_x = preprocess_for_CountVectorizer(test_x)

# Preprocessing must keep one output string per input tweet
assert len(train_x) == len(NB_processed_train_x), "Processing Error"
assert len(test_x) == len(NB_processed_test_x), "Processing Error"
# Spot-check the first training tweet against its expected processed form
assert NB_processed_train_x[0] == 'followfriday top engag member commun week :)', "Processing Error"

In [None]:
def train_count_vectorizer(processed_train_corpus):
    """
    Create a CountVectorizer and fit it on the processed training corpus.

    Inputs:
    - processed_train_corpus: list of tweets

    Returns:
    - vectorizer: CountVectorizer Object after training
    """
    # The custom token_pattern treats every run of non-whitespace characters
    # as a token, because the default pattern would miss some tokens.
    # fit() returns the fitted vectorizer itself.
    return CountVectorizer(token_pattern=r'([\S]+)').fit(processed_train_corpus)

In [None]:
# Fit the vectorizer on the training corpus only, then encode both splits with it
BoWVectorizer = train_count_vectorizer(NB_processed_train_x)
train_x_BOW = BoWVectorizer.transform(NB_processed_train_x)
test_x_BOW = BoWVectorizer.transform(NB_processed_test_x)
# The vocabulary size fixes the number of columns of both bag-of-words matrices
assert len(BoWVectorizer.vocabulary_) == 9083, "Count Vectorizer Error"
assert train_x_BOW.shape == (8000, 9083), "Count Vectorizer Error"
assert test_x_BOW.shape == (2000, 9083), "Count Vectorizer Error"

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
def train_NB(X, Y):
    """
    Fit a Multinomial Naive Bayes classifier on a bag-of-words matrix.

    Inputs:
    - X: Document frequency matrix
    - Y: the labels
    Returns:
    - the fitted MultinomialNB Classifier
    """
    # alpha = 1 selects add-one smoothing; fit() returns the fitted classifier
    return MultinomialNB(alpha=1).fit(X, Y)

In [None]:
# Train on the bag-of-words matrix; the labels are flattened from (n, 1) to (n,)
NB = train_NB(train_x_BOW, train_y.reshape(len(train_y),))

assert NB.classes_[1] == 1, "NB Error"
# Use the absolute difference: a signed difference would accept any log prior
# that is smaller than the reference value.
assert abs(NB.class_log_prior_[1] - -0.69314718) < 1e-8, "NB Error"
assert NB.class_count_[1] == 4000, "NB Error"

In [None]:
# Predict the test labels with the trained Naive Bayes model.
# Note: this rebinds Y_pred, replacing the logistic regression predictions.
Y_pred = predict(NB, test_x_BOW)
print(Y_pred.shape)

(2000,)


# Let's test the NB model

In [None]:
# Per-class precision / recall / f1 of the Naive Bayes predictions
# (classification_report is also imported in the logistic regression evaluation cell)
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      1000
         1.0       1.00      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

