### 1. Import Functions and Data

In [1]:

import nltk # Python library for NLP
from nltk.corpus import twitter_samples # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt # library for visualization
import random # pseudo-random number generator

import re # library for regular expression operations
import string # for string operations

from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings

import csv
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:

# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Save the tweets to a file
with open('positive_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_positive_tweets))

with open('negative_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_negative_tweets))


In [3]:
# Randomly select three positive tweets
random_positive_tweets = random.sample(all_positive_tweets, 3)

# Randomly select three negative tweets
random_negative_tweets = random.sample(all_negative_tweets, 3)

print("Randomly selected positive tweets:")
for tweet in random_positive_tweets:
    print(tweet)

print("\nRandomly selected negative tweets:")
for tweet in random_negative_tweets:
    print(tweet)

Randomly selected positive tweets:
Looking for fun? KIK - goictived70685 #kik #kikmeguys #xxx #tagsforlikes #webcam #trapmusic #hotmusicdelocos :) http://t.co/45Rc6ZgQ2Z
PICK ME PLEASE EMPLOYERS! :) http://t.co/AhbhLTjYzY
Would love to see you dear in #Jordan :) waiting you! @FIRDOZ  :) @VisitJordan @dannyprol

Randomly selected negative tweets:
♛♛♛
》》》》 
I LOVE YOU SO MUCH.
I BELİEVE THAT HE WİLL FOLLOW.
PLEASE FOLLOW ME PLEASE JUSTİN @justinbieber :( x15.327
》》》》ＳＥＥ ＭＥ
♛♛♛
Fries please? :( @JustMeAla
andaming memorization :(


In [4]:

print(len(all_positive_tweets),all_positive_tweets[0])
print(len(all_negative_tweets),all_negative_tweets[0])


5000 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
5000 hopeless for tmr :(


### 2. Preprocessing

In [5]:

def process_tweet(tweet):

  """Process tweet function.
  Input:
  tweet: a string containing a tweet
  Output:
  tweets_clean: a list of words containing the processed tweet
  """
  stemmer = PorterStemmer( )
  stopwords_english = stopwords.words('english')
  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)
  # remove old style retweet text "RT"
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # remove hyperlinks
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)
  # tokenize tweets
  tokenizer = TweetTokenizer(preserve_case=False,
  strip_handles=True, reduce_len=True)

  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_english and # remove stopwords
      word not in string.punctuation): # remove punctuation

      # tweets_clean.append(word)
      stem_word = stemmer.stem(word) # stemming word
      tweets_clean.append(stem_word)

  return tweets_clean

In [6]:

# Initializing lists to store processed positive and negative tweets
pro_pos_tw = []
pro_neg_tw = []

# Processing each tweet in the list of positive tweets
for tweet in all_positive_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_pos_tw.append(process_tweet(tweet))

# Processing each tweet in the list of negative tweets
for tweet in all_negative_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_neg_tw.append(process_tweet(tweet))

# Printing the number of processed positive tweets and an example of the first processed positive tweet
print("Number of processed positive tweets:", len(pro_pos_tw))
print("Example of a processed positive tweet:", pro_pos_tw[0])

# Printing the number of processed negative tweets and an example of the first processed negative tweet
print("Number of processed negative tweets:", len(pro_neg_tw))
print("Example of a processed negative tweet:", pro_neg_tw[0])


Number of processed positive tweets: 5000
Example of a processed positive tweet: ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
Number of processed negative tweets: 5000
Example of a processed negative tweet: ['hopeless', 'tmr', ':(']


In [7]:

# Shuffle positive and negative tweets
random.shuffle(pro_pos_tw)
random.shuffle(pro_neg_tw)

# Select 4000 random positive and negative tweets for training
train_pos_tw = pro_pos_tw[:4000]
train_neg_tw = pro_neg_tw[:4000]

# Select 1000 random positive and negative tweets for testing
test_pos_tw = pro_pos_tw[4000:]
test_neg_tw = pro_neg_tw[4000:]

# Combine training and testing tweets
train_tweets = train_pos_tw + train_neg_tw
test_tweets = test_pos_tw + test_neg_tw

# Create labels
train_labels = [1] * len(train_pos_tw) + [0] * len(train_neg_tw)
test_labels = [1] * len(test_pos_tw) + [0] * len(test_neg_tw)

# Checking sizes of training and testing sets
print("Training set size:", len(train_tweets))
print("Testing set size:", len(test_tweets))

# Checking distribution of labels in training and testing sets
from collections import Counter
print("Training set label distribution:", Counter(train_labels))
print("Testing set label distribution:", Counter(test_labels))


Training set size: 8000
Testing set size: 2000
Training set label distribution: Counter({1: 4000, 0: 4000})
Testing set label distribution: Counter({1: 1000, 0: 1000})


In [8]:
len(train_tweets),len(train_labels)

(8000, 8000)

In [9]:

print(train_tweets)
print(train_labels)


[['ghost', 'bae', ':d', 'love', 'tagsforlikesapp', 'instagood', 'smile', 'follow', 'cute', 'photooftheday', 'tbt', '…'], ['fnaf', '4', 'drop', '...', 'look', 'like', 'sleep', '4'], ['yeahhh', ':)'], ['never', 'listen', 'hill', 'everi', ':-)'], ['love', 'work', 'tokyo', ':)', 'kunoriforceo', 'ceo', '1month'], ['love', 'tell', 'everi', 'crowd', "they'r", 'loudest', 'loudest', 'fan', 'world', ':)', 'x'], ['ok', 'ok', ':d'], ['pleas', 'follow', 'love', 'much', 'would', 'mean', 'world', ':)'], ['follow'], ['hey', 'worri', 'pleas', 'share', 'wallet', 'detail', 'via', 'dm', 'get', 'sort', ':)'], ['hi', "here'", 'vid', 'stydia', 'take', 'look', ':)'], ['..', 'ye', 'sometim', 'pass', ':)'], ['thank', 'wish', ':)'], ['دعمم', 'للعودة', 'للحياة', 'heiyo', 'visit', 'websit', 'free', '50.000', 'coin', '8', 'ball', 'pool', 'thank', ':d'], ['done', ':)'], ['10', 'follow', 'part', 'annoy', 'twitter', 'group', 'side', 'pleas', ':)'], ['know', 'home-bas', 'offic', 'busi', 'feel', ':-)', 'piti', 'knee', '

In [10]:

for l,t in zip(train_labels,train_tweets):
  print(l,t)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 ['easi', 'clever', 'interest', 'audienc', 'welcom', ':)']
1 ['askurban', 'ask', 'us', 'question', ':)']
1 ['6pm', 'door', 'open', 'everyon', 'vip', 'gener', 'ticket', 'get', 'seat', 'earli', 'watch', 'support', 'act', ':)']
1 ['sure', "f'ing", ':)']
1 ['ok', '...', 'sere', '2', 'play', 'yah', 'min', '..', 'haha', ':)']
1 ['ah', 'hello', 'back', 'larri', ':)']
1 ['probabl', 'easier', 'meet', 'station', 'unless', 'particularli', 'want', 'show', 'hous', ':)']
1 ['thank', 'million', 'keep', 'eye', 'tweet', 'alway', 'appreci', ':)']
1 ['love', 'relationship', ':-)']
1 ['usa', 'rosh', 'travel', ':)', 'glasgow', 'intern', 'airport', 'gla']
1 ['fav', 'awak', 'fam', ':)']
1 ['oop', '...', "that'", 'call', 'fridayfauxpa', ':)', "i'll", 'get', 'chang', 'right', 'thank', 'g']
1 ['noth', 'like', 'littl', 'team', 'photo', 'thursday', 'smile', 'even', ':)']
1 ['long', 'feel', 'comfort', 'im', 'gonna', 'wear', 'want', 'mother', 'haha',

### 3. Extract Feature

In [11]:
def build_freqs_df(tweets, ys):
    """
    Build frequencies and return as DataFrame.
    Input:
    tweets: a list of tweets
    ys: an m x 1 array with the sentiment label of each tweet
    (either 0 or 1)
    Output:
    df: DataFrame with three columns: 'Word', 'posfreq', and 'negfreq'
    """
    freqs = {}

    for y, tweet in zip(ys, tweets):
        for word in tweet:
            if word not in freqs:
                freqs[word] = {'posfreq': 0, 'negfreq': 0}
            if y == 1:
                freqs[word]['posfreq'] += 1
            else:
                freqs[word]['negfreq'] += 1

    # Convert frequency dictionary to DataFrame
    df = pd.DataFrame(freqs.items(), columns=['Word', 'Frequency'])

    # Split 'Frequency' dictionary into two columns: 'posfreq' and 'negfreq'
    df[['posfreq', 'negfreq']] = pd.DataFrame(df['Frequency'].tolist(), index=df.index)

    # Drop 'Frequency' column
    df.drop(columns=['Frequency'], inplace=True)

    return df

In [12]:
freq = build_freqs_df(train_tweets+test_tweets,train_labels+test_labels)

In [13]:
print(len(freq),type(freq),freq)

10416 <class 'pandas.core.frame.DataFrame'>                   Word  posfreq  negfreq
0                ghost        3        1
1                  bae        6       11
2                   :d      629        0
3                 love      400      152
4      tagsforlikesapp        2        0
...                ...      ...      ...
10411         creation        0        1
10412            amtir        0        1
10413        melatonin        0        1
10414              2-4        0        1
10415            stomp        0        1

[10416 rows x 3 columns]


In [14]:
freq

Unnamed: 0,Word,posfreq,negfreq
0,ghost,3,1
1,bae,6,11
2,:d,629,0
3,love,400,152
4,tagsforlikesapp,2,0
...,...,...,...
10411,creation,0,1
10412,amtir,0,1
10413,melatonin,0,1
10414,2-4,0,1


In [15]:
def calculate_lambda(df):
    """
    Calculate lambda (λ) values for each row in the dataframe.

    Input:
    df: pandas DataFrame containing columns 'Positive_Count' and 'Negative_Count'

    Output:
    df: pandas DataFrame with additional columns 'p(w,pos)', 'p(w,neg)', and 'lambda'
    """

    # Calculate p(w,pos) and p(w,neg)
    df['p(w,pos)'] = (df['posfreq'] + 1) / ( len(df) + df['posfreq'].sum() )
    df['p(w,neg)'] = (df['negfreq'] + 1) / ( len(df) + df['negfreq'].sum() )

    # Calculate lambda (λ)
    df['lambda'] = df['p(w,pos)'] / df['p(w,neg)']

    return df

In [16]:
# Apply the function to the training data
l_data = calculate_lambda(freq)
print("Training Data with Lambda Values:")
print(l_data.info())

Training Data with Lambda Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Word      10416 non-null  object 
 1   posfreq   10416 non-null  int64  
 2   negfreq   10416 non-null  int64  
 3   p(w,pos)  10416 non-null  float64
 4   p(w,neg)  10416 non-null  float64
 5   lambda    10416 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 488.4+ KB
None


### 4. Train

In [17]:
def train_naive_bayes(train_x):
    '''
    Input:
        train_x: pandas DataFrame containing columns 'Word', 'Positive_Count', 'Negative_Count', 'p(w,pos)', 'p(w,neg)', and 'lambda'
    Output:
        logprior: the log prior.
        train_x: pandas DataFrame with the log likelihood column added
    '''
    # Fill missing lambda values with 0
    train_x['lambda'].fillna(0, inplace=True)
    train_x['lambda'].replace([np.inf, -np.inf], 0, inplace=True)

    # Calculate log likelihood for each word
    train_x['log_likelihood'] = np.log(train_x['lambda'])

    logprior = 0  # Not sure if this is intended for Naive Bayes, since usually there's no logprior in the training phase.

    return logprior, train_x

In [18]:
logprior, lldata = train_naive_bayes(l_data)
print(logprior)
print(len(lldata))

0
10416


In [19]:
print(lldata)

                  Word  posfreq  negfreq  p(w,pos)  p(w,neg)      lambda  \
0                ghost        3        1  0.000090  0.000045    1.995250   
1                  bae        6       11  0.000158  0.000272    0.581948   
2                   :d      629        0  0.014250  0.000023  628.503766   
3                 love      400      152  0.009070  0.003469    2.614690   
4      tagsforlikesapp        2        0  0.000068  0.000023    2.992875   
...                ...      ...      ...       ...       ...         ...   
10411         creation        0        1  0.000023  0.000045    0.498813   
10412            amtir        0        1  0.000023  0.000045    0.498813   
10413        melatonin        0        1  0.000023  0.000045    0.498813   
10414              2-4        0        1  0.000023  0.000045    0.498813   
10415            stomp        0        1  0.000023  0.000045    0.498813   

       log_likelihood  
0            0.690769  
1           -0.541374  
2            6.

### 5. Test

In [20]:
test_tweets

[['mayb', 'garru', 'liara', 'rest', 'crew', ':d'],
 ['life', 'smile', ':)'],
 ['bestfriend', 'nice', 'friend', ':)'],
 ['thank', 'kind', 'sir', ':)'],
 ['work', 'today', 'better', 'tomorrow', ':)'],
 ['fav', 'dm', ':)'],
 ['readi', 'ge', '2015kenya', ':-)'],
 ['thank', 'follow', ':)', 'soap', 'perfect', 'eczema', 'suffer'],
 ['count', ':p', '28', 'bnte', 'hain', ';p'],
 ['shop', 'bit', ':p'],
 ['pinter', ':-)'],
 ["i'm", 'give', 'cowork', 'four', 'minut', 'call', ':)'],
 ['love', 'new', 'song', 'delta', 'rock', ':)'],
 ['hi', 'greet', 'marseil', ':)', 'nice', 'day'],
 ['get',
  'tgv',
  'pari',
  'austerlitz',
  'bloi',
  '12',
  'mile',
  'away',
  'chateau',
  'de',
  'marai',
  'get',
  'taxi',
  ':)'],
 ["i'll", 'tri', 'write', 'someth', 'dead', 'peopl', 'today', ':)'],
 ['cat', 'destroy', 'back', 'heel', 'claw', ':-)'],
 ['love', 'miss', 'hammi', 'thank', ':)'],
 ["i'm", 'glad', 'like', 'servic', 'happi', 'shop', 'us', ':)', 'dv'],
 ['move',
  'forward',
  'happi',
  'spirit',
  '

In [21]:
def naive_bayes_predict(tweet_list, logprior, loglikelihood):
    '''
    Input:
        tweet_list: a list of lists of words (processed tweets)
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        predictions: a list of probabilities for each tweet in tweet_list
    '''

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in tweet_list:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p


In [29]:
# Saving into a CSV file
lldata.to_csv('loglikelihood.csv', index=False)  # Set index=False to exclude the index from the file

In [22]:
loglikelihood = lldata.set_index('Word')['log_likelihood'].to_dict()

In [23]:
loglikelihood

{'ghost': 0.6907693812807153,
 'bae': -0.5413743000119171,
 ':d': 6.443342020106348,
 'love': 0.9611457066349037,
 'tagsforlikesapp': 1.0962344893888796,
 'instagood': 1.0962344893888796,
 'smile': 1.7428616543139321,
 'follow': 0.3708887771495806,
 'cute': -0.4907305671931621,
 'photooftheday': 0.5084478244867606,
 'tbt': -0.002377799279229984,
 '…': 0.8279705027942005,
 'fnaf': 1.0962344893888796,
 '4': 0.27205904642253026,
 'drop': -0.0976879790835549,
 '...': -0.13763184521519892,
 'look': 0.3097553690367152,
 'like': 0.019221312524231815,
 'sleep': -0.6845959093069852,
 'yeahhh': -0.4078429073873944,
 ':)': 7.079050635542821,
 'never': -0.4519028971814248,
 'listen': -0.11360343438945437,
 'hill': 0.9139129325949251,
 'everi': 0.46024572266888314,
 ':-)': 6.538652199910674,
 'work': -0.19068739791780728,
 'tokyo': 0.40308730882893434,
 'kunoriforceo': 2.1948467780569896,
 'ceo': 2.4825288505087704,
 '1month': 2.4825288505087704,
 'tell': -0.04011812726207706,
 'crowd': 0.403087308

In [24]:
def evaluate(ypred, ytrue):
    '''
    Input:
        ypred: a list of predicted values
        ytrue: a list of true labels
    Output:
        predictions: a list of predicted labels (0 for negative, 1 for positive)
        accuracy: accuracy of the predictions
    '''
    # Initialize an empty list to store predicted labels
    predictions = []

    # Iterate over each predicted value in ypred
    for pred_value in ypred:
        # If the predicted value is greater than or equal to the threshold, classify as positive (1)
        if pred_value >= 0:
            predictions.append(1)
        # Otherwise, classify as negative (0)
        else:
            predictions.append(0)

    # Calculate accuracy
    correct_predictions = sum(1 for pred, true in zip(predictions, ytrue) if pred == true)
    total_predictions = len(ytrue)
    accuracy = correct_predictions / total_predictions

    return predictions, accuracy

In [25]:
# Initialize an empty list to store predictions
ypred = []

# Iterate over each tweet in the test_tweets list
for tweet in test_tweets:
    # Use the naive_bayes_predict function to get the prediction for the tweet
    prediction = naive_bayes_predict(tweet, logprior, loglikelihood)
    ypred.append(prediction)


result,acc = evaluate(ypred,test_labels)
print(acc)

0.9975


### 6. On Unit Test

In [27]:
# New tweets to be added
tweets = [
    "i am sad.",
    "feeling :(.",
    "i am happy.",
    ":) moment."
]

ytrue = [0,0,0,1,1]
ypred = []

process_tweets = []

# Process all tweets
for tweet in tweets:
    process_tweets.append(process_tweet(tweet))

for tw in process_tweets:
  # Pass the array to the predict function
  ypred.append(naive_bayes_predict(tw, logprior, loglikelihood))

result,acc = evaluate(ypred,ytrue)

# Display the predicted labels
print("\n Predicted LogLikehood :",ypred)
print("\n Predicted labels :",result)


 Predicted LogLikehood : [-3.030899895656212, -8.95807104342339, 2.0961119373713006, 6.933571992622917]

 Predicted labels : [0, 0, 1, 1]
