1. Naive Bayes:
   - Naive Bayes is based on Bayes' theorem, which calculates the probability of a class given some features using conditional probability.
   - The "naive" assumption is that all features are independent given the class label.
   - Formula:
     P(y|x_1, x_2, ..., x_n) = (P(y) * P(x_1|y) * P(x_2|y) * ... * P(x_n|y)) / (P(x_1) * P(x_2) * ... * P(x_n))

2. Log Likelihood:
   - Likelihood is the probability of the observed data given a model.
   - Log likelihood is the logarithm of the likelihood function.
   - It's often used in maximum likelihood estimation.
   - Formula depends on the model; for example, in linear regression:
     log L(θ|y) = -(n/2) * log(2πσ^2) - (1/(2σ^2)) * Σ(y_i - θx_i)^2


### 1. Import Functions and Data

In [None]:

import nltk # Python library for NLP
from nltk.corpus import twitter_samples # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt # library for visualization
import random # pseudo-random number generator

import re # library for regular expression operations
import string # for string operations

from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings

import csv
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Save the tweets to a file
with open('positive_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_positive_tweets))

with open('negative_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_negative_tweets))


In [None]:
# Randomly select three positive tweets
random_positive_tweets = random.sample(all_positive_tweets, 3)

# Randomly select three negative tweets
random_negative_tweets = random.sample(all_negative_tweets, 3)

print("Randomly selected positive tweets:")
for tweet in random_positive_tweets:
    print(tweet)

print("\nRandomly selected negative tweets:")
for tweet in random_negative_tweets:
    print(tweet)

Randomly selected positive tweets:
@lorainekateyumi  follow @jnlazts &amp; http://t.co/RCvcYYO0Iq follow u back :)
@_sunshinehoran_ happy birthday love :)
"@zaynmalik just had a dinner with my love @Real_Liam_Payne love you babe ! :) x"

Randomly selected negative tweets:
@NiaLovelis i miss you :(
pls follow me http://t.co/stdLTH1PBS
UGH :( I THOUGHT... @camerondallas http://t.co/KrrqH4aRbw
@alyaeldeeb12345 we're all in the same feelings :( http://t.co/lzd4XIo3aM


In [None]:

print(len(all_positive_tweets),all_positive_tweets[0])
print(len(all_negative_tweets),all_negative_tweets[0])


5000 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
5000 hopeless for tmr :(


### 2. Preprocessing

In [None]:

def process_tweet(tweet):

  """Process tweet function.
  Input:
  tweet: a string containing a tweet
  Output:
  tweets_clean: a list of words containing the processed tweet
  """
  stemmer = PorterStemmer( )
  stopwords_english = stopwords.words('english')
  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)
  # remove old style retweet text "RT"
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # remove hyperlinks
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)
  # tokenize tweets
  tokenizer = TweetTokenizer(preserve_case=False,
  strip_handles=True, reduce_len=True)

  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_english and # remove stopwords
      word not in string.punctuation): # remove punctuation

      # tweets_clean.append(word)
      stem_word = stemmer.stem(word) # stemming word
      tweets_clean.append(stem_word)

  return tweets_clean

In [None]:

# Initializing lists to store processed positive and negative tweets
pro_pos_tw = []
pro_neg_tw = []

# Processing each tweet in the list of positive tweets
for tweet in all_positive_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_pos_tw.append(process_tweet(tweet))

# Processing each tweet in the list of negative tweets
for tweet in all_negative_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_neg_tw.append(process_tweet(tweet))

# Printing the number of processed positive tweets and an example of the first processed positive tweet
print("Number of processed positive tweets:", len(pro_pos_tw))
print("Example of a processed positive tweet:", pro_pos_tw[0])

# Printing the number of processed negative tweets and an example of the first processed negative tweet
print("Number of processed negative tweets:", len(pro_neg_tw))
print("Example of a processed negative tweet:", pro_neg_tw[0])


Number of processed positive tweets: 5000
Example of a processed positive tweet: ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
Number of processed negative tweets: 5000
Example of a processed negative tweet: ['hopeless', 'tmr', ':(']


In [None]:

# Shuffle positive and negative tweets
random.shuffle(pro_pos_tw)
random.shuffle(pro_neg_tw)

# Select 4000 random positive and negative tweets for training
train_pos_tw = pro_pos_tw[:4000]
train_neg_tw = pro_neg_tw[:4000]

# Select 1000 random positive and negative tweets for testing
test_pos_tw = pro_pos_tw[4000:]
test_neg_tw = pro_neg_tw[4000:]

# Combine training and testing tweets
train_tweets = train_pos_tw + train_neg_tw
test_tweets = test_pos_tw + test_neg_tw

# Create labels
train_labels = [1] * len(train_pos_tw) + [0] * len(train_neg_tw)
test_labels = [1] * len(test_pos_tw) + [0] * len(test_neg_tw)

# Checking sizes of training and testing sets
print("Training set size:", len(train_tweets))
print("Testing set size:", len(test_tweets))

# Checking distribution of labels in training and testing sets
from collections import Counter
print("Training set label distribution:", Counter(train_labels))
print("Testing set label distribution:", Counter(test_labels))


Training set size: 8000
Testing set size: 2000
Training set label distribution: Counter({1: 4000, 0: 4000})
Testing set label distribution: Counter({1: 1000, 0: 1000})


In [None]:
len(train_tweets),len(train_labels)

(8000, 8000)

In [None]:

print(train_tweets[10])
print(train_labels[10])


['hello', ':)', 'get', 'youth', 'job', 'opportun', 'follow']
1


In [None]:
count=0
for l,t in zip(train_labels,train_tweets):
  print(l,t)

  count += 1
  if(count>10):
    break

1 ['nice', 'dave', ':d']
1 ['okay', 'son', '4:13', 'cant', 'sleep', 'bc', 'insomnia', 'forgot', 'take', 'sleep', 'medic', ':)']
1 ['stat', 'week', 'arriv', '1', 'new', 'follow', 'unfollow', ':)', 'via']
1 [':)']
1 ['see', 'saturday', ':p', "i'll", 'see', 'stormi', ':d']
1 ['heeeyyy', 'follow', 'fan', 'account', 'thank', ':)']
1 ['fun', ':p']
1 ['follow']
1 ['snapchat', 'sexyjudi', '19', 'snapchat', 'kikmeboy', 'tagsforlik', 'pussi', 'gay', 'indiemus', 'sexo', ':)']
1 ['thank', 'mom', ':)']
1 ['hello', ':)', 'get', 'youth', 'job', 'opportun', 'follow']


### 3. Extract Feature

In [None]:
def build_freqs_df(tweets, ys):
    """
    Build frequencies and return as DataFrame.
    Input:
    tweets: a list of tweets
    ys: an m x 1 array with the sentiment label of each tweet
    (either 0 or 1)
    Output:
    df: DataFrame with three columns: 'Word', 'posfreq', and 'negfreq'
    """
    freqs = {}

    for y, tweet in zip(ys, tweets):
        for word in tweet:
            if word not in freqs:
                freqs[word] = {'posfreq': 0, 'negfreq': 0}
            if y == 1:
                freqs[word]['posfreq'] += 1
            else:
                freqs[word]['negfreq'] += 1

    # Convert frequency dictionary to DataFrame
    df = pd.DataFrame(freqs.items(), columns=['Word', 'Frequency'])

    # Split 'Frequency' dictionary into two columns: 'posfreq' and 'negfreq'
    df[['posfreq', 'negfreq']] = pd.DataFrame(df['Frequency'].tolist(), index=df.index)

    # Drop 'Frequency' column
    df.drop(columns=['Frequency'], inplace=True)

    return df

In [None]:
freq = build_freqs_df(train_tweets+test_tweets,train_labels+test_labels)

In [None]:
print(len(freq),type(freq),freq)

10416 <class 'pandas.core.frame.DataFrame'>            Word  posfreq  negfreq
0          nice       98       19
1          dave        5        0
2            :d      629        0
3          okay       39       38
4           son        4        1
...         ...      ...      ...
10411     bench        0        1
10412   analyst        0        1
10413   expedia        0        1
10414   bellevu        0        1
10415  hard-wir        0        1

[10416 rows x 3 columns]


In [None]:
freq

Unnamed: 0,Word,posfreq,negfreq
0,nice,98,19
1,dave,5,0
2,:d,629,0
3,okay,39,38
4,son,4,1
...,...,...,...
10411,bench,0,1
10412,analyst,0,1
10413,expedia,0,1
10414,bellevu,0,1


In [None]:
def calculate_lambda(df):
    """
    Calculate lambda (λ) values for each row in the dataframe.

    Input:
    df: pandas DataFrame containing columns 'Positive_Count' and 'Negative_Count'

    Output:
    df: pandas DataFrame with additional columns 'p(w,pos)', 'p(w,neg)', and 'lambda'
    """

    # Calculate p(w,pos) and p(w,neg)
    df['p(w,pos)'] = (df['posfreq'] + 1) / ( len(df) + df['posfreq'].sum() )
    df['p(w,neg)'] = (df['negfreq'] + 1) / ( len(df) + df['negfreq'].sum() )

    # Calculate lambda (λ)
    df['lambda'] = df['p(w,pos)'] / df['p(w,neg)']

    return df

In [None]:
# Apply the function to the training data
l_data = calculate_lambda(freq)
print("Training Data with Lambda Values:")
print(l_data.info())

Training Data with Lambda Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Word      10416 non-null  object 
 1   posfreq   10416 non-null  int64  
 2   negfreq   10416 non-null  int64  
 3   p(w,pos)  10416 non-null  float64
 4   p(w,neg)  10416 non-null  float64
 5   lambda    10416 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 488.4+ KB
None


### 4. Train

In [None]:
def train_naive_bayes(train_x):
    '''
    Input:
        train_x: pandas DataFrame containing columns 'Word', 'Positive_Count', 'Negative_Count', 'p(w,pos)', 'p(w,neg)', and 'lambda'
    Output:
        logprior: the log prior.
        train_x: pandas DataFrame with the log likelihood column added
    '''
    # Fill missing lambda values with 0
    train_x['lambda'].fillna(0, inplace=True)
    train_x['lambda'].replace([np.inf, -np.inf], 0, inplace=True)

    # Calculate log likelihood for each word
    train_x['log_likelihood'] = np.log(train_x['lambda'])

    logprior = 0  # Not sure if this is intended for Naive Bayes, since usually there's no logprior in the training phase.

    return logprior, train_x

In [None]:
logprior, lldata = train_naive_bayes(l_data)
print(logprior)
print(len(lldata))

0
10416


In [None]:
print(lldata)

           Word  posfreq  negfreq  p(w,pos)  p(w,neg)      lambda  \
0          nice       98       19  0.002239  0.000453    4.938244   
1          dave        5        0  0.000136  0.000023    5.985750   
2            :d      629        0  0.014250  0.000023  628.503766   
3          okay       39       38  0.000905  0.000884    1.023205   
4           son        4        1  0.000113  0.000045    2.494063   
...         ...      ...      ...       ...       ...         ...   
10411     bench        0        1  0.000023  0.000045    0.498813   
10412   analyst        0        1  0.000023  0.000045    0.498813   
10413   expedia        0        1  0.000023  0.000045    0.498813   
10414   bellevu        0        1  0.000023  0.000045    0.498813   
10415  hard-wir        0        1  0.000023  0.000045    0.498813   

       log_likelihood  
0            1.597010  
1            1.789382  
2            6.443342  
3            0.022940  
4            0.913913  
...               ...  
104

### 5. Test

In [None]:
test_tweets[:10]

[['dear',
  'person',
  'pleas',
  'studi',
  'embarrass',
  'urself',
  'entropi',
  'work',
  '100',
  'w',
  'evolut',
  ':)'],
 ['u',
  'cant',
  'chang',
  'peopl',
  'feel',
  'u',
  'dnt',
  'tri',
  'live',
  'ur',
  'life',
  'happi',
  ':)'],
 ['stat', 'week', 'arriv', '1', 'new', 'follow', 'unfollow', ':)', 'via'],
 ['mom', ':)', 'horror', 'movi'],
 ['oley', ':d'],
 ['contestkiduniya', 'hope', 'win', ':)'],
 ['jummah', 'mubarak', 'rememb', 'prayr', ':)'],
 ['done', 'yein', ':)'],
 ["i'm", 'glad', ':d', 'cri', '__'],
 ['need', 'find', 'boy', 'love', 'firebal', 'much', ':)']]

In [None]:
def naive_bayes_predict(tweet_list, logprior, loglikelihood):
    '''
    Input:
        tweet_list: a list of lists of words (processed tweets)
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        predictions: a list of probabilities for each tweet in tweet_list
    '''

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in tweet_list:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p


In [None]:
# Saving into a CSV file
lldata.to_csv('loglikelihood.csv', index=False)  # Set index=False to exclude the index from the file

In [None]:
loglikelihood = lldata.set_index('Word')['log_likelihood'].to_dict()

In [None]:
count = 0
for key, value in loglikelihood.items():
    if count < 10:
        print(key, ':', value)
        count += 1
    else:
        break

nice : 1.597009777301369
dave : 1.789381669948825
:d : 6.443342020106348
okay : 0.022940008705059904
son : 0.9139129325949251
4:13 : 0.6907693812807153
cant : -1.1009900879473398
sleep : -0.6845959093069852
bc : -1.63161833900951
insomnia : 0.6907693812807153


In [None]:
def evaluate(ypred, ytrue):
    '''
    Input:
        ypred: a list of predicted values
        ytrue: a list of true labels
    Output:
        predictions: a list of predicted labels (0 for negative, 1 for positive)
        accuracy: accuracy of the predictions
    '''
    # Initialize an empty list to store predicted labels
    predictions = []

    # Iterate over each predicted value in ypred
    for pred_value in ypred:
        # If the predicted value is greater than or equal to the threshold, classify as positive (1)
        if pred_value >= 0:
            predictions.append(1)
        # Otherwise, classify as negative (0)
        else:
            predictions.append(0)

    # Calculate accuracy
    correct_predictions = sum(1 for pred, true in zip(predictions, ytrue) if pred == true)
    total_predictions = len(ytrue)
    accuracy = correct_predictions / total_predictions

    return predictions, accuracy

In [None]:
# Initialize an empty list to store predictions
ypred = []

# Iterate over each tweet in the test_tweets list
for tweet in test_tweets:
    # Use the naive_bayes_predict function to get the prediction for the tweet
    prediction = naive_bayes_predict(tweet, logprior, loglikelihood)
    ypred.append(prediction)


result,acc = evaluate(ypred,test_labels)
print(acc)

0.997


### 6. On Unit Test

In [None]:
# New tweets to be added
tweets = [
    "i am sad.",
    "feeling :(.",
    "i am happy.",
    ":) moment."
]

ytrue = [0,0,0,1,1]
ypred = []

process_tweets = []

# Process all tweets
for tweet in tweets:
    process_tweets.append(process_tweet(tweet))

for tw in process_tweets:
  # Pass the array to the predict function
  ypred.append(naive_bayes_predict(tw, logprior, loglikelihood))

result,acc = evaluate(ypred,ytrue)

# Display the predicted labels
print("\n Predicted LogLikehood :",ypred)
print("\n Predicted labels :",result)


 Predicted LogLikehood : [-3.030899895656212, -8.95807104342339, 2.0961119373713006, 6.933571992622917]

 Predicted labels : [0, 0, 1, 1]
