# Implement sentiment analysis using logistic regression

   <div style="margin-left: 20px; margin-top:10px">
       The following steps implement sentiment analysis with logistic regression, using the Natural 
        Language Toolkit (NLTK) for text processing and its 'twitter_samples' corpus as the dataset.
            <ul>
                <li>Import Necessary Libraries;</li>
                <li>Prepare the Data;</li>
                <li>Utils implementation;</li>
                <li>Split the Data into Training and Test Sets;</li>
                <li>Model implementation;</li>
            </ul>
   </div>

## Step 1: Import Necessary Libraries.

In [1]:
import nltk  # natural language toolkit
from nltk.corpus import twitter_samples  # contains the twitter dataset
from nltk.corpus import stopwords        # stop-word lists for different languages
from nltk.stem import PorterStemmer      # word stemming
from nltk.tokenize import TweetTokenizer  # Tokenizing
import numpy as np
import string
import re     # regular expressions

from colorama import Fore   # coloring

! pip install colorama  





## Step 2 : Prepare the Data.

In [2]:
# Load the raw tweet texts of each sentiment class from the NLTK corpus.
all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

# Preview one tweet per class: the color code comes first, then the text.
# The negative sample keeps its two-space gap after the color code.
print(f"{Fore.GREEN} {all_positive_tweets[0]}")
print(f"{Fore.RED}  {all_negative_tweets[10]}")

[32m #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
[31m  I have a really good m&amp;g idea but I'm never going to meet them :(((


## Step 3: Utils implementation.    
   <div style="margin-left: 20px; margin-top:10px">
        The following are the functions that help with model training and evaluation.
            <ul>
                <li>Process Tweet;</li>
                <li>Word Frequencies;</li>
                <li>Sigmoid Function;</li>
                <li>Instances of feature extraction;</li>
                <li>Gradient Descent;</li>
                <li>Predict Tweet;</li>
            </ul>
    </div>

### Step 3.1: Process Tweet.

In [3]:
# Implement process_tweet(tweet)
english_stopwords = stopwords.words("english")


def process_tweet(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    The tweet goes through three stages:
      1. Strip @handles, URLs and whole hashtags (the '#' together with the
         word attached to it), then collapse runs of whitespace.
      2. Tokenize with TweetTokenizer (preserve_case=False lowercases the
         text; reduce_len and strip_handles are enabled as well).
      3. Drop English stopwords and punctuation, and stem what is left
         with the Porter stemmer.

    Returns a list of stemmed token strings in their original order.
    """
    # Stage 1: remove noise, in the order handles -> URLs -> hashtags.
    for pattern in (r'@\w+', r'https?://\S+', r'#\w+'):
        tweet = re.sub(pattern, '', tweet)
    # The removals can leave gaps behind; squeeze them to single spaces.
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    # Stage 2: split the cleaned text into tokens.
    tokens = TweetTokenizer(
        preserve_case=False, reduce_len=True, strip_handles=True
    ).tokenize(tweet)

    # Stage 3: filter and stem.  string.punctuation is a str, so the
    # membership test is a substring check against it.
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in english_stopwords and token not in string.punctuation
    ]

### Step 3.2: Word Frequencies.

In [4]:
# Implement build freq counter

def build_freqs(tweets, labels):
    """Count how often each processed word occurs under each label.

    Args:
        tweets: iterable of raw tweet strings.
        labels: one label per tweet, as a NumPy array (for example of
            shape (m,) or (m, 1)) or a plain sequence.

    Returns:
        dict mapping (word, label) -> number of occurrences of ``word``
        (after ``process_tweet``) in tweets carrying ``label``.
    """
    # np.ravel always yields a 1-D array, so tolist() is always a list.
    # squeeze() would collapse a single-label array to 0-d, whose tolist()
    # is a bare scalar that zip() cannot iterate.
    y_labels = np.ravel(labels).tolist()

    freq_dict = {}
    for tweet, label in zip(tweets, y_labels):
        for word in process_tweet(tweet):
            pair = (word, label)
            freq_dict[pair] = freq_dict.get(pair, 0) + 1

    return freq_dict

### Step 3.3: Sigmoid Function.

In [5]:
#Implement a sigmoid function 

def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of z.

    The expression is built from np.exp, so z may be a scalar or a NumPy
    array; for an array the result has the same shape.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator


### Step 3.4: Instances of feature extraction.

In [7]:
# Implement a feature extraction

def extract_feature(tweet, freqs, process_tweet=process_tweet):
    """Turn one tweet into a 3-element feature vector.

    Args:
        tweet: raw tweet text.
        freqs: dict mapping (word, label) -> count; labels 1.0 and 0.0
            are the ones looked up here.
        process_tweet: callable that turns the tweet into its tokens.

    Returns:
        NumPy array [1, pos, neg]: a constant bias term, then the summed
        counts of the tweet's tokens under label 1.0 and under label 0.0.
        Tokens missing from ``freqs`` contribute 0.
    """
    features = np.array([1.0, 0.0, 0.0])  # slot 0 is the bias term

    for token in process_tweet(tweet):
        # Column 1 accumulates label-1.0 counts, column 2 label-0.0 counts.
        for column, label in ((1, 1.0), (2, 0.0)):
            features[column] += freqs.get((token, label), 0)

    return features

# extract_feature(tweet_1  , freqs)

# Implement a feature extractions

def extract_features(tweets, freqs, extract_feature=extract_feature):
    """Build the feature matrix for a collection of tweets.

    Args:
        tweets: sequence of raw tweet strings.
        freqs: (word, label) -> count dictionary passed to the extractor.
        extract_feature: callable producing the 3 features of one tweet.

    Returns:
        NumPy array of shape (len(tweets), 3), one row per tweet.
    """
    feature_matrix = np.zeros((len(tweets), 3))

    for row, tweet in enumerate(tweets):
        feature_matrix[row, :] = extract_feature(tweet, freqs)

    return feature_matrix



### Step 3.5: Gradient Descent.

In [8]:
#Gradient Descent 
def gradientDescent(X, Y, weights, learning_rate, num_iterations=100):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        X: feature matrix with one row per training example.
        Y: labels, one row per example (e.g. shape (m, 1)).
        weights: initial weights (e.g. shape (n_features, 1)).
        learning_rate: step size applied to the averaged gradient.
        num_iterations: number of full passes over the training set.

    Returns:
        (weights, cost): the weights after the last update, and the mean
        cross-entropy cost measured at the start of the last iteration
        (i.e. before that iteration's update). ``cost`` is None when
        ``num_iterations`` is 0, in which case the weights are returned
        unchanged.
    """
    m = len(X)
    epsilon = 1e-9  # keeps np.log away from log(0)
    cost = None     # stays None if the loop body never runs

    for _ in range(num_iterations):
        # Forward pass: predicted probability for every example.
        A = sigmoid(np.dot(X, weights))

        # Mean cross-entropy over the whole training set.
        cost = -1 / m * (
            np.dot(Y.T, np.log(A + epsilon))
            + np.dot((1 - Y).T, np.log(1 - A + epsilon))
        )

        # Average the gradient over the m examples so that it is the
        # gradient of the mean cost above; without the 1/m factor the
        # effective step size would grow with the training-set size.
        dw = np.dot(X.T, A - Y) / m

        weights = weights - learning_rate * dw

    return weights, cost


### Step 3.6: Predict Tweet.

In [9]:
### Predict a tweet 

def predict_tweet(tweet, freqs, weights):
    """Classify one tweet with trained logistic-regression weights.

    Args:
        tweet: raw tweet text.
        freqs: (word, label) -> count dictionary used for the features.
        weights: trained weights; the sigmoid output is indexed with [0],
            so the product with the feature vector must be indexable
            (e.g. weights of shape (3, 1)).

    Returns:
        1.0 when the predicted probability is at least 0.5, else 0.0.
    """
    features = extract_feature(tweet, freqs)
    probability = sigmoid(np.dot(features, weights))[0]
    return 1.0 if probability >= 0.5 else 0.0

## Step 4 : Split the Data into Training and Test Sets.

In [10]:
# Split each class at index 4000: the first 4000 tweets train the model and
# the remainder is held out for testing (an 80% / 20% split if each class
# holds 5000 tweets -- TODO confirm the corpus size).
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Inputs: positive tweets first, then negative ones.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels as column vectors in the same order: 1 for positive, 0 for negative.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0
)


## Step 5 : Model implementation.

In [None]:
def model(X_train, Y_train, X_test, Y_test, freqs, learning_rate, num_iteration=100):
    """Train the logistic-regression classifier and score it on test tweets.

    Args:
        X_train: sequence of raw training tweets.
        Y_train: training labels, passed unchanged to gradientDescent.
        X_test: iterable of raw test tweets.
        Y_test: test labels, one per test tweet (NumPy array of any shape
            that flattens to one label per tweet, or a plain sequence).
        freqs: (word, label) -> count dictionary used for the features.
        learning_rate: gradient-descent step size.
        num_iteration: number of gradient-descent iterations.

    Returns:
        dict with keys "costs" (cost returned by gradientDescent),
        "Accuracy_test" (percentage of test tweets predicted correctly),
        "weights", "learning_rate" and "num_iterations".

    Raises:
        ZeroDivisionError: if X_test is empty (accuracy is undefined).
    """
    # Step 1: feature extraction
    X_train = extract_features(X_train, freqs)

    # Step 2: gradient descent, starting from all-zero weights
    weights = np.zeros((3, 1))
    weights, cost = gradientDescent(X_train, Y_train, weights, learning_rate, num_iteration)

    # Step 3: predict every test tweet
    y_hat = [predict_tweet(tweet, freqs, weights) for tweet in X_test]

    # np.ravel keeps a single test label as a one-element list; squeeze()
    # would collapse it to a bare scalar that cannot be indexed.
    y_true = np.ravel(Y_test).tolist()

    # Step 4: accuracy (in percent) over the test examples
    correct = sum(1 for i, predicted in enumerate(y_hat) if predicted == y_true[i])
    acc_test = (correct / len(y_hat)) * 100

    return {
        "costs": cost,
        "Accuracy_test": acc_test,
        "weights": weights,
        "learning_rate": learning_rate,
        "num_iterations": num_iteration,
    }

In [None]:
# Build the (word, label) frequency table from the training tweets only.
freqs = build_freqs(train_x, train_y)

# Train on the training split and evaluate on the held-out tweets.
model_eval = model(
    train_x,
    train_y,
    test_x,
    test_y,
    freqs,
    learning_rate=1e-9,
    num_iteration=1500,
)

In [None]:
# Show the result dictionary returned by model().
model_eval

## STEP 6: Make predictions with an unseen tweet.

In [None]:
# Pull the trained weights out of the result dictionary for manual predictions.
weights = model_eval["weights"]

In [None]:
# Hand-written example tweets used to try the classifier on new text.
tweet1 = "Joy in every moment! 🌟 #HappyLife"
tweet2 = "Another letdown. 😞 #Frustrated"
tweet3 = "Just what I needed, more rain. #PerfectDay"

In [None]:
predict_tweet(tweet1 ,  freqs , weights) # Author's note: prediction judged correct

In [None]:
predict_tweet(tweet2 ,  freqs , weights) # Author's note: prediction judged correct

In [None]:
predict_tweet(tweet3 ,  freqs , weights) # Author's note: outcome left as an open question ("????")

#### Comment
  <p>
      Logistic regression is a linear classifier: it models the log-odds of the positive class as a 
    weighted sum of the input features. With the word-frequency features used here, it has no notion 
    of language context or of the order in which words occur. Subtle variations in sentiment, such as 
    negations or sarcasm (as in the third example tweet), may therefore be overlooked.
</p>