<a href="https://colab.research.google.com/github/IsraelAdekanye/ML_Jupyter/blob/main/Sentiment%20Analysis%20Using%20GloVe%20Vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('wordnet')

import mlflow
import mlflow.sklearn

from transformers import AutoTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/israel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/israel/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/israel/nltk_data...


In [None]:
# Pull the positive and negative tweet corpora bundled with NLTK's twitter_samples.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Positives are placed first, so the label column is a run of 1s followed by a run of 0s.
tweets = all_positive_tweets + all_negative_tweets
labels = np.concatenate(
    (
        np.ones((len(all_positive_tweets), 1)),
        np.zeros((len(all_negative_tweets), 1)),
    ),
    axis=0,
)

In [None]:
# English stopwords ("the", "is", "and", ...) held in a set for membership tests.
stop_words = set(stopwords.words('english'))

# Word normalizers. preprocess() calls the stemmer; the lemmatizer is only
# referenced by a commented-out alternative inside preprocess().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    """Clean and normalize one raw tweet.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is split on whitespace, English
    stopwords are dropped and the remaining tokens are Porter-stemmed.

    Returns the surviving stems joined by single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    stems = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        # Alternative normalizer that was tried: lemmatizer.lemmatize(token)
        # NOTE(review): these stems are later looked up in the GloVe table;
        # stems that are not dictionary words would presumably be skipped
        # there -- confirm that is intended for this experiment.
        stems.append(stemmer.stem(token))

    return " ".join(stems)

In [None]:
def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to be whitespace-separated: the first
    field is the token, the remaining fields are its vector components.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` numpy array. If a token appears
        more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a trailing newline at the
            # end of the file) has no token; indexing values[0] would raise
            # IndexError, so skip it.
            if not values:
                continue
            word = values[0]
            embeddings[word] = np.asarray(values[1:], dtype='float32')
    return embeddings

# Location of the pre-trained GloVe vectors, given relative to the notebook's
# working directory.
# NOTE(review): the file name suggests the 200-dimensional glove.6B set, which
# would match the dim=200 default of vectorize_tweets -- re-check both if the
# file is swapped for another size.
glove_path = "../glove.6B/glove.6B.200d.txt"
# Parse the whole file into an in-memory {word: vector} dictionary.
glove_embeddings = load_glove_embeddings(glove_path)

In [None]:
def vectorize_tweets(tweets, dim=200, embeddings=None):
    """Turn one preprocessed tweet into a fixed-size vector by mean pooling.

    Despite the plural parameter name (kept for backward compatibility), this
    handles a single tweet: the string is split on whitespace, each token that
    has an entry in the embedding table is looked up, and the found vectors
    are averaged component-wise.

    Parameters
    ----------
    tweets : str
        A single, already preprocessed tweet.
    dim : int, default 200
        Length of the zero vector returned when no token is found. It should
        equal the dimensionality of the embedding vectors so that all rows
        have the same length.
    embeddings : mapping, optional
        Token -> vector lookup table. Defaults to the notebook-level
        ``glove_embeddings`` so existing calls behave as before.

    Returns
    -------
    numpy.ndarray
        The mean of the matched word vectors, or ``np.zeros(dim)`` when the
        tweet contains no known token (including the empty string).
    """
    if embeddings is None:
        # Resolved at call time so the notebook-level table is picked up.
        embeddings = glove_embeddings

    vectors = [embeddings[token] for token in tweets.split() if token in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(dim)


In [None]:
# Write the cleaned text to its own name instead of overwriting `tweets`.
# Rebinding `tweets` made this cell non-idempotent: running it a second time
# would preprocess (and stem) the already preprocessed text and change the
# features without any visible error.
tweets_clean = [preprocess(x) for x in tweets]

# One mean-pooled embedding vector per tweet, in the same order as `labels`.
tweets_X = [vectorize_tweets(tweet) for tweet in tweets_clean]
# Flatten the (n, 1) label column into a 1-D Series for scikit-learn.
tweets_Y = pd.Series(np.squeeze(labels))

# Features and labels must stay aligned row for row.
assert len(tweets_X) == len(tweets_Y), "feature/label count mismatch"

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(tweets_X, tweets_Y, random_state=42, test_size=0.2)

In [None]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the held-out tweets.
y_pred = model.predict(X_test)

# The capitalised names are read again by the MLflow logging cell.
Accuracy, Precision, Recall, F1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One metric per line: accuracy, precision, recall, F1.
print(Accuracy, Precision, Recall, F1, sep="\n")


0.687
0.7010416666666667
0.6650197628458498
0.6825557809330629


In [None]:
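# Manual smoke test on a single sentence (disabled); the output shown below this cell is from an earlier execution.
# raw_tweet =  'I detest your compaany'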
# processed_tweet = preprocess(raw_tweet)
# # print(processed_tweet)
# vectorized_tweet = vectorize_tweets(processed_tweet)
# print(model.predict([vectorized_tweet]))

[0.]


In [None]:
# Point the client at the tracking server BEFORE selecting the experiment:
# set_experiment() looks the experiment up (or creates it) on whichever
# tracking URI is active at the moment it is called, so the original order
# only worked when the URI was already set by a previous execution.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("First Experiment with Sentiment Analysis using Logistic Regression")

with mlflow.start_run(run_name="Logistic Regression with GloVe Embeddings with Stemmmer 200d"):

    # Log evaluation metrics computed on the held-out split
    mlflow.log_metric("Accuracy", Accuracy)
    mlflow.log_metric("Precision", Precision)
    mlflow.log_metric("Recall", Recall)
    mlflow.log_metric("F1_score", F1)

    # Log model hyperparameters
    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    # Log embedding-specific info
    mlflow.log_param("embedding", "GloVe")
    # Read the dimensionality from the fitted model rather than hard-coding it:
    # the previous literal 100 contradicted the 200-d vectors actually used.
    # coef_ has one column per input feature.
    mlflow.log_param("embedding_dim", int(model.coef_.shape[1]))
    mlflow.log_param("vectorizer", "mean_pooling_glove")

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(model, "LogisticRegressionSentimentModel")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🏃 View run Logistic Regression with GloVe Embeddings with Stemmmer 200d at: http://127.0.0.1:5000/#/experiments/447362716636180076/runs/6ef1ed65632d46448683e9cfdc881741
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/447362716636180076
