In [None]:
# META DATA - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

    # Developer details: 
        # Name:  Harshita Jangde, Prachi Tavse, Tanisha Priya, Khushboo Mittal
        # Role: Architect
    # Version:
        # Version: V 1.0 (24 October 2024)
            # Developers: Harshita Jangde, Prachi Tavse, Tanisha Priya, Khushboo Mittal
     
     # Description: This code snippet implements various Natural Language Processing (NLP) text processing 
     # techniques, including lemmatization, stemming, count vectorization, TF-IDF, and word embeddings 
     # (Word2Vec, GloVe). The project preprocesses raw text data for analysis and modeling, enabling 
     # enhanced insights and efficient machine learning applications. Functions include transformations 
     # to reduce word forms, vectorize text, and generate word embeddings for downstream NLP tasks.

# CODE - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

    # Dependency: 
        # Environment:     
            # Python: 3.10.8
            # NLTK: 3.9.1
            # scikit-learn: 1.4.2
            # gensim: 4.3.1

        
# Importing necessary libraries for data manipulation, text processing, and NLP
import pandas as pd   # For data manipulation
import numpy as np    # For numerical operations
import nltk           # For natural language processing
from nltk.corpus import stopwords           # For removing stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer   # For stemming and lemmatization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # For text vectorization
from gensim.models import Word2Vec          # For word embeddings
from transformers import AutoTokenizer, AutoModel   # For pre-trained transformer models

# Load Twitter data from CSV file
twitter_data = pd.read_csv('../Data/twitter_data.csv')

# Remove rows with missing values in 'tweet_content' column
twitter_data.dropna(subset=['tweet_content'], inplace=True)

# Pre-processing

In [3]:
# Download necessary NLTK resources: stopwords and WordNet data for lemmatization
nltk.download('stopwords')
nltk.download('wordnet')

# Define a set of stopwords in English for filtering out common words
stop_words = set(stopwords.words('english'))

# Initialize stemmer and lemmatizer for text normalization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define a text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase and remove stopwords
    tokens = [word for word in text.lower().split() if word not in stop_words]
    
    # Apply stemming to reduce words to their root form
    stemmed = [stemmer.stem(word) for word in tokens]
    
    # Apply lemmatization to convert words to their base form
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    
    # Return the processed tokens as a single string
    return ' '.join(lemmatized)

# Apply the preprocessing function to the 'tweet_content' column
twitter_data['processed_content'] = twitter_data['tweet_content'].apply(preprocess_text)

# Display the first few rows of the dataframe to verify changes
twitter_data.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,tweetID,entity,sentiment,tweet_content,processed_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,"im get borderland murder ,"
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"come border kill all,"
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"im get borderland kill all,"
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"im come borderland murder all,"
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"im get borderland 2 murder all,"


# Count Vectorizer

In [4]:
# Initialize a CountVectorizer to convert text to a matrix of token counts
count_vectorizer = CountVectorizer()

# Fit the vectorizer on the processed text and transform it into a count matrix
count_matrix = count_vectorizer.fit_transform(twitter_data['processed_content'])

# Import csr_matrix from scipy for sparse matrix representation
from scipy.sparse import csr_matrix

# Convert the count matrix to a sparse DataFrame to save memory
# Use feature names as column headers in the DataFrame
count_sparse_df = pd.DataFrame.sparse.from_spmatrix(count_matrix, columns=count_vectorizer.get_feature_names_out())

# Display the first few rows of the sparse DataFrame
count_sparse_df.head()


Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥,𝐍𝐄𝐖𝐒𝐔𝐏𝐃𝐀𝐓𝐄𝐒
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF (Term Frequency-Inverse Document Frequency) Vectorizer

In [5]:
# Initialize a TF-IDF Vectorizer to convert text to a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the processed text and transform it into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(twitter_data['processed_content'])

# Import csr_matrix from scipy for efficient sparse matrix handling
from scipy.sparse import csr_matrix

# Convert the TF-IDF matrix to a sparse DataFrame using CSR format
# Use feature names as column headers in the DataFrame
tfidf_sparse_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())

# Display the first few rows of the sparse DataFrame
tfidf_sparse_df.head()

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥,𝐍𝐄𝐖𝐒𝐔𝐏𝐃𝐀𝐓𝐄𝐒
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Word2Vec Vectorizer

In [7]:
# Prepare the data for Word2Vec by splitting each tweet into a list of words (tokenized sentences)
sentences = [tweet.split() for tweet in twitter_data['processed_content']]

# Train a Word2Vec model on the tokenized sentences
# - vector_size: size of the word vectors
# - window: maximum distance between the current and predicted word
# - min_count: minimum word frequency to include in the vocabulary
# - workers: number of CPU cores to use
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Define a function to get the sentence vector by averaging word embeddings
def get_sentence_vector(sentence, model):
    # Split sentence into words
    words = sentence.split()
    
    # Retrieve vectors for each word if it exists in the model's vocabulary
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    
    # Return the average vector for the sentence; if no words have vectors, return a zero vector
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.vector_size)

# Apply the function to each processed tweet to get the sentence vector representation
twitter_data['word2vec_vector'] = twitter_data['processed_content'].apply(lambda x: get_sentence_vector(x, word2vec_model))
twitter_data.head()


Unnamed: 0,tweetID,entity,sentiment,tweet_content,processed_content,word2vec_vector
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,"im get borderland murder ,","[-1.0566607, 0.5728749, -0.63472146, -0.700388..."
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"come border kill all,","[-0.34483135, 0.3098346, -0.2070818, -0.452307..."
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"im get borderland kill all,","[-1.0111482, 0.41281447, -0.48897535, -0.79711..."
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"im come borderland murder all,","[-0.76692855, 0.5205113, -0.53366506, -0.65859..."
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"im get borderland 2 murder all,","[-1.1728438, 0.61338407, -0.59754443, -0.61138..."


# Glove Vectorizer

In [None]:
# Function to load pre-trained GloVe embeddings from a file
def load_glove_model(glove_file):
    glove_model = {}
    # Open GloVe file and read line by line
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split each line into the word and its vector components
            split_line = line.split()
            word = split_line[0]  # The word itself
            # Convert the remaining elements to a numpy array as the word vector
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding  # Add word and vector to the GloVe model dictionary
    return glove_model

# Specify the path to your downloaded GloVe embeddings file (100-dimensional vectors)
glove_file_path = 'glove.6B\glove.6B.100d.txt'
glove_model = load_glove_model(glove_file_path) 

# Define a function to compute the GloVe vector for an entire sentence
def get_glove_vector(sentence, model):
    words = sentence.split()  # Split sentence into words
    # Retrieve vectors for each word if it exists in the GloVe model
    word_vectors = [model[word] for word in words if word in model] #For each word, if it’s in glove_model, its embedding is retrieved.
    # Return the average vector for the sentence; if no words have vectors, return a zero vector
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100)  # For 100-dimensional embeddings

# Apply the function to each processed tweet to get the sentence vector representation
twitter_data['glove_vector'] = twitter_data['processed_content'].apply(lambda x: get_glove_vector(x, glove_model))

# Display the first few rows of the dataframe with GloVe vectors included
twitter_data.head()

Unnamed: 0,tweetID,entity,sentiment,tweet_content,processed_content,word2vec_vector,glove_vector
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,"im get borderland murder ,","[-1.0566607, 0.5728749, -0.63472146, -0.700388...","[0.22772, 0.0013851999999999949, 0.3162134, -0..."
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"come border kill all,","[-0.34483135, 0.3098346, -0.2070818, -0.452307...","[-0.28461833333333336, -0.16813666666666668, 0..."
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"im get borderland kill all,","[-1.0111482, 0.41281447, -0.48897535, -0.79711...","[0.089055, 0.04352899999999999, 0.37232425, -0..."
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"im come borderland murder all,","[-0.76692855, 0.5205113, -0.53366506, -0.65859...","[0.25339875, 0.014191499999999996, 0.24798925,..."
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"im get borderland 2 murder all,","[-1.1728438, 0.61338407, -0.59754443, -0.61138...","[0.171896, 0.10174520000000001, 0.226181400000..."


# Model Training, Testing and Evaluation

In [9]:
# Import necessary libraries for model training, evaluation, and data splitting
from sklearn.model_selection import train_test_split  # For splitting data into training and test sets
from sklearn.linear_model import LogisticRegression   # For logistic regression model
from sklearn.metrics import classification_report, accuracy_score  # For model evaluation metrics
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to TF-IDF features

# Vectorize the text data using TF-IDF
# Converts processed content into a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(twitter_data['processed_content'])  # Feature matrix
y = twitter_data['sentiment']  # Target variable, assuming 'sentiment' is present in the data

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=1000)  # max_iter increased to ensure convergence
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy and detailed classification report
print("Accuracy:", accuracy_score(y_test, y_pred))  # Outputs the overall accuracy
print("Classification Report:\n", classification_report(y_test, y_pred))  # Detailed report on precision, recall, f1-score


Accuracy: 0.7827333333333333
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.83      0.68      0.75      2666
    Negative       0.78      0.84      0.81      4464
     Neutral       0.81      0.74      0.77      3706
    Positive       0.74      0.83      0.78      4164

    accuracy                           0.78     15000
   macro avg       0.79      0.77      0.78     15000
weighted avg       0.79      0.78      0.78     15000

