In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matt7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matt7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\matt7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load data
data = pd.read_csv('twitter_training.csv')

In [3]:
data.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# 1. Preprocessing 

In [6]:
def preprocess_text(text):
    # Check nan values
    if pd.isna(text):
        return ""

    # Tokenization & lowercasing
    tokens = word_tokenize(text.lower())
    
    # Removing punctuation & nums
    words = [word for word in tokens if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words 
    return ' '.join(lemmatized)

# Apply
data['Processed Tweet Content'] = data['Tweet Content'].apply(preprocess_text)


In [7]:
data.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content,Processed Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland murder


In [15]:
# 2. Vectorization

In [8]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer & transform 
tfidf_matrix = vectorizer.fit_transform(data['Processed Tweet Content'])

In [9]:
# Get feature names 
feature_names = vectorizer.get_feature_names_out()

# Preview feature names
print(feature_names[:25]) 

['aa' 'aaa' 'aaaaaaaaaaaa' 'aaaaaaaaaaaaa'
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
 'aaaaaaaaaaages' 'aaaaaaaaaages' 'aaaaaasee' 'aaaahhh' 'aadii' 'aadil'
 'aajtak' 'aall' 'aamaavpjyc' 'aarogya' 'aaron' 'aaroncarter'
 'aarongreenberg' 'aat' 'aatmanirbhar' 'aatmanirvar' 'ab' 'aback' 'abah'
 'abandon']


In [10]:
# 3. Training the Naïve Bayes Classifier

In [11]:
# Prepare your features (TF-IDF vectors) & labels (Sentiments)
X = tfidf_matrix
y = data['Sentiment']

# Split data training and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Initialize the Multinomial Naïve Bayes classifier
model = MultinomialNB()

# Train 
model.fit(X_train, y_train)

In [13]:
# Predict sentiments test set
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.7036218785566044
              precision    recall  f1-score   support

  Irrelevant       0.93      0.41      0.57      2592
    Negative       0.64      0.88      0.74      4519
     Neutral       0.78      0.59      0.67      3596
    Positive       0.69      0.79      0.74      4230

    accuracy                           0.70     14937
   macro avg       0.76      0.67      0.68     14937
weighted avg       0.74      0.70      0.69     14937



In [14]:
# 5. Fine-tuning and Improvements

In [15]:
# Adjust parameters
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))

# Fit & transform 
tfidf_matrix = vectorizer.fit_transform(data['Processed Tweet Content'])

# Split data
X = tfidf_matrix
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Initialize the Multinomial Naïve Bayes classifier
model = MultinomialNB(alpha=0.1) 

# Train model
model.fit(X_train, y_train)

# Predict sentiments test 
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))


Accuracy: 0.911093258351744
              precision    recall  f1-score   support

  Irrelevant       0.96      0.86      0.91      2592
    Negative       0.87      0.95      0.91      4519
     Neutral       0.93      0.89      0.91      3596
    Positive       0.91      0.91      0.91      4230

    accuracy                           0.91     14937
   macro avg       0.92      0.91      0.91     14937
weighted avg       0.91      0.91      0.91     14937



In [17]:
# 6. Implementation

In [18]:
# Step 1 Preprocessing

# Load validation dataset
validation_data = pd.read_csv('twitter_validation.csv')

validation_data['Processed Tweet Content'] = validation_data['Tweet Content'].apply(preprocess_text)

In [19]:
# Step 2: Vectorize the Preprocessed Text
X_validation = vectorizer.transform(validation_data['Processed Tweet Content'])

In [20]:
# Step 3: Make Predictions
y_validation_pred = model.predict(X_validation)

In [21]:
# Step 4: Evaluate the Predictions

# True
y_validation_true = validation_data['Sentiment']

# Accuracy
accuracy_validation = accuracy_score(y_validation_true, y_validation_pred)
print(f"Validation Accuracy: {accuracy_validation}")

# Classification report
print(classification_report(y_validation_true, y_validation_pred))

Validation Accuracy: 0.981
              precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.97      0.98      0.98       266
     Neutral       0.99      0.97      0.98       285
    Positive       0.97      0.99      0.98       277

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000

