# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the datasets

In [2]:
# Load the datasets
train_data = pd.read_csv('/Users/lilianngonadi/Downloads/twitter_training.csv')
validation_data = pd.read_csv('/Users/lilianngonadi/Downloads/twitter_validation.csv')

# Display the first few rows of each dataset
train_data.head(), validation_data.head()

(   2401  Borderlands  Positive  \
 0  2401  Borderlands  Positive   
 1  2401  Borderlands  Positive   
 2  2401  Borderlands  Positive   
 3  2401  Borderlands  Positive   
 4  2401  Borderlands  Positive   
 
   im getting on borderlands and i will murder you all ,  
 0  I am coming to the borders and I will kill you...     
 1  im getting on borderlands and i will kill you ...     
 2  im coming on borderlands and i will murder you...     
 3  im getting on borderlands 2 and i will murder ...     
 4  im getting into borderlands and i can murder y...     ,
    3364   Facebook Irrelevant  \
 0   352     Amazon    Neutral   
 1  8312  Microsoft   Negative   
 2  4371      CS-GO   Negative   
 3  4433     Google    Neutral   
 4  6273       FIFA   Negative   
 
   I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy,

# Explore the data

In [3]:
# Check the structure of the datasets
train_data.info(), validation_data.info()

# Basic statistics
train_data.describe(), validation_data.describe()

# Check for missing values
train_data.isnull().sum(), validation_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                                                                                                                                                                                                                                     

(2401                                                       0
 Borderlands                                                0
 Positive                                                   0
 im getting on borderlands and i will murder you all ,    686
 dtype: int64,
 3364                                                                                                                                                                                                                                                  0
 Facebook                                                                                                                                                                                                                                              0
 Irrelevant                                                                                                                                                                                                                                    

# Data Cleaning and Preprocessing

In [4]:
# Fill missing values with empty strings
train_data.fillna("", inplace=True)
validation_data.fillna("", inplace=True)

# Combine the text columns into a single column for analysis
train_data['text'] = train_data[['Borderlands', 'im getting on borderlands and i will murder you all ,']].agg(' '.join, axis=1)
validation_data['text'] = validation_data[['Facebook', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']].agg(' '.join, axis=1)

# Select features and labels
X_train = train_data['text']
y_train = train_data['Positive']
X_val = validation_data['text']
y_val = validation_data['Irrelevant']

# Display the cleaned and combined data
train_data[['text', 'Positive']].head(), validation_data[['text', 'Irrelevant']].head()


(                                                text  Positive
 0  Borderlands I am coming to the borders and I w...  Positive
 1  Borderlands im getting on borderlands and i wi...  Positive
 2  Borderlands im coming on borderlands and i wil...  Positive
 3  Borderlands im getting on borderlands 2 and i ...  Positive
 4  Borderlands im getting into borderlands and i ...  Positive,
                                                 text Irrelevant
 0  Amazon BBC News - Amazon boss Jeff Bezos rejec...    Neutral
 1  Microsoft @Microsoft Why do I pay for WORD whe...   Negative
 2  CS-GO CSGO matchmaking is so full of closet ha...   Negative
 3  Google Now the President is slapping Americans...    Neutral
 4  FIFA Hi @EAHelp I’ve had Madeleine McCann in m...   Negative)

# Text Preprocessing and Feature Extraction

In [5]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the validation data
X_val_tfidf = tfidf.transform(X_val)

# Display the shape of the transformed data
X_train_tfidf.shape, X_val_tfidf.shape


((74681, 5000), (999, 5000))

# Model Training XGBClassifier

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Initialize XGBoost model
xgb_model = XGBClassifier(n_estimators=100, random_state=42,  eval_metric='logloss')

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(xgb_model, X_train_tfidf, y_train_encoded, cv=cv, scoring='f1_weighted', n_jobs=-1)

# Train the XGBoost model
xgb_model.fit(X_train_tfidf, y_train_encoded)

# Predict on the validation set
y_val_pred_xgb = xgb_model.predict(X_val_tfidf)

# Decode the predicted labels back to the original labels
y_val_pred_xgb_decoded = label_encoder.inverse_transform(y_val_pred_xgb)

# Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb_decoded)
report_xgb = classification_report(y_val, y_val_pred_xgb_decoded)
conf_matrix_xgb = confusion_matrix(y_val, y_val_pred_xgb_decoded)

# Print cross-validation scores and evaluation metrics
print("Cross-validation F1 scores: ", cv_scores)
print("Mean F1 score: ", cv_scores.mean())
print("Standard deviation of F1 scores: ", cv_scores.std())
print("Validation Accuracy: ", accuracy_xgb)
print("Classification Report: \n", report_xgb)
print("Confusion Matrix: \n", conf_matrix_xgb)


Cross-validation F1 scores:  [0.66158118 0.66181373 0.67194018 0.66849752 0.66400719]
Mean F1 score:  0.6655679604650381
Standard deviation of F1 scores:  0.004040103967803809
Validation Accuracy:  0.7567567567567568
Classification Report: 
               precision    recall  f1-score   support

  Irrelevant       0.83      0.67      0.74       171
    Negative       0.69      0.88      0.78       266
     Neutral       0.79      0.68      0.73       285
    Positive       0.77      0.77      0.77       277

    accuracy                           0.76       999
   macro avg       0.77      0.75      0.75       999
weighted avg       0.76      0.76      0.76       999

Confusion Matrix: 
 [[115  23  16  17]
 [  7 235  13  11]
 [ 10  47 193  35]
 [  7  34  23 213]]
