# Task 2 - Build the best sentiment-analysis classifier for your dataset


In [1]:
# Import libraries
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Read the raw tweets from the CSV file and show the resulting frame
df = pd.read_csv(filepath_or_buffer="tweets.csv")
df

Unnamed: 0,Tweet ID,Text,User,Created At,Likes,Retweets,Sentiment
0,449211727471646420,Feeling grateful for my friends and family.,werickson,2023-01-13 00:35:08,156,489,positive
1,519036665081652813,Going for a walk in the park.,jennybutler,2023-02-16 06:24:30,223,788,neutral
2,776023316169815671,I hate it when things don't go my way.,william88,2023-01-24 18:12:37,332,860,negative
3,674750468135750054,I hate it when things don't go my way.,lawrencebauer,2023-02-09 07:14:24,388,881,negative
4,859726107390311299,This is the best day ever!,gerald07,2023-02-28 06:55:54,255,567,positive
...,...,...,...,...,...,...,...
995,250464848751217010,I hate it when things don't go my way.,nhayes,2023-01-28 05:03:18,986,932,negative
996,600819966000157055,I hate it when things don't go my way.,marknixon,2023-04-21 13:27:44,458,61,negative
997,966366146192109165,I'm so upset right now.,hollyflores,2023-03-08 11:29:25,317,179,negative
998,936627265507507170,Just had lunch with a friend.,odickerson,2023-04-09 18:32:54,584,706,neutral


In [3]:
# Keep only the tweet text and its sentiment label, then discard rows where
# either of the two is missing (adjust the names to match your columns).
df = df.loc[:, ['Text', 'Sentiment']]
df = df.dropna()

In [4]:
# Preprocess the text
def preprocess_text(text):
    """Normalize a raw tweet string for bag-of-words style feature extraction.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop #hashtags,
    drop ASCII punctuation, drop digits, then collapse every run of
    whitespace into a single space and trim both ends.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text; may be an empty string if nothing is left.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs ("http\S+" already covers "https...", so no separate branch)
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove user @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (the tag word itself is dropped, not just the '#')
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation (string.punctuation is ASCII-only)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces: the removals above leave gaps inside the string,
    # so collapse internal whitespace runs as well as trimming the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
# Clean every tweet by running it through preprocess_text
cleaned_text = df['Text'].map(preprocess_text)
df['Text'] = cleaned_text

In [6]:
# Convert sentiment labels to numeric
SENTIMENT_TO_ID = {'positive': 1, 'neutral': 0, 'negative': -1}

# Values that are not dictionary keys pass through unchanged, so re-running
# this cell on already-encoded labels is a no-op instead of turning every
# label into NaN (which a plain dict-based .map() would do).
encoded_sentiment = df['Sentiment'].map(
    lambda label: SENTIMENT_TO_ID.get(label, label)
)

# Fail here, with a clear message, rather than later inside the classifier
# if the column holds anything other than the three expected classes.
unexpected_labels = set(encoded_sentiment.unique()) - set(SENTIMENT_TO_ID.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected sentiment labels: {sorted(unexpected_labels, key=str)}"
    )
df['Sentiment'] = encoded_sentiment

# Split the data (80% train / 20% test, fixed seed for reproducibility)
X = df['Text']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Bag of Words (BoW): learn the vocabulary on the training texts only,
# then encode both splits as raw term-count matrices.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the counts and predict the
# held-out split (fit() returns the fitted estimator itself).
nb_bow = MultinomialNB().fit(X_train_bow, y_train)
y_pred_bow = nb_bow.predict(X_test_bow)

In [8]:
# TFIDF: learn vocabulary and IDF weights on the training texts only,
# then encode both splits as TF-IDF weighted matrices.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the TF-IDF features and
# predict the held-out split (fit() returns the fitted estimator itself).
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

In [9]:
# Print a per-class precision/recall/F1 report for each feature set
for heading, predictions in (
    ("Performance on Bag of Words (BoW) features:", y_pred_bow),
    ("\nPerformance on TFIDF features:", y_pred_tfidf),
):
    print(heading)
    print(classification_report(y_test, predictions))

Performance on Bag of Words (BoW) features:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        82
           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Performance on TFIDF features:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        82
           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [13]:
# Pick the better feature set by weighted-average F1 on the test split
bow_report = classification_report(y_test, y_pred_bow, output_dict=True)
tfidf_report = classification_report(y_test, y_pred_tfidf, output_dict=True)

bow_f1 = bow_report['weighted avg']['f1-score']
tfidf_f1 = tfidf_report['weighted avg']['f1-score']

# Default to a tie; override only when one score is strictly higher.
best_model = "Tie"
if tfidf_f1 > bow_f1:
    best_model = "TFIDF"
elif tfidf_f1 < bow_f1:
    best_model = "BoW"
print(f"The best model is based on {best_model}.")

The best model is based on Tie.


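Since both feature sets achieve identical scores, the choice between BoW and TF-IDF should depend on the specific use case. TF-IDF is generally preferred for larger, more varied corpora, where down-weighting very common words helps, while BoW is simpler and faster and is often sufficient for small datasets. Note that the perfect scores should be interpreted with caution: the dataset preview shows the same tweet text repeated across many rows, so identical sentences very likely appear in both the training and test splits, and the result says little about how either model would generalize to unseen text.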


NAME: Mohamed Moubarak Mohamed Misbahou Mkouboi<br>
MATRIC NO: P139575<br>