# PROJECT B: ML-SA vs lexicon SA


In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
# Load the dataset
dataset_path = 'tweets.csv'
tweets_data = pd.read_csv(dataset_path)
tweets_data

Unnamed: 0,Tweet ID,Text,User,Created At,Likes,Retweets,Sentiment
0,449211727471646420,Feeling grateful for my friends and family.,werickson,2023-01-13 00:35:08,156,489,positive
1,519036665081652813,Going for a walk in the park.,jennybutler,2023-02-16 06:24:30,223,788,neutral
2,776023316169815671,I hate it when things don't go my way.,william88,2023-01-24 18:12:37,332,860,negative
3,674750468135750054,I hate it when things don't go my way.,lawrencebauer,2023-02-09 07:14:24,388,881,negative
4,859726107390311299,This is the best day ever!,gerald07,2023-02-28 06:55:54,255,567,positive
...,...,...,...,...,...,...,...
995,250464848751217010,I hate it when things don't go my way.,nhayes,2023-01-28 05:03:18,986,932,negative
996,600819966000157055,I hate it when things don't go my way.,marknixon,2023-04-21 13:27:44,458,61,negative
997,966366146192109165,I'm so upset right now.,hollyflores,2023-03-08 11:29:25,317,179,negative
998,936627265507507170,Just had lunch with a friend.,odickerson,2023-04-09 18:32:54,584,706,neutral


In [5]:
# Text preprocessing function
def preprocess_text(text):
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r"http\S+|@\w+|#\w+|[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply preprocessing to the dataset
tweets_data['Cleaned_Text'] = tweets_data['Text'].apply(preprocess_text)

In [6]:
# Encode the sentiment labels into numerical format for ML
label_encoder = LabelEncoder()
tweets_data['Sentiment_Label'] = label_encoder.fit_transform(tweets_data['Sentiment'])

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    tweets_data['Cleaned_Text'],
    tweets_data['Sentiment_Label'],
    test_size=0.2,
    random_state=42
)

### ML MODELS

In [7]:
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train and evaluate multiple classifiers
results = {}
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "SVM": SVC(random_state=42),
    "Naive Bayes": MultinomialNB()
}

In [8]:
# Train and test each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_tfidf, y_train)
    # Predict on the test set
    y_pred = clf.predict(X_test_tfidf)
    # Evaluate and store results
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        "Model": clf,
        "Accuracy": accuracy,
        "Report": classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    }

In [9]:
# Print classification reports for each classifier
for name, result in results.items():
    print(f"Classification Report for {name}:")
    print(result["Report"])
    print("-" * 50)

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        82
     neutral       1.00      1.00      1.00        50
    positive       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

--------------------------------------------------
Classification Report for SVM:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        82
     neutral       1.00      1.00      1.00        50
    positive       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

--------------------------------------------------
Classification Report for Naive Bayes:
              preci

### VADER SENTIMENT ANALYSIS

In [10]:
# Initialize VADER sentiment analyzer
vader_analyzer = SentimentIntensityAnalyzer()

In [11]:
# Function to classify sentiment using VADER
def classify_vader_sentiment(text):
    scores = vader_analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return "positive"
    elif scores['compound'] <= -0.05:
        return "negative"
    else:
        return "neutral"

In [12]:
# Apply VADER to the test dataset
tweets_data['VADER_Predicted'] = tweets_data['Cleaned_Text'].apply(classify_vader_sentiment)

# Convert VADER results to match ML sentiment labels
tweets_data['VADER_Label'] = label_encoder.transform(tweets_data['VADER_Predicted'])

# Evaluate VADER sentiment analysis
vader_accuracy = accuracy_score(tweets_data['Sentiment_Label'], tweets_data['VADER_Label'])
vader_report = classification_report(tweets_data['Sentiment_Label'], tweets_data['VADER_Label'], target_names=label_encoder.classes_)

In [13]:
print("\n----- VADER Sentiment Analysis Results -----")
print(f"Accuracy: {vader_accuracy}")
print("Classification Report:")
print(vader_report)


----- VADER Sentiment Analysis Results -----
Accuracy: 0.924
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       422
     neutral       1.00      0.69      0.82       246
    positive       0.81      1.00      0.90       332

    accuracy                           0.92      1000
   macro avg       0.94      0.90      0.90      1000
weighted avg       0.94      0.92      0.92      1000



Since all models performed equally well, the choice of the best model can be based on other considerations, such as Efficiency and Scalability.

If you need a simple, fast, and effective model, go with Naive Bayes or Logistic Regression. Both are excellent choices for scalability and efficiency.

If computational power is not an issue and the dataset size is small to medium, SVM is equally viable.

On the other hand, VADER's rule-based approach is faster and easier to implement but may lack the flexibility of ML models for domain-specific data.


NAME: Mohamed Moubarak Mohamed Misbahou Mkouboi<br>
MATRIC NO: P139575<br>