In [1]:
# Open Drive for accessing the csv in gdrive

from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
# Access the dataset from the Gdrive.

import pandas as pd

path = '/content/drive/MyDrive/dataset/social_media_sentiment.csv'
df = pd.read_csv(path)

In [None]:
# Dataset Structure : Show the first 5 data from the dataset
df.head()

Unnamed: 0,text,label
0,tbh thissss is lit ðŸ”¥,Positive
1,omg it was normalllll ðŸ¤·,Neutral
2,tbh thisssss is fatnastic ðŸ”¥,Positive
3,ngl this is trash ðŸ˜¡,Negative
4,idk this is perfect ðŸ¥³,Positive


In [None]:
# Dataset Structure : Structural summary of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10000 non-null  object
 1   label   10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [None]:
# Dataset Structure :  Summary statistics of dataset
df.describe()

Unnamed: 0,text,label
count,10000,10000
unique,10000,4
top,fr this is wortsttt ðŸ˜¤,Positive
freq,1,2930


In [None]:
# Dataset Structure :  Return the number of Rows and Columns
df.shape

(10000, 2)

In [None]:
# Dataset Structure :  Counts how many times each class appears in the dataset
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Positive,2930
Negative,2850
Sarcastic,2702
Neutral,1518


**Dataset Structure Summary**

The dataset contains:

*   10,000 rows
*   2 columns:
    *   text â†’ Social media post
    *   label â†’ Sentiment category

    The text column contains short informal social media messages with slang and emojis.
    
    The label column contains four sentiment classes:

      *   Positive (2930)
      *   Negative (2850)
      *   Sarcastic (2702)
      *   Neutral (1518)


The dataset is slightly imbalanced where Neutral having fewer samples.

In [None]:
# PreProcessing of Text Data

import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):

    text = text.lower()                                     # lowercase
    text = re.sub(r"http\S+", "", text)                     # remove URLs
    text = re.sub(r"@\w+", "", text)                        # remove mentions
    text = re.sub(r"#\w+", "", text)                        # remove hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)                 # remove emojis and symbols

    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install gensim # For Word2Vec



In [None]:
# Feature Extraction using Word2Vec

from gensim.models import Word2Vec
import numpy as np

sentences = [text.split() for text in df['clean_text']]

w2v_model = Word2Vec(
    sentences,
    vector_size = 100,
    window = 5,
    min_count = 1,
    workers = 4,
    seed = 42
)

def vectorize(text):

  words = text.split()
  word_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]

  if len(word_vecs) == 0:
    return np.zeros(w2v_model.vector_size) # Return a zero vector of the correct size if no words are found

  return np.mean(word_vecs, axis=0) # Return the mean of word vectors

X = np.array([vectorize(text) for text in df['clean_text']])
y = df['label']


In [None]:
from sklearn.model_selection import train_test_split

# Split the Dataset before Training the Models

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
# Model 1: Random Forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators = 100,
    random_state = 42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

In [None]:
# Model 2: Improved Random Forest

from sklearn.ensemble import RandomForestClassifier

# Refine the Parameters for Better Accuracy
refined_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    random_state=42
)

refined_rf.fit(X_train, y_train)

refined_y_pred_rf = refined_rf.predict(X_test)

In [None]:
# Model 3: Naive Bayes

from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)

In [None]:
# Print Output of Models for Evaluation

from sklearn.metrics import accuracy_score, classification_report

print("Model 1: Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

print("\nModel 2: Improved Random Forest")
print("Accuracy:", accuracy_score(y_test, refined_y_pred_rf))
print(classification_report(y_test, refined_y_pred_rf))

print("\nModel 3: Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Model 1: Random Forest
Accuracy: 0.907
              precision    recall  f1-score   support

    Negative       0.85      0.91      0.88       570
     Neutral       0.80      0.89      0.84       304
    Positive       0.95      0.82      0.88       586
   Sarcastic       1.00      1.00      1.00       540

    accuracy                           0.91      2000
   macro avg       0.90      0.91      0.90      2000
weighted avg       0.91      0.91      0.91      2000


Model 2: Improved Random Forest
Accuracy: 0.9115
              precision    recall  f1-score   support

    Negative       0.86      0.92      0.89       570
     Neutral       0.80      0.91      0.85       304
    Positive       0.96      0.82      0.89       586
   Sarcastic       1.00      1.00      1.00       540

    accuracy                           0.91      2000
   macro avg       0.90      0.91      0.91      2000
weighted avg       0.92      0.91      0.91      2000


Model 3: Naive Bayes
Accuracy: 0.6215
  

# **Conclusion**
**Model Evaluation**

Accuracy Results:

  *   Random Forest: ~90.9%
  *   Improved Random Forest: ~91.1%
  *   Naive Bayes: ~62.8%
---

**Performance Comparison**

Both Random Forest Models outperformed the Naive Bayes model where the accuracy is around 91 â€“ 92%

The Improved version increased accuracy by tweaking the parameters such as:


  *   Increased number of trees
  *   Limit the tree depth
  *   Increased minimum samples per split

by tweaking this parameters, it improved the generalization of the model


Naive Bayes performed lower because it assumes indepence between features. As Word2Vec uses correlated features representation, this result to a mismatch where it doesn't model the data effectively.


---

**Suggested Improvements**

Improvements for the data:
  *   Expand slang word for better clarity
  *   Preserving sentiment-related symbols or emojis.
  *   Balance the dataset by adding more samples to underrepresented classes.
  *   Collect more labeled data to improve generalization.

For the algorithm:
  *   Use GridSearchCV for better hyperparameter tuning
  *   Try other models such as SVM or Logistic Regression
  *   Apply advanced models like BERT for better contextual understanding.

