## Sentiment Analysis of Tweets

### Introduction

Goal: Classify tweets as positive, negative, or neutral.

### Import

In [76]:
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Fetch the NLTK resources used below: the tokenizer models and the stopword lists.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# English stopwords as a set so membership tests during cleaning are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data preparation

In [77]:
path = './drive/MyDrive/Projects/data/DS_1/'

# Read the three class-specific CSVs without treating the first row as a header.
# NOTE(review): presumably each file stores its tweets across the columns of one row
# (the frames are transposed later) - confirm against the data.
df_negative, df_positive, df_neutral = [
    pd.read_csv(path + csv_name, header=None)
    for csv_name in (
        'processedNegative.csv',
        'processedPositive.csv',
        'processedNeutral.csv',
    )
]

In [78]:
def preprocess(df, sentiment=None):
  """Turn a single-row frame of tweets into a labelled, one-tweet-per-row frame.

  Args:
    df: DataFrame whose transpose has exactly one column (the tweets).
    sentiment: Label written to every row of the new 'sentiment' column.

  Returns:
    A new DataFrame with columns 'tweet' and 'sentiment'.
  """
  tweets = df.T
  tweets.columns = ['tweet']
  # assign() returns a new frame carrying the constant label column.
  return tweets.assign(sentiment=sentiment)

# Reshape each class-specific frame to one tweet per row and attach its label.
df_positive, df_negative, df_neutral = [
    preprocess(frame, label)
    for frame, label in (
        (df_positive, 'positive'),
        (df_negative, 'negative'),
        (df_neutral, 'neutral'),
    )
]

# Stack the three classes, then shuffle the rows with a fixed seed.
df = (
    pd.concat([df_negative, df_positive, df_neutral], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Combined dataset shape: {df.shape}")
df.head(10)

Combined dataset shape: (3873, 2)


Unnamed: 0,tweet,sentiment
0,Mamata's dole,neutral
1,Jewish Nobel for Little stars,neutral
2,That would be a great trick happy,positive
3,Defence Partnership,neutral
4,Thanks,positive
5,,neutral
6,824000 new vehicles trashed,neutral
7,Sorry for being tired on stream. Will get some...,negative
8,much appreciated happy Want this,positive
9,they want says at victory rally.,neutral


Clean Tweets

In [79]:
def clean_text(text):
    """Normalize a tweet for vectorization.

    Lowercases the text, strips URLs, @mentions, #hashtags and every character
    that is not a lowercase letter or whitespace, then drops words found in the
    module-level ``stop_words`` set.

    Args:
        text: The raw tweet as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    cleaned = text.lower()
    # Order matters: URLs, mentions and hashtags must be removed while their
    # marker characters are still present, before the non-letter sweep.
    for pattern in (r'http\S+|www\S+', r'@\w+', r'#\w+', r'[^a-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    kept_words = filter(lambda word: word not in stop_words, cleaned.split())
    return ' '.join(kept_words)

# Blank out missing tweets before casting: astype(str) would otherwise turn NaN
# into the literal word "nan", which survives cleaning and becomes a spurious token.
df['clean_tweet'] = df['tweet'].fillna('').astype(str).apply(clean_text)

Train/Test Split (with Stratification)

In [80]:
# Features are the cleaned tweet text; the target is the sentiment label.
X, y = df['clean_tweet'], df['sentiment']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

### Vectorization

BoW and TF-IDF - these vectorizers expect raw text (not token lists).

In [81]:
vectorizer_bow, vectorizer_tfidf = CountVectorizer(), TfidfVectorizer()

# Fit each vocabulary on the training split only.
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)

# The test split is only transformed with the already-fitted vectorizers.
X_test_bow = vectorizer_bow.transform(X_test)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

Word2Vec is trained on a list of tokenized sentences (each sentence is a list of words), so the tweets are tokenized first.

In [82]:
# Tokenize the cleaned tweets into lists of words.
X_train_tokens = [word_tokenize(tweet) for tweet in X_train]
X_test_tokens = [word_tokenize(tweet) for tweet in X_test]

# Train skip-gram embeddings (sg=1) on the training tweets only.
# seed + workers=1 make the embeddings reproducible across runs: with several
# worker threads the training order varies, so a seed alone is not enough.
# (Across interpreter launches PYTHONHASHSEED must also be fixed.)
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=2,   # words seen fewer than 2 times get no vector
    workers=1,
    sg=1,
    seed=42
)

def document_vector(tokens, model):
    """Average the word vectors of ``tokens`` into one document vector.

    Args:
        tokens: Iterable of words making up the document.
        model: Object exposing ``wv`` (supports ``in`` and list indexing) and
            ``vector_size``.

    Returns:
        The element-wise mean of the vectors of the words found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when none are found.
    """
    known_words = [token for token in tokens if token in model.wv]
    if known_words:
        return np.mean(model.wv[known_words], axis=0)
    # No token has a vector: fall back to an all-zero document vector.
    return np.zeros(model.vector_size)

# Build one averaged Word2Vec vector per tweet for the train and test splits.
X_train_w2v, X_test_w2v = [
    np.array([document_vector(tokens, w2v_model) for tokens in token_lists])
    for token_lists in (X_train_tokens, X_test_tokens)
]

### Similarity Analysis

In [83]:
# Remove duplicate cleaned tweets to avoid trivial similarity matches
df_unique = df.drop_duplicates(subset='clean_tweet').reset_index(drop=True)

# Vectorize unique cleaned tweets using TF-IDF
vectorizer_sim = TfidfVectorizer()
X_unique_tfidf = vectorizer_sim.fit_transform(df_unique['clean_tweet'])

# Compute cosine similarity matrix
sim_matrix = cosine_similarity(X_unique_tfidf)

# Zero out diagonal to ignore self-similarity
np.fill_diagonal(sim_matrix, 0)


def top_similar_pairs(sim, k=10):
    """Return the k highest-scoring pairs of a square similarity matrix.

    Only the strict upper triangle (i < j) is ranked, so every unordered pair is
    considered exactly once. Ranking the flattened full matrix and discarding
    the mirrored half afterwards can return fewer than k pairs when scores tie
    at the cut-off.

    Args:
        sim: Square 2-D array of pairwise similarities.
        k: Maximum number of pairs to return.

    Returns:
        List of (i, j, score) tuples with i < j, sorted by descending score;
        ties keep row-major order. Shorter than k if fewer pairs exist.
    """
    rows, cols = np.triu_indices(sim.shape[0], k=1)
    scores = sim[rows, cols]
    # Stable sort on the negated scores gives a deterministic descending order.
    order = np.argsort(-scores, kind='stable')[:k]
    return [(int(rows[pos]), int(cols[pos]), float(scores[pos])) for pos in order]


print("Top 10 most similar tweet pairs:\n")

# Print the pairs in descending similarity order
for i, j, sim_score in top_similar_pairs(sim_matrix, k=10):
    print(f"Similarity: {sim_score:.4f}")
    print(f" Tweet 1: {df_unique.loc[i, 'tweet']}")
    print(f" Tweet 2: {df_unique.loc[j, 'tweet']}\n")

Top 10 most similar tweet pairs:

Similarity: 1.0000
 Tweet 1:  l  Uttar Pradesh (leads/trends) 373/403 264 70 27 Others 12 
 Tweet 2: Uttar Pradesh (leads/trends) 103/for03 55 26 18 Others for 

Similarity: 1.0000
 Tweet 1: Love this!
 Tweet 2: Love you too P!

Similarity: 1.0000
 Tweet 1: thanks happy
 Tweet 2: thanks b happy

Similarity: 1.0000
 Tweet 1: I WAITED FOR YOU unhappy  
 Tweet 2: I WAITED FOR U unhappy 

Similarity: 0.9815
 Tweet 1: Hi! We tried to call your number but got no response unhappy  Please share another suitable time and an alternate number for us to.. cont1
 Tweet 2: Hi! We tried to call your number but got no response unhappy  Please share another suitable time and an alternate number for.. cont1

Similarity: 0.9632
 Tweet 1: Hey thanks for being top new followers this week! Much appreciated happy   Want this ?
 Tweet 2: Hey thanks for being top new followers this week! Much appreciated happy

Similarity: 0.9628
 Tweet 1: Thanks for the recent follow Happy to

### Machine Learning Models

In [84]:
# Classifiers to compare. The random forest is seeded so that the results table
# is reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# One record per (vectorizer, model) combination; turned into a DataFrame later.
results = []

# Feature representation name -> (train matrix, test matrix)
vectorized_datasets = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Bag-of-Words": (X_train_bow, X_test_bow),
    "Word2Vec": (X_train_w2v, X_test_w2v)
}

for vectorizer_name, (X_train_vec, X_test_vec) in vectorized_datasets.items():
    for model_name, model in models.items():
        # Skip Naive Bayes for Word2Vec embeddings (continuous features)
        if vectorizer_name == "Word2Vec" and model_name == "Naive Bayes":
            continue

        # The same estimator object is refit for each feature representation.
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        acc = accuracy_score(y_test, y_pred)
        # Weighted F1 averages per-class F1 by class support.
        f1 = f1_score(y_test, y_pred, average='weighted')
        results.append({
            "Model": model_name,
            "Vectorizer": vectorizer_name,
            "Accuracy": round(acc, 3),
            "F1-score": round(f1, 3)
        })

In [85]:
# Rank every (model, vectorizer) combination from highest to lowest accuracy.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
results_df = results_df.reset_index(drop=True)
results_df

Unnamed: 0,Model,Vectorizer,Accuracy,F1-score
0,SVM,TF-IDF,0.897,0.896
1,Logistic Regression,TF-IDF,0.894,0.893
2,Random Forest,TF-IDF,0.888,0.887
3,Naive Bayes,Bag-of-Words,0.885,0.885
4,Random Forest,Bag-of-Words,0.884,0.884
5,Logistic Regression,Bag-of-Words,0.883,0.882
6,SVM,Bag-of-Words,0.88,0.88
7,Naive Bayes,TF-IDF,0.879,0.879
8,Random Forest,Word2Vec,0.834,0.833
9,SVM,Word2Vec,0.773,0.772
