In [None]:
import pandas as pd
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification,AutoModelForSequenceClassification

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# read in the next cell are reachable from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the two raw sentiment datasets from the mounted Drive folder.
TWITTER_CSV = "/content/drive/MyDrive/Twitter_Data.csv"
REDDIT_CSV = "/content/drive/MyDrive/Reddit_Data.csv"

df_twitter = pd.read_csv(TWITTER_CSV)
df_reddit = pd.read_csv(REDDIT_CSV)

0 : Indicating it is a Neutral Tweet/Comment

1 : Indicating a Positive Tweet/Comment

-1 : Indicating a Negative Tweet/Comment

In [4]:
# Preview the first rows of the Twitter data.
df_twitter.head()

In [5]:
# Preview the first rows of the Reddit data.
df_reddit.head()

In [None]:
# Column names, dtypes and non-null counts for the Twitter data.
df_twitter.info()

In [None]:
# Column names, dtypes and non-null counts for the Reddit data.
df_reddit.info()

In [8]:
# Missing values per column in the Twitter data.
df_twitter.isnull().sum()

In [9]:
# Missing values per column in the Reddit data.
df_reddit.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column (text or label).
df_twitter = df_twitter.dropna(axis=0, how="any")
df_reddit = df_reddit.dropna(axis=0, how="any")

In [11]:
# Re-check the Twitter null counts after dropna(); every column should now be 0.
df_twitter.isnull().sum()

In [12]:
# Re-check the Reddit null counts after dropna(); every column should now be 0.
df_reddit.isnull().sum()

In [None]:
# Number of rows in the Twitter data that exactly repeat an earlier row.
print(df_twitter.duplicated().sum())


In [None]:
# Number of rows in the Reddit data that exactly repeat an earlier row.
print(df_reddit.duplicated().sum())

In [None]:
# Remove exact duplicate rows from BOTH sources. Duplicates were counted for
# the Twitter data as well; leaving any in place would let identical examples
# land on both sides of the train/validation/test split. This is a no-op for a
# frame that has no duplicates.
df_twitter = df_twitter.drop_duplicates()
df_reddit = df_reddit.drop_duplicates()

In [None]:
# Re-check the Reddit duplicate count after drop_duplicates(); should print 0.
print(df_reddit.duplicated().sum())

In [None]:
# Give both sources the same text-column name so they can be stacked.
df_twitter = df_twitter.rename(columns={"clean_text": "comment"})
df_reddit = df_reddit.rename(columns={"clean_comment": "comment"})

In [18]:
# Confirm the Twitter text column is now called 'comment'.
df_twitter.head(1)

In [19]:
# Confirm the Reddit text column is now called 'comment'.
df_reddit.head(1)

In [20]:
# Stack the Twitter and Reddit rows into one frame and show its (rows, columns).
frames = [df_twitter, df_reddit]
df = pd.concat(frames, axis=0)
df.shape

In [None]:
# Shuffle the rows so Twitter and Reddit examples are interleaved.
# - random_state pins the permutation so a Restart & Run All reproduces it.
# - drop=True discards the old (non-unique, post-concat) index instead of
#   inserting it into the frame as a meaningless 'index' column.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
# Display the combined, shuffled frame.
df

0 : Indicating it is a Neutral Tweet/Comment

1 : Indicating a Positive Tweet/Comment

-1 : Indicating a Negative Tweet/Comment

In [None]:
# Bar chart of how many comments fall into each sentiment category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='category', ax=ax)
plt.show()

In [None]:
# Concatenate every comment into one space-separated string for the word cloud.
comments = " ".join(df["comment"])

In [None]:
# Build the word cloud from the joined comment text ...
wordcloud = WordCloud(width=1920, height=1080).generate(comments)

# ... and render it as a borderless image.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Plain Python list of all comment strings.
# NOTE(review): `texts` is not referenced by any later cell visible here — confirm it is still needed.
texts = df["comment"].to_list()

In [None]:
# Train / validation / test split
#
# The raw sentiment labels are -1 / 0 / 1, but the sparse categorical
# cross-entropy loss used for training expects integer class indices in
# [0, num_labels). Shift the labels to 0 / 1 / 2 (negative / neutral / positive)
# and cast them to integers; a predicted class index i therefore corresponds to
# the original sentiment value i - 1.
assert df['category'].isin([-1, 0, 1]).all(), "unexpected sentiment label in 'category'"
labels = df['category'].astype('int64').to_numpy() + 1

# Hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df['comment'].to_numpy(),
    labels,
    test_size=0.1,          # 10% test set
    random_state=42,
    shuffle=True
)

# Carve the validation set out of the remaining training rows
# (10% of the 90%, i.e. about 9% of the full data).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,          # 10% of train → validation
    random_state=42,
    shuffle=True
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

In [None]:
def flatten_text_list(text_list):
    """Return a flat list of strings built from ``text_list``.

    Each element is converted with ``str()``. If an element is itself a list,
    only its first item is kept; an empty nested list becomes ``""`` instead of
    raising ``IndexError``.

    Parameters
    ----------
    text_list : iterable
        Items to convert (strings, other scalars, or nested lists).

    Returns
    -------
    list of str
        One string per input element, in the original order.
    """
    flat = []
    for item in text_list:
        if isinstance(item, list):
            # Nested list: keep its first entry, or fall back to "" when it has none.
            item = item[0] if item else ""
        flat.append(str(item))
    return flat

# Turn each split into a plain list of strings, which is what the tokenizer is fed.
X_train, X_val, X_test = (
    flatten_text_list(split) for split in (X_train, X_val, X_test)
)


In [None]:
# Tokenizer and model settings
model_name = "distilbert-base-uncased"  # smaller than full BERT
max_len = 32  # every comment is padded/truncated to this many tokens; kept small to save memory
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
 # Batch tokenization function
def encode_texts_in_batches(texts, batch_size=50):
    """Tokenize ``texts`` in slices of ``batch_size`` and stack the results.

    Uses the notebook-level ``tokenizer`` and ``max_len``: every text is
    truncated/padded to ``max_len`` tokens and returned as TensorFlow tensors.

    Parameters
    ----------
    texts : list of str
        Texts to encode.
    batch_size : int, default 50
        Number of texts passed to the tokenizer per call.

    Returns
    -------
    dict
        ``{'input_ids': tensor, 'attention_mask': tensor}`` with the per-batch
        tensors concatenated along axis 0.
    """
    encoded_chunks = [
        tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf',
        )
        for start in range(0, len(texts), batch_size)
    ]
    # Stitch the per-chunk tensors back together, one key at a time.
    return {
        key: tf.concat([chunk[key] for chunk in encoded_chunks], axis=0)
        for key in ('input_ids', 'attention_mask')
    }

# Tokenize the three splits (train, validation, test) in that order.
train_encodings, val_encodings, test_encodings = map(
    encode_texts_in_batches, (X_train, X_val, X_test)
)

In [None]:
# TensorFlow datasets
batch_size = 4

# Training pipeline: the shuffle buffer spans the whole training set so each
# epoch sees a genuinely new ordering (a small buffer only reorders examples
# within a short window). prefetch() overlaps input preparation with training.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_encodings, y_train))
    .shuffle(buffer_size=len(y_train))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation / test pipelines keep their order; they are only batched and prefetched.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_encodings, y_val))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_encodings, y_test))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Load the DistilBERT checkpoint with a sequence-classification head that has
# one output per sentiment class (negative / neutral / positive).
num_labels = 3
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    use_safetensors=False,
)

In [None]:
# Compile model
# Fine-tuning a pretrained transformer needs a far smaller step size than Adam's
# default of 1e-3, which tends to destroy the pretrained weights and leave the
# model predicting a single class. 3e-5 is in the range commonly used for
# BERT-style fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# The model outputs raw logits, so the loss applies the softmax itself.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Fine-tune for three epochs, scoring the validation split after each one.
model.fit(train_dataset, epochs=3, validation_data=val_dataset)

In [None]:
# Score the fine-tuned model on the held-out test split.
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_dataset)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.4f}")