In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import tensorflow as tf
# Report the TensorFlow build and the GPU devices it can see.
print("TensorFlow version:", tf.__version__)
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU available:", visible_gpus)


TensorFlow version: 2.18.0
GPU available: []


In [None]:
!pip install -q tensorflow

import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Enable on-demand GPU memory allocation for every GPU TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
try:
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    # Only announce success when there was at least one GPU to configure.
    if gpus:
        print("GPU memory growth set.")
except RuntimeError as err:
    # Report the failure instead of aborting the notebook.
    print(err)


In [None]:
# Fetch the NLTK stop-word corpus, then build the filter set and the stemmer
# that preprocess_tweet relies on.
nltk.download("stopwords")
# English stop words plus a few extra tokens to drop ("amp", "rt", "lt", "gt").
stop_words = set(stopwords.words("english")) | {"amp", "rt", "lt", "gt"}
stemmer = PorterStemmer()

def preprocess_tweet(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip http(s) URLs, strip @mentions and the
    '#' sign (the hashtag word itself is kept), strip punctuation and digits,
    drop stop words, stem what remains.

    Relies on the notebook-level ``stop_words`` set and ``stemmer`` object.

    Args:
        text: The tweet text. Non-string values (e.g. None or a NaN produced
            by a missing CSV field) are treated as an empty tweet.

    Returns:
        The cleaned tweet; an empty string when nothing survives the cleaning
        or when ``text`` is not a string.
    """
    # DataFrame.apply passes missing values through as float NaN, which has no
    # .lower(); treat anything that is not text as "no content".
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"@\w+|#", "", text)
    text = re.sub(r"[^\w\s]|[\d]", "", text)
    return " ".join(stemmer.stem(word) for word in text.split() if word not in stop_words)

# Read the raw CSV (no header row), then work on a 500k-row random sample.
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)
df.columns = ["sentiment", "ids", "date", "flag", "user", "tweet"]
df = (
    df[["tweet", "sentiment"]]
    .sample(500000, random_state=42)
    .reset_index(drop=True)
    # assign() evaluates these in order, so `label` sees the already-mapped `sentiment`.
    .assign(
        sentiment=lambda frame: frame["sentiment"].map({0: "Negative", 4: "Positive"}),
        clean_tweet=lambda frame: frame["tweet"].apply(preprocess_tweet),
        label=lambda frame: frame["sentiment"].map({"Negative": 0, "Positive": 1}),
    )
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'training.1600000.processed.noemoticon.csv'

In [None]:
y = df["label"]

# Train-test split on the cleaned text *before* building the vocabulary, so the
# bag-of-words features are learned from training tweets only (no test leakage).
# Same test_size / random_state as before, so the row partition is unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_tweet"], y, test_size=0.2, random_state=42
)

# Bag-of-words counts restricted to the 1000 most frequent training terms
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Scale (with_mean=False so the sparse matrices stay sparse)
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to dense tensors. Note the *entire* matrix is densified here, not one
# batch at a time, so cast to float32 while still sparse: that avoids a float64
# dense intermediate twice the size of the final tensor.
X_train_dense = tf.convert_to_tensor(X_train_scaled.astype(np.float32).toarray(), dtype=tf.float32)
X_test_dense = tf.convert_to_tensor(X_test_scaled.astype(np.float32).toarray(), dtype=tf.float32)
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)


# Multi-Layer Perceptron

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Feed-forward binary classifier over the bag-of-words features.
# The input shape is declared with an explicit Input object; passing `input_shape`
# to a layer inside Sequential triggers a warning in recent Keras versions.
model = Sequential([
    tf.keras.Input(shape=(X_train_dense.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # single probability for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on whichever device TensorFlow selected (GPU when one is visible, else CPU);
# validation_split=0.2 holds out part of the training tensors for validation.
model.fit(X_train_dense, y_train, validation_split=0.2, epochs=3, batch_size=32, verbose=1)


In [None]:
# Loss and accuracy on the held-out split
loss, accuracy = model.evaluate(X_test_dense, y_test)
print(f"\n Test Accuracy: {accuracy:.4%}")

# Turn the sigmoid outputs into hard 0/1 predictions with a 0.5 threshold
y_pred_probs = model.predict(X_test_dense)
y_pred_classes = (y_pred_probs > 0.5).astype(int).reshape(-1)

# Per-class precision / recall / F1
report_text = classification_report(y_test, y_pred_classes)
print("\nClassification Report:\n", report_text)

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_classes)
class_names = ["Negative", "Positive"]
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Keras MLP")
plt.show()


# LLMs (BERT and RoBERTa)

## BERT

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch

# Reload the raw CSV and keep a 50k-row sample of raw tweets with 0/1 labels.
# Note: this rebinds `df`, replacing the frame used by the MLP section.
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)
df.columns = ["sentiment", "ids", "date", "flag", "user", "tweet"]
df = (
    df[["tweet", "sentiment"]]
    .assign(sentiment=lambda frame: frame["sentiment"].map({0: "Negative", 4: "Positive"}))
    .dropna()
    .sample(50000, random_state=42)
    .reset_index(drop=True)
    .assign(label=lambda frame: frame["sentiment"].map({"Negative": 0, "Positive": 1}))
    .loc[:, ["tweet", "label"]]
)


In [None]:
from transformers import BertTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification


# Tokenizer for the BERT run; the RoBERTa section further down rebinds `tokenizer`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    """Encode ``example["tweet"]`` with the notebook-level ``tokenizer``.

    Every sequence is padded or truncated to exactly 64 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 64}
    return tokenizer(example["tweet"], **encode_options)

# Stratified 80/20 split, then convert to HF datasets.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
# preserve_index=False stops the shuffled pandas index from being stored as an
# auto-named "__index_level_0__" column, so the cleanup below does not depend on
# whether that column happens to exist.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenize in batches and drop the raw text once it has been encoded
train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["tweet"])
test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=["tweet"])

# Expose only the model inputs and the label, as torch tensors
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
from transformers import BertForSequenceClassification

# BERT with a 2-label sequence-classification head.
# Note: this rebinds `model`, replacing the Keras MLP defined earlier.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
training_args = TrainingArguments(
    output_dir="./bert-output",
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` keyword is deprecated.
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    report_to="none"
)


In [None]:
from transformers import Trainer

# Bundle the model, the training arguments and both splits into a Trainer, then fine-tune.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()


In [None]:
def predict_sentiment(text):
    """Classify a piece of text as "Positive" or "Negative".

    Uses the notebook-level ``tokenizer`` and ``model``, i.e. whichever pair
    was assigned most recently.

    Args:
        text: Raw text to classify; it is padded/truncated to 64 tokens.

    Returns:
        "Positive" when the highest-scoring logit is class 1, else "Negative".
    """
    # Inference must run in eval mode: in train mode dropout stays active, so the
    # same text could be classified differently from one call to the next.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    # Move every input tensor to the device that holds the model.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=1).item()
    return "Positive" if pred == 1 else "Negative"


In [None]:
# Quick sanity check: classify one hand-written sentence with the current model.
print(predict_sentiment("I love using BERT for NLP tasks!"))

In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the held-out split
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
# The highest-scoring logit along the last axis is the predicted class
y_pred = predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1
report = classification_report(y_true, y_pred, target_names=["Negative", "Positive"])
print(report)


## RoBERTa

In [None]:
from transformers import BertTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification


# Rebind `tokenizer` to RoBERTa's; this replaces the BERT tokenizer loaded earlier.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(example):
    """Encode ``example["tweet"]`` with the notebook-level ``tokenizer``.

    Every sequence is padded or truncated to exactly 64 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 64}
    return tokenizer(example["tweet"], **encode_options)

# Stratified 80/20 split (same seed as the BERT run), then convert to HF datasets.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
# preserve_index=False stops the shuffled pandas index from being stored as an
# auto-named "__index_level_0__" column, so the cleanup below does not depend on
# whether that column happens to exist.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenize in batches and drop the raw text once it has been encoded
train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["tweet"])
test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=["tweet"])

# Expose only the model inputs and the label, as torch tensors
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
from transformers import BertForSequenceClassification


# RoBERTa with a 2-label sequence-classification head.
# Note: this rebinds `model`, replacing the BERT model trained above.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
training_args = TrainingArguments(
    # Own output directory, so this run is not mixed up with the BERT run.
    output_dir="./roberta-output",
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` keyword is deprecated.
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    report_to="none"
)


In [None]:
from transformers import Trainer

# Bundle the model, the training arguments and both splits into a Trainer, then fine-tune.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()


In [None]:
def predict_sentiment(text):
    """Classify a piece of text as "Positive" or "Negative".

    Uses the notebook-level ``tokenizer`` and ``model``, i.e. whichever pair
    was assigned most recently.

    Args:
        text: Raw text to classify; it is padded/truncated to 64 tokens.

    Returns:
        "Positive" when the highest-scoring logit is class 1, else "Negative".
    """
    # Inference must run in eval mode: in train mode dropout stays active, so the
    # same text could be classified differently from one call to the next.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    # Move every input tensor to the device that holds the model.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=1).item()
    return "Positive" if pred == 1 else "Negative"


In [None]:
# Same sanity check as in the BERT section, now against the current (RoBERTa) model.
print(predict_sentiment("I love using BERT for NLP tasks!"))

In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the held-out split
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
# The highest-scoring logit along the last axis is the predicted class
y_pred = predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1
report = classification_report(y_true, y_pred, target_names=["Negative", "Positive"])
print(report)
