In [None]:

!pip install -q transformers datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk

# Fetch only the NLTK resources used by the visible cells (stop words, WordNet
# for the lemmatizer, and the punkt tokenizer models for word_tokenize) instead
# of the entire 'all' collection, which is very large and floods the cell output.
# NOTE(review): if other cells need further corpora/taggers, add their ids here.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from transformers import BertTokenizer, BertModel
import torch

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

### Feature Extraction

Choose either TF-IDF or BERT embeddings for converting the clean text into numerical features.

#### Option 1: TF-IDF Vectorization

In [None]:

# This cell consumes the 'clean_text' column; fail early with an actionable
# message when the cleaning cell has not been executed yet in this kernel.
if 'clean_text' not in df.columns:
    raise KeyError("'clean_text' column is missing from df; run the text-cleaning cell before this one.")

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, which matters for imbalanced labels.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Cap the vocabulary at the 5000 highest-frequency terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn vocabulary and IDF weights from the training texts only...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# ...and reuse them unchanged for the test texts so no test information leaks in.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF features shape (Train):", X_train_tfidf.shape)
print("TF-IDF features shape (Test):", X_test_tfidf.shape)

TF-IDF features shape (Train): (25569, 5000)
TF-IDF features shape (Test): (6393, 5000)


In [None]:
# Sanity check: report whether the cleaned-text column is present on df, and
# list the columns that are available when it is not.
has_clean_text = 'clean_text' in df.columns
if not has_clean_text:
    print("'clean_text' column does not exist in the DataFrame.")
    print("Available columns:", df.columns.tolist())
else:
    print("'clean_text' column exists in the DataFrame.")

'clean_text' column does not exist in the DataFrame.
Available columns: ['tweet', 'label']


#### Option 2: BERT Embeddings (Optional)

*Note: Generating BERT embeddings can be computationally intensive and time-consuming, especially on larger datasets. This is an optional step.*

In [None]:

# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# token ids produced by one match the vocabulary expected by the other.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)


def get_bert_embeddings(text, tokenizer, model):
    """Return the first-token embedding(s) of ``text`` as a NumPy array.

    The text is tokenized (padded, truncated to 128 tokens), passed through
    ``model`` without gradient tracking, and the hidden state at sequence
    position 0 (the ``[CLS]`` token for BERT tokenizers) is returned.

    Args:
        text: A single string, or a list of strings to embed as one batch.
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Module whose output exposes ``last_hidden_state`` with layout
            (batch, sequence, hidden).

    Returns:
        ``numpy.ndarray`` of shape ``(hidden,)`` for a single string, or
        ``(len(text), hidden)`` for a list of strings. The batch axis is kept
        even for a one-element list, so per-batch results can be stacked.

    Note:
        The model is used in whatever train/eval mode the caller left it in;
        call ``model.eval()`` beforehand if deterministic embeddings are needed.
    """
    is_single_text = isinstance(text, str)
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Run the forward pass on the device the model lives on (CPU or GPU);
    # otherwise a model moved to GPU fails on CPU-resident input tensors.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence -> (batch, hidden).
    first_token_states = outputs.last_hidden_state[:, 0, :]
    if is_single_text:
        # Drop only the batch axis; a bare squeeze() would also collapse the
        # batch axis of a one-element list and any other size-1 dimension.
        first_token_states = first_token_states.squeeze(0)

    # .cpu() is a no-op on CPU tensors and required before .numpy() on GPU ones.
    return first_token_states.cpu().numpy()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

!wget -q https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv -O imdb_reviews.csv

df = pd.read_csv("imdb_reviews.csv")
df = df[['tweet','label']]
df.head()

Unnamed: 0,tweet,label
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [None]:
# Shared resources for clean_text, built once: the WordNet lemmatizer and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip ``http...`` URLs, delete ASCII
    punctuation, tokenize with NLTK, drop tokens found in ``stop_words``,
    and lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (e.g. NaN or None from a
            missing cell) is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string input.
    """
    # Missing values arrive as float NaN / None and have no .lower(); return an
    # empty document instead of crashing the whole .apply().
    if not isinstance(text, str):
        return ''

    lowered = text.lower()

    # Drop URLs before punctuation removal so they vanish as whole tokens.
    without_urls = re.sub(r'http\S+', '', lowered)

    without_punct = without_urls.translate(_PUNCTUATION_TABLE)

    tokens = nltk.word_tokenize(without_punct)

    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Clean every tweet and store the result next to the raw text.
tweets_cleaned = df['tweet'].apply(clean_text)
df['clean_text'] = tweets_cleaned
df.head()

Unnamed: 0,tweet,label,clean_text
0,@user when a father is dysfunctional and is s...,0,user father dysfunctional selfish drag kid dys...
1,@user @user thanks for #lyft credit i can't us...,0,user user thanks lyft credit cant use cause do...
2,bihday your majesty,0,bihday majesty
3,#model i love u take with u all the time in ...,0,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,0,factsguide society motivation


Key Questions / Analysis / Interpretation to be Evaluated

1. TF-IDF vs BERT:
 - TF-IDF: simple, fast, good for small datasets, ignores word order/context.
 - BERT: captures context and semantics, better for nuanced text, but slower; a GPU is strongly recommended (it runs on CPU, only much more slowly).
 - Prefer TF-IDF for classical ML models, BERT for deep learning or contextual tasks.

2. TF-IDF Vectorization:
 - Converts text into numeric vectors where each column represents a vocabulary term (a single word by default, or an n-gram phrase if `ngram_range` is widened).
 - Values are weighted by term frequency × inverse document frequency.

3. Model Performance:
 - Accuracy, Precision, Recall, F1-score are reported in the classification report.
 - Confusion matrix shows true positives/negatives and misclassifications.

4. Improvements:
 - Use BERT embeddings or DistilBERT for better contextual understanding.
 - Tune hyperparameters, use oversampling for class imbalance.
 - Add more labeled data for better generalization.

