In [1]:
import kagglehub
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, f1_score
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Fetch only the NLTK resources this notebook needs; nltk.download('all') would
# work too but pulls every corpus.
# Tokenizer models — presumably required by word_tokenize (verify against the
# installed NLTK version).
nltk.download('punkt')
# Stop-word lists, read via stopwords.words('english').
nltk.download('stopwords')
# WordNet data — presumably what WordNetLemmatizer looks words up in.
nltk.download('wordnet')
# Lexicon for the VADER SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# NOTE(review): newer NLTK releases appear to look for 'punkt_tab' instead of
# 'punkt' when tokenizing — both are downloaded so either version works; confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Shared NLP helpers, created once at module level and read by the functions
# defined below.
# Lemmatizer applied to every kept token in preprocess_text.
lemmatizer = WordNetLemmatizer()
# English stop words stored as a set so the per-token membership test in
# preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# VADER scorer whose polarity_scores() is called by nltk_get_sentiment.
analyzer = SentimentIntensityAnalyzer()

# create preprocess_text function
def preprocess_text(text):
    """Normalize a raw text string for bag-of-words style models.

    The text is lower-cased and tokenized, tokens found in the module-level
    ``stop_words`` set are dropped, and every remaining token is lemmatized
    with the module-level ``lemmatizer``.

    Args:
        text: The input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            # Stop words carry no weight for the downstream vectorizer.
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

def nltk_get_sentiment(text, threshold=0.05):
    """Classify ``text`` as positive (1) or not positive (0) with VADER.

    The decision is based on VADER's ``compound`` score, a single normalized
    value in [-1, 1] that aggregates the whole text. The ``pos`` score is only
    the share of the text that matched positive lexicon entries, so testing
    ``pos > 0`` would label a text positive as soon as it contains one
    positive word, no matter how negative the rest is.

    Args:
        text: The string to score.
        threshold: Minimum compound score for a positive label. The default
            of 0.05 is the cut-off conventionally recommended for VADER.

    Returns:
        1 if the compound score is at least ``threshold``, otherwise 0.
    """
    scores = analyzer.polarity_scores(text)
    return 1 if scores['compound'] >= threshold else 0


def train(X_train, y_train, classifier):
    """Fit ``classifier`` in place on the given training data.

    Args:
        X_train: Training features, passed straight to ``classifier.fit``.
        y_train: Training labels, passed straight to ``classifier.fit``.
        classifier: Any object exposing ``fit(X, y)``.

    Returns:
        None. The fitted state lives on ``classifier`` itself.
    """
    # Announce the step first so a slow fit is visibly in progress.
    print("Training model...")
    classifier.fit(X_train, y_train)

def evaluate(y_pred, y_test):
    """Print a classification report, confusion matrix and F1 score.

    Note the parameter order: predictions come first, ground truth second.
    Internally they are handed to the scikit-learn metrics as
    ``(y_test, y_pred)``.

    Args:
        y_pred: Predicted labels.
        y_test: True labels.

    Returns:
        None. All results are written to stdout.
    """
    print("\nClassification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    print("\nConfusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    f1 = f1_score(y_test, y_pred)
    print("\n F1-Score = ", f1)



def predict(text, vectorizer, classifier):
    """Predict the sentiment of a single new piece of text.

    Args:
        text: Raw input string; it is run through ``preprocess_text`` first.
        vectorizer: Fitted vectorizer exposing ``transform``.
        classifier: Fitted classifier exposing ``predict`` and
            ``predict_proba``.

    Returns:
        A dict with ``'sentiment'`` (``'positive'`` when the predicted label
        equals 1, otherwise ``'negative'``) and ``'confidence'`` (the largest
        class probability reported for this sample).
    """
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([preprocess_text(text)])
    labels = classifier.predict(features)
    class_probs = classifier.predict_proba(features)

    if labels[0] == 1:
        sentiment = 'positive'
    else:
        sentiment = 'negative'

    return {'sentiment': sentiment, 'confidence': max(class_probs[0])}

def tfid_predict(X_train, y_train, X_test):
    """Train a TF-IDF + logistic regression model and label the test set.

    The vectorizer vocabulary is learned from ``X_train`` only and then
    reused to transform ``X_test``.

    Args:
        X_train: Iterable of training documents.
        y_train: Training labels.
        X_test: Iterable of documents to classify.

    Returns:
        The labels predicted by the fitted classifier for ``X_test``.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    model = LogisticRegression(max_iter=1000)

    print("Vectorizing text...")
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    train(train_matrix, y_train, model)

    predictions = model.predict(test_matrix)
    return predictions

In [3]:

# Pick the compute backend in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print('device = ', device)


def generate_prompt(input):
    """Build the few-shot sentiment-classification prompt for one sentence.

    The prompt contains the task instruction, one positive and one negative
    worked example, and finally ``input`` followed by a period and an open
    ``Label: `` slot for the model to complete. The indentation inside the
    literal is part of the emitted text.

    Args:
        input: The sentence to classify (interpolated as-is).

    Returns:
        The complete prompt string.
    """
    prompt = f"""Please perform Sentiment Classification task.
            Given the sentence, assign a sentiment label
            from ['negative', 'positive']. Return label only
            without any other text.
            Sentence: Oh , and more entertaining, too.
            Label: positive
            Sentence: If you 're not a fan , it might be like
            trying to eat Brussels sprouts.
            Label: negative
            Sentence: {input}.
            Label: """
    return prompt


def generate_text_from_prompt(model, tokenizer, prompt):
    """Run one chat-style generation and return only the newly produced text.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Matching tokenizer exposing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        prompt: The user message to send.

    Returns:
        The decoded reply for the single prompt, with special tokens removed.
    """
    system_turn = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}

    # Render the conversation to plain text, ending with the assistant cue.
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer([chat_text], return_tensors="pt").to(model.device)

    full_sequences = model.generate(**encoded, max_new_tokens=512)

    # Each output sequence is sliced past the length of its own input ids so
    # that only the continuation is decoded.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]


def _label_from_response(response):
    """Return 1 if the first sentiment word in ``response`` is 'positive', else 0."""
    # Generated text is free-form: it may be capitalized, padded with
    # whitespace, punctuated or prefixed (e.g. "Label: positive"). Match on
    # the first whole-word label instead of requiring an exact string.
    found = re.search(r'\b(positive|negative)\b', response.lower())
    return 1 if found is not None and found.group(1) == 'positive' else 0


def LLM_predict(X_train, y_train, X_test):
    """Label every text in ``X_test`` by prompting an instruction-tuned LLM.

    The model and tokenizer are loaded on each call and moved to the
    module-level ``device``. Each text is wrapped with ``generate_prompt`` and
    sent through ``generate_text_from_prompt``.

    Args:
        X_train: Unused; accepted so the signature matches ``tfid_predict``.
        y_train: Unused; accepted so the signature matches ``tfid_predict``.
        X_test: Iterable of texts to classify.

    Returns:
        A 1-D integer numpy array with 1 where the reply names 'positive'
        and 0 otherwise (including replies that name no label at all).
    """
    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # torch_dtype="auto",
        # device_map="auto"
        low_cpu_mem_usage=True
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    output_sentiments = [
        generate_text_from_prompt(model, tokenizer, generate_prompt(txt)) for txt in X_test
    ]
    print(output_sentiments)
    # An explicit dtype keeps the result a 1-D int array even for empty input.
    return np.array([_label_from_response(reply) for reply in output_sentiments], dtype=int)





device =  cuda


In [None]:
# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

# Tip: append .iloc[:20] to the read below for a quick smoke test.
df = pd.read_csv(path + "/IMDB Dataset.csv")
# Encode the string labels as integers: 0 = negative, 1 = positive.
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

tqdm.pandas()
# Cleaned text (lower-cased, stop words removed, lemmatized) is meant for the
# TF-IDF model only.
df['text_cleaned'] = df['review'].progress_apply(preprocess_text)

# Split the cleaned text, the raw reviews and the labels in one call so that
# every model is evaluated on exactly the same test rows.
X_train, X_test, raw_train, raw_test, y_train, y_test = train_test_split(
    df['text_cleaned'], df['review'], df['sentiment'], test_size=0.2, random_state=42
)

print("---------> LLM evaluation")
# The LLM gets the raw reviews: stop-word removal strips negations such as
# "not", which can invert the meaning of a sentence.
llm_y_pred = LLM_predict(raw_train, y_train, raw_test)
evaluate(llm_y_pred, y_test)


print("------------> TFID evaluation:")
tfid_y_pred = tfid_predict(X_train, y_train, X_test)
evaluate(tfid_y_pred, y_test)

print("------------> nltk sentiment evaluation:")
# VADER is also scored on raw text for the same reason as the LLM; it relies
# on negation words that the cleaning step removes.
nltk_sentiment_y_pred = np.array([nltk_get_sentiment(txt) for txt in raw_test])
evaluate(nltk_sentiment_y_pred, y_test)

print("------------> nltk sentiment & TFID evaluation:")
# Element-wise product of 0/1 labels is a logical AND: positive only when
# both models agree on positive.
mixed_y_pred = tfid_y_pred * nltk_sentiment_y_pred
evaluate(mixed_y_pred, y_test)

Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


100%|██████████| 50000/50000 [01:55<00:00, 433.68it/s]


---------> LLM evaluation


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]