# ***Solution of Coding Tasks***

In [15]:
# Importing all necessary libraries
import pandas as pd
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, matthews_corrcoef, confusion_matrix
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

## ***Task-1*** : Implementing Tokenization and Stemming using Python + NLTK/spaCy

In [2]:
# Loading the IMDb dataset
dataset = load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# Converting the dataset to pandas DataFrame
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

# Saving the DataFrames to CSV files
train_df.to_csv("imdb_train.csv", index=False)
test_df.to_csv("imdb_test.csv", index=False)

In [4]:
corpus_train = train_df['text'].values
labels_train = train_df['label'].values
corpus_test = test_df['text'].values
labels_test = test_df['label'].values
print("Total training samples:", len(corpus_train))
print("Total testing samples:", len(corpus_test))

Total training samples: 25000
Total testing samples: 25000


##### ***Text Pre-processing***

In [5]:
# Lowercasing the corpus
corpus_train_lower = []
corpus_test_lower = []
for i in range(len(corpus_train)):
    corpus_train_lower.append(corpus_train[i].lower())
for i in range(len(corpus_test)):
    corpus_test_lower.append(corpus_test[i].lower())

# Adding the Lowercased text column to the DataFrame
train_df["Lowercase Text"] = corpus_train_lower
test_df["Lowercase Text"] = corpus_test_lower

In [6]:
# Removing punctuations, special characters, HTML tags, URLs, and extra spaces from the corpus
corpus_train_cleaned = []
corpus_test_cleaned = []
for i in range(len(corpus_train_lower)):
    cleaned_text = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', re.sub(r'<.*?>', '', re.sub(r'http\S+|www\.\S+', '', corpus_train_lower[i])) )).strip()
    corpus_train_cleaned.append(cleaned_text)
for i in range(len(corpus_test_lower)):
    cleaned_text = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', re.sub(r'<.*?>', '', re.sub(r'http\S+|www\.\S+', '', corpus_test_lower[i])) )).strip()
    corpus_test_cleaned.append(cleaned_text)

# Adding the Cleaned text column to the DataFrame
train_df["Cleaned Text"] = corpus_train_cleaned
test_df["Cleaned Text"] = corpus_test_cleaned

In [7]:
# Tokenization of the cleaned corpus
tokenizer = word_tokenize
corpus_train_tokenized = []
corpus_test_tokenized = []
for i in range(len(corpus_train_cleaned)):
    tokens = tokenizer(corpus_train_cleaned[i])
    corpus_train_tokenized.append(tokens)
for i in range(len(corpus_test_cleaned)):
    tokens = tokenizer(corpus_test_cleaned[i])
    corpus_test_tokenized.append(tokens)

# Adding the Tokenized text column to the DataFrame
train_df["Tokenized Text"] = corpus_train_tokenized
test_df["Tokenized Text"] = corpus_test_tokenized

In [8]:
# Removing stopwords from the tokenized corpus
stop_words = set(stopwords.words('english'))
corpus_train_no_stopwords = []
corpus_test_no_stopwords = []
for tokens in corpus_train_tokenized:
    filtered_tokens = [word for word in tokens if word not in stop_words]
    corpus_train_no_stopwords.append(filtered_tokens)
for tokens in corpus_test_tokenized:
    filtered_tokens = [word for word in tokens if word not in stop_words]
    corpus_test_no_stopwords.append(filtered_tokens)

# Adding the No Stopwords text column to the DataFrame
train_df["No Stopwords Text"] = corpus_train_no_stopwords
test_df["No Stopwords Text"] = corpus_test_no_stopwords

In [9]:
# Stemming
stemmer = PorterStemmer()
corpus_train_stemmed = []
corpus_test_stemmed = []
for i in range(len(corpus_train_no_stopwords)):
    stemmed_tokens = stemmer.stem(' '.join(corpus_train_no_stopwords[i]))
    corpus_train_stemmed.append(stemmed_tokens)
for i in range(len(corpus_test_no_stopwords)):
    stemmed_tokens = stemmer.stem(' '.join(corpus_test_no_stopwords[i]))
    corpus_test_stemmed.append(stemmed_tokens)

# Adding the Stemmed text column to the DataFrame
train_df["Stemmed Text"] = corpus_train_stemmed
test_df["Stemmed Text"] = corpus_test_stemmed

## ***Task-2*** : Build a TF-IDF text classifier for sentiment(IMDb or SST-2)

In [12]:
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2)
)

X_train = tfidf.fit_transform(train_df["Stemmed Text"])
X_test  = tfidf.transform(test_df["Stemmed Text"])

y_train = train_df["label"].values
y_test  = test_df["label"].values

In [14]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),

    "Softmax Regression": LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=1000
    ),

    "Naive Bayes": MultinomialNB(alpha=1.0),

    "Linear SVM": LinearSVC(),

    "Ridge Classifier": RidgeClassifier(),

    "SGD Classifier": SGDClassifier(
        loss="log_loss",
        max_iter=1000,
        tol=1e-3
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1
    ),

    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

trained_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model



In [16]:
smooth = SmoothingFunction().method1
results = {}

for name, model in trained_models.items():
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = None

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "mcc": matthews_corrcoef(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    }

    if y_prob is not None:
        metrics["roc_auc"] = roc_auc_score(y_test, y_prob)
        metrics["log_loss"] = log_loss(y_test, y_prob)
    else:
        metrics["roc_auc"] = None
        metrics["log_loss"] = None

    bleu_scores = [
        sentence_bleu([[str(t)]], [str(p)], smoothing_function=smooth)
        for t, p in zip(y_test, y_pred)
    ]
    metrics["bleu"] = np.mean(bleu_scores)

    results[name] = metrics

In [17]:
for model, metrics in results.items():
    print(f"\n{model}")
    for k, v in metrics.items():
        print(f"{k}: {v}")

best_model_name = max(results, key=lambda x: results[x]["f1"])
best_model = trained_models[best_model_name]

print("\nBEST MODEL BASED ON F1:", best_model_name)


Logistic Regression
accuracy: 0.88748
precision: 0.8861516383640278
recall: 0.8892
f1: 0.8876732020924011
mcc: 0.7749645853240237
confusion_matrix: [[11072  1428]
 [ 1385 11115]]
roc_auc: 0.9550505471999999
log_loss: 0.3269663409204867
bleu: 0.15781874108213434

Softmax Regression
accuracy: 0.88968
precision: 0.8910565189466924
recall: 0.88792
f1: 0.8894854944702677
mcc: 0.7793648283359408
confusion_matrix: [[11143  1357]
 [ 1401 11099]]
roc_auc: 0.9568329471999999
log_loss: 0.29847753838544466
bleu: 0.1582099625523429

Naive Bayes
accuracy: 0.86064
precision: 0.8780610533378062
recall: 0.8376
f1: 0.857353422862758
mcc: 0.7220469925294833
confusion_matrix: [[11046  1454]
 [ 2030 10470]]
roc_auc: 0.9354395520000001
log_loss: 0.378983809489333
bleu: 0.15304583914558986

Linear SVM
accuracy: 0.88296
precision: 0.8903930843255586
recall: 0.87344
f1: 0.881835069865116
mcc: 0.7660588690303443
confusion_matrix: [[11156  1344]
 [ 1582 10918]]
roc_auc: None
log_loss: None
bleu: 0.1570149587887

In [18]:
def predict_sentiment(texts):
    X = tfidf.transform(texts)
    return best_model.predict(X)