# Task 2. Experimental Evaluation of NLP Pipelines

https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus

In [None]:
# Download the spaCy English model (en_core_web_sm) that a later cell loads with spacy.load.
# NOTE(review): `!python` runs whichever `python` is first on the shell PATH, which may not be
# the interpreter backing this kernel — confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_sm

# Import libraries
import spacy
import pandas as pd
from sklearn.metrics import classification_report

# Load the NER corpus from the working directory into `df` and preview its first rows.
# The NER cells below iterate over this frame row by row.
df = pd.read_csv("ner.csv")
df.head()

### Classical NLP using tokenization, stemming, POS tagging (Spacy)

In [None]:
import ast
import re

import spacy
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report

# Initialize spaCy and stemmer
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

true_labels = []
pred_labels_classical = []

for _, row in df.iterrows():
    sentence = row['Sentence']
    # The 'Tag' cell holds a stringified Python list; literal_eval parses it
    # without executing arbitrary code (unlike eval).
    tags = ast.literal_eval(row['Tag'])
    true_labels.extend(tags)

    # Character span of every whitespace-delimited token in the sentence.
    # The previous code took its "tokens" from the 'POS' column, so entity text was being
    # compared against POS tags instead of words.
    # NOTE(review): assumes the gold tags are aligned one-per-whitespace-token — confirm.
    # Spans beyond len(tags) are dropped so predictions always match the gold length.
    token_spans = [m.span() for m in re.finditer(r"\S+", sentence)][:len(tags)]

    # Initialize predictions as 'O' (one per gold tag so the report inputs stay aligned)
    pred = ['O'] * len(tags)

    # Process sentence
    doc = nlp(sentence)

    # Classical NLP steps: tokenization (spaCy), stemming (Porter) and POS tagging.
    # These features only illustrate the pipeline; the predictions below come solely from doc.ents.
    token_features = [(token.text, stemmer.stem(token.text), token.pos_) for token in doc]

    # Map spaCy entities onto dataset tokens by character overlap. Matching on offsets
    # (instead of `ent.text.startswith(tok)`) handles repeated words and short tokens
    # that happen to be a prefix of the entity text.
    for ent in doc.ents:
        covered = [
            idx for idx, (tok_start, tok_end) in enumerate(token_spans)
            if tok_start < ent.end_char and tok_end > ent.start_char
        ]
        for position, idx in enumerate(covered):
            # First covered token opens the entity (B-), the rest continue it (I-)
            pred[idx] = ('B-' if position == 0 else 'I-') + ent.label_

    pred_labels_classical.extend(pred)

# Evaluate
# NOTE(review): predictions use spaCy's label names (ent.label_). If the dataset's tag
# vocabulary uses different names, matching entities will still be scored as errors —
# confirm and add a label mapping if needed.
print("Classical NLP Performance:")
print(classification_report(true_labels, pred_labels_classical, zero_division=0))


### Transformer-based pipeline using a pre-trained model (BERT)

In [None]:
import ast
import re

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.metrics import classification_report
import pandas as pd

# Initialize BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

# NER pipeline; "simple" aggregation merges word pieces into entity groups that carry
# character offsets ('start', 'end') into the input sentence.
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

pred_labels_bert = []
true_labels = []

for _, row in df.iterrows():
    sentence = row['Sentence']
    # The 'Tag' cell holds a stringified Python list; literal_eval parses it
    # without executing arbitrary code (unlike eval).
    tags = ast.literal_eval(row['Tag'])
    true_labels.extend(tags)

    # Character span of every whitespace-delimited token, computed once per sentence.
    # The previous code derived offsets from the 'POS' column (tag strings, not words),
    # so token boundaries did not correspond to the sentence text.
    # NOTE(review): assumes the gold tags are aligned one-per-whitespace-token — confirm.
    token_spans = [m.span() for m in re.finditer(r"\S+", sentence)][:len(tags)]

    # Initialize prediction list as 'O' (one per gold tag so the report inputs stay aligned)
    pred = ['O'] * len(tags)

    # Map each predicted entity span to the dataset tokens it overlaps
    for ent in ner_pipe(sentence):
        covered = [
            idx for idx, (tok_start, tok_end) in enumerate(token_spans)
            if tok_start < ent['end'] and tok_end > ent['start']
        ]
        for position, idx in enumerate(covered):
            # First covered token opens the entity (B-), the rest continue it (I-)
            pred[idx] = ('B-' if position == 0 else 'I-') + ent['entity_group']

    pred_labels_bert.extend(pred)

# Evaluate
# NOTE(review): predictions use the model's own entity_group names. If the dataset's tag
# vocabulary uses different names, matching entities will still be scored as errors —
# confirm and add a label mapping if needed.
print("Transformer-based BERT Performance:")
print(classification_report(true_labels, pred_labels_bert, zero_division=0))

# -------------- Task 5: Independent Mini Project: Real-World Application Challenge-----------------

https://www.kaggle.com/datasets/anjaneyatripathi/emotion-classification-nlp?select=emotion-labels-train.csv

In [None]:
import pandas as pd

# Read the three emotion-label splits (train / validation / test) from the working directory
train, val, test = (
    pd.read_csv("emotion-labels-train.csv"),
    pd.read_csv("emotion-labels-val.csv"),
    pd.read_csv("emotion-labels-test.csv"),
)

In [None]:
# Report the per-column count of missing values for each split
null_reports = (
    ("Train set nulls:\n", train),
    ("\nValidation set nulls:\n", val),
    ("\nTest set nulls:\n", test),
)
for header, frame in null_reports:
    print(header, frame.isnull().sum())

### Design and implement a classical NLP pipeline (TF-IDF features + Logistic Regression)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Targets for each split
y_train, y_val, y_test = train["label"], val["label"], test["label"]

# TF-IDF over unigrams and bigrams: vocabulary is learned on train only,
# then reused unchanged for validation and test
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vect = vectorizer.fit_transform(train["text"])
X_val_vect, X_test_vect = (vectorizer.transform(split["text"]) for split in (val, test))

# Class-weighted Logistic Regression fitted on the training features
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
clf.fit(X_train_vect, y_train)


### Evaluate Logistic Regression of NLP Pipeline

In [None]:
# Predict both held-out splits with the fitted classifier
y_val_pred = clf.predict(X_val_vect)
y_test_pred = clf.predict(X_test_vect)

# Print accuracy followed by the full per-class report, validation first, then test
evaluations = (
    ("Validation Accuracy:", "\nValidation Classification Report:\n", y_val, y_val_pred),
    ("Test Accuracy:", "\nTest Classification Report:\n", y_test, y_test_pred),
)
for acc_header, report_header, gold, predicted in evaluations:
    print(acc_header, accuracy_score(gold, predicted))
    print(report_header, classification_report(gold, predicted))


In [None]:
# Confusion matrix of the Logistic Regression predictions on the test split,
# with rows/columns labelled by the classifier's class names
cm = confusion_matrix(y_test, y_test_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=clf.classes_,
    yticklabels=clf.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Test Set")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Generate classification report as dict
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Convert to DataFrame (one row per class plus the summary rows)
report_df = pd.DataFrame(report_dict).transpose()

# Keep only the per-class rows and the precision / recall / f1 columns.
# Summary rows are dropped by name rather than by position, and the accuracy column is
# added with .assign on a fresh frame — assigning into an .iloc slice triggers
# pandas' SettingWithCopyWarning.
summary_rows = ['accuracy', 'macro avg', 'weighted avg']
metrics_df = (
    report_df
    .drop(index=summary_rows, errors='ignore')[['precision', 'recall', 'f1-score']]
    .assign(accuracy=accuracy)  # same overall accuracy repeated for every class
)

# Reorder columns so accuracy comes first
metrics_df = metrics_df[['accuracy', 'precision', 'recall', 'f1-score']]

# Plot heatmap
plt.figure(figsize=(10, len(metrics_df)*0.5 + 2))  # adjust height by number of classes
sns.heatmap(metrics_df.astype(float), annot=True, cmap="YlGnBu", fmt=".2f", cbar_kws={'label': 'Score'})
plt.title("Classification Metrics per Class - Test Set")
plt.ylabel("Class")
plt.xlabel("Metrics")
plt.show()


### Design and implement a Transformer-based pipeline using the pre-trained model DistilBERT

In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Fit the label encoder on the training labels only, then add an integer
# `label_enc` column to every split using that same mapping
le = LabelEncoder().fit(train["label"])
for split_frame in (train, val, test):
    split_frame["label_enc"] = le.transform(split_frame["label"])

# Fast tokenizer matching the DistilBERT checkpoint used below
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``text`` column of ``batch`` with the module-level ``tokenizer``.

    The column is converted to a plain list of strings; sequences are truncated and
    padded (``truncation=True, padding=True``). Returns the tokenizer's encoding object.
    """
    texts = batch["text"].tolist()
    return tokenizer(texts, truncation=True, padding=True)


# Encode each split independently
X_train, X_val, X_test = (tokenize(split_frame) for split_frame in (train, val, test))

def make_dataset(encodings, labels, batch_size=16, shuffle=False):
    """Wrap tokenizer encodings and labels in a batched, prefetched ``tf.data.Dataset``.

    Args:
        encodings: mapping-like tokenizer output; converted with ``dict(...)``.
        labels: label values, sliced in step with the encodings.
        batch_size: number of examples per batch.
        shuffle: when true, shuffle with a buffer of 10000 before batching.

    Returns:
        The resulting ``tf.data.Dataset`` of ``(features, label)`` batches.
    """
    features = dict(encodings)
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    ds = ds.shuffle(10000) if shuffle else ds
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training split is shuffled; validation and test keep their row order
train_dataset = make_dataset(X_train, train["label_enc"].values, shuffle=True, batch_size=16)
val_dataset = make_dataset(X_val, val["label_enc"].values, batch_size=16)
test_dataset = make_dataset(X_test, test["label_enc"].values, batch_size=16)

# DistilBERT with a classification head sized to the number of encoded classes
num_classes = len(le.classes_)
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_classes, from_pt=True
)

# The model outputs raw logits, so the loss is configured with from_logits=True
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=cross_entropy, metrics=["accuracy"])

# Fine-tune for 4 epochs, evaluating on the validation split after each epoch
history = model.fit(train_dataset, validation_data=val_dataset, epochs=4)


### Evaluate the performance

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Unshuffled test batches so predictions line up with the rows of `test`
test_dataset = make_dataset(X_test, test["label_enc"].values, batch_size=16)

# Predicted class = index of the largest logit per example
preds = model.predict(test_dataset)
y_pred = np.argmax(preds["logits"], axis=1)

# Accuracy and per-class report against the encoded gold labels
gold_encoded = test["label_enc"]
test_report = classification_report(gold_encoded, y_pred, target_names=le.classes_)
print("Test Accuracy:", accuracy_score(gold_encoded, y_pred))
print("\nClassification Report (Test):\n", test_report)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

# Per-class metrics as a dict, keyed by the original class names
report_dict = classification_report(
    test["label_enc"], y_pred, target_names=le.classes_, output_dict=True
)

# Overall accuracy, repeated on every class row of the heatmap
overall_acc = accuracy_score(test["label_enc"], y_pred)

# Per-class rows only, with columns ordered: accuracy, precision, recall, f1
metric_columns = ['precision', 'recall', 'f1-score']
report_df = (
    pd.DataFrame(report_dict)
    .transpose()
    .loc[le.classes_, metric_columns]
    .assign(accuracy=overall_acc)[['accuracy'] + metric_columns]
)

# Heatmap of the metric table
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(report_df, annot=True, cmap="YlOrRd", fmt=".2f", cbar=True, ax=ax)
ax.set_title("Classification Metrics per Class (DistilBERT)", fontsize=14)
ax.set_xlabel("Metrics")
ax.set_ylabel("Classes")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of DistilBERT predictions on the test split; axes show class names
cm = confusion_matrix(test["label_enc"], y_pred)
class_names = le.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - DistilBERT (Test Set)")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

# Prepare test labels: one indicator column per class, in the classifier's class order
classes = clf.classes_
y_test_bin = label_binarize(y_test, classes=classes)

# Both probability matrices are compared against the same indicator matrix, so the
# DistilBERT output columns (label-encoder order) must match the classifier's class order.
assert list(le.classes_) == list(classes), "Class order differs between LabelEncoder and LogisticRegression"

# Logistic Regression Probabilities
y_test_proba_logreg = clf.predict_proba(X_test_vect)

# DistilBERT Probabilities (apply softmax to logits)
from scipy.special import softmax
y_test_proba_distil = softmax(preds["logits"], axis=1)

# Compute micro-averaged ROC curve & AUC for each model: flattening the indicator and
# probability matrices pools every (sample, class) pair into one binary problem.
fpr_logreg, tpr_logreg, _ = roc_curve(y_test_bin.ravel(), y_test_proba_logreg.ravel())
auc_logreg = auc(fpr_logreg, tpr_logreg)

fpr_distil, tpr_distil, _ = roc_curve(y_test_bin.ravel(), y_test_proba_distil.ravel())
auc_distil = auc(fpr_distil, tpr_distil)

# Plot ROC curves; the computed AUCs are shown in the legend (they were previously
# calculated but never reported)
plt.figure(figsize=(7,6))

plt.plot(fpr_logreg, tpr_logreg, color="blue", lw=2,
         label=f"Logistic Regression (TF-IDF), AUC = {auc_logreg:.3f}")

plt.plot(fpr_distil, tpr_distil, color="red", lw=2,
         label=f"DistilBERT, AUC = {auc_distil:.3f}")

# Baseline
plt.plot([0,1], [0,1], "k--", lw=1.5, label="Random Guess")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# One pooled curve per model is drawn, not one per class, so the title says micro-averaged
plt.title("Micro-averaged ROC Curves (Test Set)")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()


# -----------Task 4: Word Representations for Semantic Reasoning ----------------

https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Load the Amazon reviews dataset from the working directory and preview its first rows.
# NOTE: this rebinds `df`, which earlier in the notebook held the frame read from "ner.csv".
df = pd.read_csv("Reviews.csv")

df.head()

### Train your own Word2Vec model

In [None]:
# Turn every non-missing review into a list of lowercase tokens via gensim's simple_preprocess
review_texts = df['Text'].dropna()
sentences = review_texts.apply(simple_preprocess)

# Train 100-dimensional Word2Vec embeddings (context window 5, words seen fewer than
# 5 times are dropped) and persist the model to disk
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)
model.save("amazon_word2vec.model")

### Analyze semantic similarity

In [None]:
# Trained word vectors
embeddings = model.wv

# Cosine similarity for one example word pair
word1 = "dog"
word2 = "cat"
similarity = embeddings.similarity(word1, word2)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.3f}")

# Five nearest neighbours of "quality" in the embedding space
nearest_to_quality = embeddings.most_similar("quality", topn=5)
print(nearest_to_quality)

### Design a mini experiment: clustering word vectors to test semantic grouping

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Candidate words for the clustering experiment
candidate_words = ['dog','cat','food','good','bad','taste','love','hate','price','quality']

# Keep only words present in the trained vocabulary: indexing model.wv with a missing
# word raises KeyError, which would abort the whole cell.
words = [w for w in candidate_words if w in model.wv]
skipped_words = [w for w in candidate_words if w not in model.wv]
if skipped_words:
    print(f"Skipping out-of-vocabulary words: {skipped_words}")
if not words:
    raise ValueError("None of the candidate words are in the Word2Vec vocabulary")

# One embedding row per retained word, in the same order as `words`
vectors = np.array([model.wv[w] for w in words])

# KMeans clustering; the cluster count is capped at the number of available words
# because KMeans cannot form more clusters than there are samples.
n_clusters = min(4, len(words))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors)
for w, label in zip(words, kmeans.labels_):
    print(f"{w}: Cluster {label}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import pandas as pd

# Project the word vectors onto their first two principal components
pca = PCA(n_components=2)
reduced = pca.fit_transform(vectors)

# One row per word: 2-D coordinates plus its KMeans cluster id
df_plot = pd.DataFrame({
    "word": words,
    "x": reduced[:, 0],
    "y": reduced[:, 1],
    "cluster": kmeans.labels_,
})

# One colour per distinct cluster
palette = sns.color_palette("Set2", n_colors=df_plot["cluster"].nunique())

plt.figure(figsize=(10, 8))

# Draw each cluster separately (in order of first appearance) so it gets its own legend entry
for cluster_id, cluster_data in df_plot.groupby("cluster", sort=False):
    plt.scatter(
        cluster_data["x"],
        cluster_data["y"],
        s=1800,
        color=palette[cluster_id],
        edgecolor="k",
        alpha=0.8,
        label=f"Cluster {cluster_id}",
    )

    # Write each word centred inside its circle
    for point in cluster_data.itertuples():
        plt.text(point.x, point.y, point.word,
                 ha="center", va="center", fontsize=12, weight="bold")

# Titles and axis labels
plt.title("Word Embedding Clusters (KMeans)", fontsize=16)
plt.xlabel("PCA Dimension 1", fontsize=14)
plt.ylabel("PCA Dimension 2", fontsize=14)

# Legend listing the clusters
plt.legend(title="Clusters", fontsize=12, title_fontsize=13)

plt.show()

# Task 3. Responsible NLP Case Study

https://www.kaggle.com/competitions/nlp-getting-started/data?select=train.csv

In [None]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Training data with labels, read from "train.csv" in the working directory.
# NOTE: the frame is bound to `train_df`; the name `df` is not touched by this cell and
# still refers to whatever frame an earlier cell assigned to it.
train_df = pd.read_csv("train.csv")

In [None]:
# Build the working frame from the labelled data in `train_df`, keeping only complete
# (text, target) rows. The previous code sliced `df`, which at this point still holds the
# frame read from "Reviews.csv" and therefore has no 'text'/'target' columns (KeyError).
# Selecting from `train_df` also makes this cell safe to re-run.
df = train_df[['text', 'target']].dropna()
def clean_text(s):
    """Normalize a raw text string for vectorization.

    Steps, applied in order: lowercase; remove URLs (http..., www...); remove
    @mentions; replace every character other than a-z, 0-9, whitespace, '#'
    and apostrophe with a space; collapse whitespace runs to a single space
    and strip both ends.

    Args:
        s: the input string.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; order matters because later rules see earlier output
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"@\w+", ""),
        (r"[^a-z0-9\s#']", " "),
        (r"\s+", " "),
    )
    cleaned = s.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# Overwrite the 'text' column with its cleaned form; astype(str) ensures clean_text
# always receives a string before it calls .lower().
df['text'] = df['text'].astype(str).map(clean_text)

### Train-Test Split and Vectorize

In [None]:
# Stratified 80/20 split so both parts keep the same target distribution;
# the fixed random_state makes the split repeatable
texts, labels = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.20,
    stratify=labels,
    random_state=42,
)

In [None]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
# The vocabulary is learned on the training texts only and reused for the test texts.
tfidf_options = {"stop_words": 'english', "max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Train and Evaluate Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Class-weighted Logistic Regression trained on the TF-IDF training features
logreg_options = {"max_iter": 1000, "class_weight": 'balanced'}
model = LogisticRegression(**logreg_options)
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict the held-out texts, then print accuracy, per-class report and confusion matrix
y_pred = model.predict(X_test_tfidf)

evaluation_outputs = (
    ("Accuracy:", accuracy_score),
    ("\nClassification report:\n", classification_report),
    ("\nConfusion matrix:\n", confusion_matrix),
)
for header, metric in evaluation_outputs:
    print(header, metric(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Scores: the four headline metrics on the held-out split ---
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
scores = dict(zip(("Accuracy", "Precision", "Recall", "F1-score"), (acc, prec, rec, f1)))

# --- Bar plot: one bar per metric on a fixed 0-1 axis ---
fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(
    list(scores.keys()),
    list(scores.values()),
    color=['skyblue', 'orange', 'green', 'red'],
)
ax.set_ylim(0, 1)
ax.set_title("Model Evaluation Metrics")

# Print each score just above its bar
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, height + 0.02, f"{height:.2f}",
            ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.show()

In [None]:
# --- Confusion matrix heatmap for the binary target (labels 0 and 1) ---
cm = confusion_matrix(y_test, y_pred)
binary_labels = [0, 1]

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=binary_labels,
    yticklabels=binary_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()

### Save Misclassified Samples to check Hallucination and failure cases

In [None]:
# Confidence of each prediction = the larger of the two class probabilities
probs = model.predict_proba(X_test_tfidf)  # shape (n_samples, 2)
pred_conf = probs.max(axis=1)

# One row per held-out example: text, gold label, predicted label, confidence
results = pd.DataFrame({
    'text': X_test.values,
    'true': y_test.values,
    'pred': y_pred,
    'pred_confidence': pred_conf,
})

# Keep only the errors, most confident mistakes first, and write them to CSV
is_wrong = results['true'].ne(results['pred'])
misclassified = results.loc[is_wrong].sort_values('pred_confidence', ascending=False)
misclassified.to_csv("misclassified_examples.csv", index=False)

print(f"\nSaved {len(misclassified)} misclassified examples to misclassified_examples.csv")
print("Top 5 confident wrong predictions:")
print(misclassified.head(5))

### Save keyword bias check

In [None]:
# Quick keyword-based bias check: for every keyword, compare how often the model predicts
# the positive class on texts containing it with how often the gold label is positive.
# NOTE: rates are computed over all of `df`, which includes the rows the model was
# trained on (X_train was split from df), so they are not purely held-out estimates.
keywords = ['allah','pray','bomb','muslim','hurricane','earthquake','flood','fire','attack','refugee','kashmir']
bias_columns = ['keyword', 'count', 'pred_disaster_rate', 'true_disaster_rate']
bias_rows = []
full_X = df['text']
full_y = df['target']
for kw in keywords:
    # Whole-word, case-insensitive match; re.escape guards against regex metacharacters
    mask = full_X.str.contains(r'\b'+re.escape(kw)+r'\b', case=False, na=False)
    subset = df[mask]
    if subset.empty:
        continue
    # Local name chosen so the global `preds` from an earlier cell is not overwritten
    kw_preds = model.predict(vectorizer.transform(subset['text']))
    bias_rows.append({
        'keyword': kw,
        'count': len(subset),
        'pred_disaster_rate': float(kw_preds.mean()),
        'true_disaster_rate': float(full_y[mask].mean())
    })

# Passing explicit columns keeps sort_values('count') valid even when no keyword matched
# (an empty list would otherwise produce a frame without a 'count' column).
bias_df = pd.DataFrame(bias_rows, columns=bias_columns).sort_values('count', ascending=False)
bias_df.to_csv("keyword_bias_check.csv", index=False)
print("\nSaved keyword bias check to keyword_bias_check.csv")
print(bias_df)