### IR LAB 2

##### Code and output:

In [16]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report

# Locate dataset: use the first candidate (relative to the working directory) that exists
candidates = (
    'CODES/IR_Practicals/datasets/spam.csv',
    'datasets/spam.csv',
    'spam.csv',
)
path = next((c for c in candidates if Path(c).is_file()), None)
if path is None:
    # None of the known relative locations contains the file
    raise FileNotFoundError('spam.csv not found in expected paths.')

# Load the CSV and map whichever known column layout it uses onto
# the canonical two-column schema: Category (label) / Message (text).
df = pd.read_csv(path, encoding='latin1')
cols = df.columns
for mapping in (
    {'label': 'Category', 'text': 'Message'},
    {'v1': 'Category', 'v2': 'Message'},
):
    if set(mapping).issubset(cols):
        # Keep only the two relevant columns, then rename them
        df = df[list(mapping)].rename(columns=mapping)
        break
else:
    # Neither known layout matched the file's header
    raise ValueError(f'Unexpected columns: {list(cols)}')

# Hold out 20% of the rows for evaluation; stratify so both splits
# keep the same class proportions, and fix the seed for reproducibility.
messages, labels = df.Message, df.Category
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF features (English stop words removed) feeding Complement Naive Bayes
vectorizer = TfidfVectorizer(stop_words='english')
classifier = ComplementNB(alpha=0.1)
pipe = Pipeline([('tfidf', vectorizer), ('nb', classifier)])
pipe.fit(X_train, y_train)

# Report per-class metrics on the held-out split
print('Classification Report')
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

# Ad-hoc sanity check: classify a few hand-written messages with the fitted pipeline
new_emails = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/claim-yours to claim now.",
    "Hi team, are we still on for the 3 PM meeting today? Please confirm.",
    "URGENT: Your account has been compromised. Please click here to reset your password immediately!",
    "This is for IR practicals please find the attachments below"
]
PREVIEW_WIDTH = 65  # maximum number of message characters shown per output line

print('\nNew Email Predictions:')
for e, pred in zip(new_emails, pipe.predict(new_emails)):
    # Append the ellipsis only when the message was actually cut off;
    # previously short messages were also printed with a trailing "...".
    preview = e if len(e) <= PREVIEW_WIDTH else f'{e[:PREVIEW_WIDTH]}...'
    # Pad the upper-cased label to 4 characters so the separators line up
    print(f'{pred.upper():4} | {preview}')

Classification Report
              precision    recall  f1-score   support

         ham       0.99      0.97      0.98       966
        spam       0.85      0.96      0.90       149

    accuracy                           0.97      1115
   macro avg       0.92      0.97      0.94      1115
weighted avg       0.97      0.97      0.97      1115


New Email Predictions:
SPAM | Congratulations! You've won a $1,000 Walmart gift card. Go to htt...
HAM  | Hi team, are we still on for the 3 PM meeting today? Please confi...
SPAM | URGENT: Your account has been compromised. Please click here to r...
HAM  | This is for IR practicals please find the attachments below...
