In [3]:
# Install necessary libraries
!pip install dvc scikit-learn pandas --quiet

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Make Google Drive visible to this Colab runtime under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Dataset folder on Drive, plus the full path of each split's CSV inside it
DATA_DIR = "/content/drive/MyDrive/sms+spam+collection"
TRAIN_DATA, VAL_DATA, TEST_DATA = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

# Pull latest tracked dataset versions
# `%cd` switches the notebook's working directory to the dataset folder so the
# relative `.dvc` file names below resolve; the change persists for later cells.
%cd "{DATA_DIR}"
# Ask DVC to fetch the files tracked by the three `.dvc` pointer files.
# NOTE(review): a failing `!` shell command does not stop the cell, so a failed
# pull could leave stale or missing CSVs for the loads below — check the output.
!dvc pull train.csv.dvc validation.csv.dvc test.csv.dvc

# Read the three CSV splits into DataFrames (order: train, validation, test)
train_df, val_df, test_df = map(pd.read_csv, (TRAIN_DATA, VAL_DATA, TEST_DATA))

# Pull the 'text' column (model input) and the 'label' column (target) out of
# each split, keeping the splits in the same train / validation / test order
X_train, X_val, X_test = (frame['text'] for frame in (train_df, val_df, test_df))
y_train, y_val, y_test = (frame['label'] for frame in (train_df, val_df, test_df))

# Turn raw text into TF-IDF feature matrices.
# The vectorizer is fitted on the training text only; validation and test text
# are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)

# Fit a Logistic Regression classifier (default hyperparameters) on the
# training TF-IDF features; `fit` returns the fitted estimator itself.
model = LogisticRegression().fit(X_train_tfidf, y_train)

def _evaluate_split(clf, features, y_true, split_name, lead=""):
    """Predict on one data split and print its accuracy and per-class report.

    Args:
        clf: Fitted classifier exposing ``predict``.
        features: Feature matrix of the split to score.
        y_true: Ground-truth labels of the split.
        split_name: Label used in the printed headings (e.g. "Validation").
        lead: Text emitted right before the accuracy line (e.g. a newline).

    Returns:
        A ``(predictions, accuracy)`` tuple for the split.
    """
    preds = clf.predict(features)
    acc = accuracy_score(y_true, preds)
    print(f"{lead}{split_name} Accuracy: {acc:.4f}")
    # The per-class report is printed alongside the single accuracy figure.
    print(f"\n{split_name} Classification Report:\n", classification_report(y_true, preds))
    return preds, acc


# Score the validation split first, then the test split
val_preds, val_acc = _evaluate_split(model, X_val_tfidf, y_val, "Validation")
test_preds, test_acc = _evaluate_split(model, X_test_tfidf, y_test, "Test", lead="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/sms+spam+collection
Collecting          |0.00 [00:00,    ?entry/s]
Fetching
![A
  0% |          |0/? [00:00<?,    ?files/s][A
Fetching
Building workspace index          |3.00 [00:00,  419entry/s]
Comparing indexes          |4.00 [00:00, 2.59kentry/s]
Applying changes          |0.00 [00:00,     ?file/s]
Everything is up to date.
Validation Accuracy: 0.9545

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       724
           1       0.97      0.68      0.80       112

    accuracy                           0.95       836
   macro avg       0.96      0.84      0.89       836
weighted avg       0.96      0.95      0.95       836


Test Accuracy: 0.9689

Test Classification Report:
               precision    recall  f1-score   support

           0 