In [1]:
# Import necessary libraries for ML text classification
# Every third-party name the later cells rely on is brought into scope here.

# Data handling
import pandas as pd
import numpy as np  # NOTE(review): `np` is not referenced by the cells shown in this notebook

# scikit-learn: splitting, text features, model, metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# NOTE(review): only accuracy_score is called in the cells shown here;
# classification_report and confusion_matrix are imported but not used.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Standard library: used by the final cell to serialize the model and vectorizer
import pickle

# Simple status line confirming the cell ran to completion
print("All libraries imported successfully!")

All libraries imported successfully!


In [2]:
# Cell 2: Create sample spam vs ham dataset
import pandas as pd

# Ten hand-written messages, each paired with its class flag.
# The flag is 1 for the spam-style messages and 0 for the ordinary ones,
# alternating so that both classes have five examples.
labelled_messages = [
    ('You have won a free iPhone! Click here NOW!!!', 1),
    ('Hi, how are you doing today?', 0),
    ('Limited time offer: Get 50% off everything!', 1),
    ("Let's meet for coffee tomorrow afternoon", 0),
    ("Congratulations! You've been selected to claim your prize!", 1),
    ('Just finished reading that book you recommended', 0),
    ('CLAIM YOUR FREE MONEY NOW - NO CREDIT CARD NEEDED', 1),
    ('Can you send me the meeting notes from yesterday?', 0),
    ('Your account has been compromised. Verify identity immediately!', 1),
    ('Looking forward to seeing you at the conference', 0),
]

# Column-oriented view of the same records, keyed by column name.
data = {
    'text': [message for message, _ in labelled_messages],
    'label': [flag for _, flag in labelled_messages],
}

# Create DataFrame and summarize it: full frame, shape, per-class counts.
df = pd.DataFrame(data)
print("Dataset created:", df, sep="\n")
print(f"\nDataset shape: {df.shape}")
print(f"\nLabel distribution:", df['label'].value_counts(), sep="\n")

Dataset created:
                                                text  label
0      You have won a free iPhone! Click here NOW!!!      1
1                       Hi, how are you doing today?      0
2        Limited time offer: Get 50% off everything!      1
3           Let's meet for coffee tomorrow afternoon      0
4  Congratulations! You've been selected to claim...      1
5    Just finished reading that book you recommended      0
6  CLAIM YOUR FREE MONEY NOW - NO CREDIT CARD NEEDED      1
7  Can you send me the meeting notes from yesterday?      0
8  Your account has been compromised. Verify iden...      1
9    Looking forward to seeing you at the conference      0

Dataset shape: (10, 2)

Label distribution:
label
1    5
0    5
Name: count, dtype: int64


In [3]:
# Cell 3: Perform train-test split
from sklearn.model_selection import train_test_split

# Features are the raw message strings; targets are the integer labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed makes the partition repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class mix of each
# part is whatever this seed happens to produce rather than being enforced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the size of each part, then the per-class counts inside it.
print(f"Training set size: {len(X_train)}", f"Test set size: {len(X_test)}", sep="\n")
print(f"\nTraining set distribution:", y_train.value_counts(), sep="\n")
print(f"\nTest set distribution:", y_test.value_counts(), sep="\n")

Training set size: 8
Test set size: 2

Training set distribution:
label
0    4
1    4
Name: count, dtype: int64

Test set distribution:
label
1    1
0    1
Name: count, dtype: int64


In [4]:
# Cell 4: Apply TF-IDF vectorization and train Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Text -> TF-IDF features: lower-cased, English stop words removed,
# vocabulary capped at 100 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    max_features=100,
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print(
    f"TF-IDF vectorizer created successfully!",
    f"Training set vectorized shape: {X_train_vectorized.shape}",
    f"Test set vectorized shape: {X_test_vectorized.shape}",
    sep="\n",
)

# Fit the classifier on the training features (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=200, random_state=42).fit(X_train_vectorized, y_train)

print(
    f"\nLogistic Regression classifier trained successfully!",
    f"Classes: {clf.classes_}",
    f"Number of iterations: {clf.n_iter_}",
    sep="\n",
)

TF-IDF vectorizer created successfully!
Training set vectorized shape: (8, 35)
Test set vectorized shape: (2, 35)

Logistic Regression classifier trained successfully!
Classes: [0 1]
Number of iterations: [5]


In [5]:
# Cell 5: Train Logistic Regression classifier on TF-IDF features
#
# The preceding TF-IDF cell already fitted `vectorizer` on X_train and produced
# `X_train_vectorized` / `X_test_vectorized`. This cell used to construct and fit
# a second vectorizer with identical settings on the same data; it now reuses the
# existing objects, so there is a single fitted vectorizer in the notebook and the
# features used for training are the ones later evaluated and saved.
# TfidfVectorizer / LogisticRegression are imported in the notebook's first cell,
# so they are not re-imported here.

vocabulary = vectorizer.get_feature_names_out()

# Guard against stale kernel state: the matrices must have been produced by this
# vectorizer and must line up row-for-row with the training labels.
assert X_train_vectorized.shape[1] == X_test_vectorized.shape[1] == len(vocabulary), (
    "TF-IDF matrices do not match the fitted vocabulary; re-run the vectorization cell."
)
assert X_train_vectorized.shape[0] == len(y_train), (
    "Training features and labels have different lengths; re-run the split and vectorization cells."
)

# Status lines are kept as before so the cell's printed output is unchanged.
print("TF-IDF vectorizer created successfully!")
print(f"Training set vectorized shape: {X_train_vectorized.shape}")
print(f"Test set vectorized shape: {X_test_vectorized.shape}")
print(f"Vocabulary size: {len(vocabulary)}")

# Train Logistic Regression classifier (fixed seed for repeatability,
# iteration cap of 200 for the solver).
clf = LogisticRegression(random_state=42, max_iter=200)
clf.fit(X_train_vectorized, y_train)

print("\nLogistic Regression classifier trained successfully!")
print(f"Classes: {clf.classes_}")
print(f"Number of iterations: {clf.n_iter_}")

TF-IDF vectorizer created successfully!
Training set vectorized shape: (8, 35)
Test set vectorized shape: (2, 35)
Vocabulary size: 35

Logistic Regression classifier trained successfully!
Classes: [0 1]
Number of iterations: [5]


In [6]:
# Cell 6: Predict on test data and evaluate accuracy
from sklearn.metrics import accuracy_score

# Predicted labels for the held-out messages, scored against the true labels.
y_pred = clf.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)

# Show the score as a fraction and as a percentage, followed by the raw
# predictions next to the true labels for a direct comparison.
print(
    f"Model Accuracy: {accuracy:.4f}",
    f"Accuracy Percentage: {accuracy * 100:.2f}%",
    f"\nPredictions: {y_pred}",
    f"Actual labels: {y_test.values}",
    sep="\n",
)

Model Accuracy: 0.5000
Accuracy Percentage: 50.00%

Predictions: [0 0]
Actual labels: [1 0]


In [7]:
# Cell 7: Save trained model and vectorizer
import pickle

# Output file -> object to serialize. Order matters only for readability:
# the classifier is written first, then the fitted TF-IDF vectorizer.
artifacts = {
    'model.pkl': clf,
    'vectorizer.pkl': vectorizer,
}

# Each object is pickled to its own file in the current working directory.
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print(
    "Model and vectorizer saved successfully!",
    "\nFiles created:",
    "- model.pkl (trained Logistic Regression classifier)",
    "- vectorizer.pkl (fitted TF-IDF vectorizer)",
    sep="\n",
)

Model and vectorizer saved successfully!

Files created:
- model.pkl (trained Logistic Regression classifier)
- vectorizer.pkl (fitted TF-IDF vectorizer)
