In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import re
import seaborn as sns


## Read Dataset

In [None]:
# Read the dataset into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute from the filesystem root and has no file
# extension — confirm it matches where the dataset file actually lives.
data = pd.read_csv('/spam_ham_dataset')
data.head()

In [None]:
# Drop the 'Unnamed: 0' and 'label' columns; later cells use 'label_num' as
# the target. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the columns are gone.
data = data.drop(columns=['Unnamed: 0', 'label'], errors='ignore')

In [None]:
# Preview the remaining columns
data.head()

## Exploratory Data Analysis

In [None]:
# Raw text of the email at index label 0
data['text'][0]

In [None]:
# Raw text of the email at index label 10
data['text'][10]

In [None]:
# Raw text of the email at index label 4
data['text'][4]

In [None]:
# Number of missing entries in the email text column
missing_text_count = data['text'].isna().sum()
print(missing_text_count)

# Class balance of the numeric target column
label_counts = data['label_num'].value_counts()
print(label_counts)


In [None]:
# Character count of every email, stored as a new column
data['text_length'] = data['text'].map(len)

# Histogram of the email lengths
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data['text_length'], bins=50, color='skyblue', edgecolor='black')
ax.set_title('Text Length Distribution')
ax.set_xlabel('Length of Text')
ax.set_ylabel('Frequency')
plt.show()


## Data Cleaning

In [None]:
# #Extracting Subject from Text column and making a new column
# import pandas as pd

# def extract_subject_and_text(email_text):
#     # Split the email at the first \r (carriage return)
#     subject = email_text.split('\r', 1)[0].replace('Subject:', '').strip()  
#     text = email_text.split('\r', 1)[-1].strip()
#     return pd.Series([subject, text])

# data[['subject', 'text']] = data['text'].apply(extract_subject_and_text)
# print(data[['subject', 'text']])


import pandas as pd
import re

def extract_subject_and_text(email_text):
    """Split a raw email into its subject line, body, and a reply flag.

    Parameters
    ----------
    email_text : str
        Raw email text, optionally starting with a ``Subject:`` line.

    Returns
    -------
    pandas.Series
        Three positional values: ``[subject, text, reply]`` where ``subject``
        is the stripped subject ('' when there is no leading ``Subject:``
        line), ``text`` is the stripped remainder, and ``reply`` is 1 when the
        subject starts with a reply marker, otherwise 0.
    """
    # The subject runs up to the first line break, or to the end of the string
    # when the email consists of a subject line only.
    subject_match = re.search(r'^Subject:[ \t]*(.*?)(?:\r\n|\r|\n|$)', email_text)
    if subject_match:
        subject = subject_match.group(1).strip()
        text = email_text[subject_match.end(0):].strip()  # everything after the subject line
    else:
        # No leading subject line: treat the whole input as the body
        subject = ''
        text = email_text.strip()

    # Reply marker, case-insensitive: accepts "re:" as well as the
    # space-separated "re :" form that the previous check looked for.
    reply = 1 if re.match(r're\s*:', subject, re.IGNORECASE) else 0

    return pd.Series([subject, text, reply])

# Parse every email once, then write the three parsed values back as columns
# ('text' is overwritten with the body that follows the subject line)
parsed_emails = data['text'].apply(extract_subject_and_text)
data[['subject', 'text', 'reply']] = parsed_emails

# Check the results
print(data[['subject', 'text', 'reply']].head())


In [None]:
# Preview the frame, now including the 'subject' and 'reply' columns
data.head()

In [None]:
# Total number of emails flagged as replies (reply == 1)
num_replies = data['reply'].eq(1).sum()
print("Number of replies:", num_replies)

In [None]:
import matplotlib.pyplot as plt

# Keep only the emails flagged as replies
replies_data = data.loc[data['reply'] == 1]

# Split the replies by class (label_num == 1 is spam, label_num == 0 is not spam)
reply_labels = replies_data['label_num']
num_spam_replies = reply_labels.eq(1).sum()
num_non_spam_replies = reply_labels.eq(0).sum()

# Counts keyed by the bar label
counts = {'Spam': num_spam_replies, 'Not Spam': num_non_spam_replies}

# Bar chart of the two counts
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(counts.keys(), counts.values(), color=['red', 'green'], edgecolor='black')
ax.set_title('Spam vs Non-Spam Replies')
ax.set_xlabel('Reply Type')
ax.set_ylabel('Number of Replies')
plt.show()


In [None]:
# Overlaid length histograms for spam and non-spam emails
fig, ax = plt.subplots(figsize=(10, 6))

# Spam (label_num == 1)
spam_lengths = data.loc[data['label_num'] == 1, 'text_length']
sns.histplot(spam_lengths, color='red', label='Spam', kde=True, bins=30, ax=ax)

# Non-spam (label_num == 0)
non_spam_lengths = data.loc[data['label_num'] == 0, 'text_length']
sns.histplot(non_spam_lengths, color='green', label='Non-Spam', kde=True, bins=30, ax=ax)

ax.set_title('Text Length Distribution for Spam vs Non-Spam Emails')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
ax.legend()

plt.show()


In [None]:
# Replace carriage returns and newlines (\r and \n) with spaces
def remove_carriage_and_newline(text):
    """Return `text` with every carriage return and newline replaced by a single space."""
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, ' ')
    return text

# Clean both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_carriage_and_newline)

data['text'][2000]

In [None]:
# Strip symbols
def remove_symbols(text):
    """Return `text` without any character that is not an ASCII letter, a digit, or whitespace."""
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

# Clean both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_symbols)

data['text'][2000]

In [None]:
# Sample email (index label 2000) — repeats the display at the end of the previous cell
data['text'][2000]

In [None]:
# Convert all letters in the text and subject to lowercase
def convert_to_lowercase(text):
    """Return a lowercased copy of `text`."""
    lowered = text.lower()
    return lowered

# Lowercase both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(convert_to_lowercase)

data['text'][2000]


In [None]:

def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded once and cached."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def remove_stopwords(text, stop_words=None):
    """Remove stop words from whitespace-separated `text`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace and compared word by word
        (case-sensitive, exact match).
    stop_words : collection of str, optional
        Words to drop. Defaults to the NLTK English stop-word list, which is
        loaded once and reused instead of being rebuilt for every call.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Drop stop words from both the body and the subject.
# NOTE(review): symbols (including apostrophes) were already stripped in an
# earlier cell, so stop words that contain an apostrophe presumably no longer
# match here — confirm the intended step order.
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_stopwords)

data['text'][2000]

In [None]:
def remove_emails_and_urls(text):
    """Delete email-like tokens, then tokens starting at 'http' or 'www', from `text`.

    Surrounding whitespace is left untouched, so removed tokens can leave
    consecutive spaces behind.
    """
    patterns = (
        r'\S+@\S+',         # email addresses
        r'http\S+|www\S+',  # URLs
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

# Strip emails and URLs from both the body and the subject.
# NOTE(review): remove_symbols ran in an earlier cell and already deleted
# characters such as '@', ':' and '.', so the email pattern cannot match at
# this point — confirm whether this step should run before symbol removal.
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_emails_and_urls)

data['text'][2000]


In [None]:
def remove_numbers(text):
    """Return `text` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Strip digits from both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_numbers)

data['text'][2000]


In [None]:
def correct_abbreviations(text):
    """Replace a fixed set of whole-word short forms in `text` with their expansions.

    Matching is case-sensitive and anchored on word boundaries, so longer
    words that merely contain a short form are left alone.
    """
    replacements = (
        ('ect', 'etc'),
        ('mobil', 'mobile'),
        ('tel', 'tell'),
    )
    for short_form, expansion in replacements:
        text = re.sub(r'\b' + short_form + r'\b', expansion, text)
    return text

# Expand the short forms in both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(correct_abbreviations)

data['text'][50]



In [None]:
def remove_extra_spaces(text):
    """Collapse every whitespace run in `text` to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

# Normalize whitespace in both the body and the subject
for column in ('text', 'subject'):
    data[column] = data[column].apply(remove_extra_spaces)

data['text'][2000]

In [None]:
# Show the cleaned subject of one randomly chosen row. The position is drawn
# from the actual number of rows instead of a hard-coded 1..5000 range, so
# every row is eligible and the lookup cannot go out of bounds.
data['subject'].iloc[random.randrange(len(data))]

In [None]:
# Drop the helper column that was only needed for the length plots.
# Reassigning with errors='ignore' keeps the cell safe to re-run; the former
# leading data.head() was removed because only a cell's last expression is shown.
data = data.drop(columns='text_length', errors='ignore')
data.head()

In [None]:
# Target is the numeric label; features are the cleaned body and subject
y = data['label_num']
X = data[['text', 'subject']]

# Report the dimensions of both
print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train  # Check the training data


In [None]:
# Checking the first few rows of X_train
# (both statements print the same five rows: .head() and [:5] select the same slice)
print(X_train.head())
print(X_train[:5]) 


In [None]:
from wordcloud import WordCloud

# Join every training email body into one string and build the cloud from it
text_combined = ' '.join(X_train['text'])
wordcloud = WordCloud(width=800, height=400).generate(text_combined)

# Render the cloud without axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Most Frequent Words in Text')
ax.axis('off')
plt.show()


In [None]:
# Attach the labels to a separate copy of the training features. X_train itself
# is left untouched: writing the target into the feature frame mutates a frame
# produced by the split and puts the label next to the model inputs.
train_labelled = X_train.assign(target=y_train)

# Split the labelled copy into spam and non-spam rows
spam_data = train_labelled[train_labelled['target'] == 1]
non_spam_data = train_labelled[train_labelled['target'] == 0]

# One long string of email bodies per class
spam_text_combined = ' '.join(spam_data['text'])
nonspam_text_combined = ' '.join(non_spam_data['text'])

# Word cloud per class
wordcloud_spam = WordCloud(width=800, height=400, background_color='red').generate(spam_text_combined)
wordcloud_nonspam = WordCloud(width=800, height=400, background_color='green').generate(nonspam_text_combined)

# Show both clouds side by side
fig, (ax_spam, ax_nonspam) = plt.subplots(1, 2, figsize=(12, 6))

ax_spam.imshow(wordcloud_spam, interpolation='bilinear')
ax_spam.set_title('Most Frequent Words in Spam Text')
ax_spam.axis('off')

ax_nonspam.imshow(wordcloud_nonspam, interpolation='bilinear')
ax_nonspam.set_title('Most Frequent Words in Non-Spam Text')
ax_nonspam.axis('off')

fig.tight_layout()
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer limited to 20 features, with scikit-learn's English stop words removed
vectorizer = CountVectorizer(stop_words='english', max_features=20)

# Term counts for the training email bodies
X_train_text_matrix = vectorizer.fit_transform(X_train['text'])

# Retained vocabulary and each term's total count across all training emails
top_words = vectorizer.get_feature_names_out()
word_counts = X_train_text_matrix.toarray().sum(axis=0)

# Horizontal bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_words, word_counts, color='purple')
ax.set_title('Top 20 Most Frequent Words (Excluding Stop Words)')
ax.set_xlabel('Word Frequency')
ax.set_ylabel('Words')
plt.show()


In [None]:
# Body and subject joined into one document per email, for both splits
X_combined = X_train['text'] + " " + X_train['subject']
X_test_combined = X_test['text'] + " " + X_test['subject']

# Check the first few rows of combined text
print(X_combined.head())

# TF-IDF with a vocabulary capped at 5000 terms; the vocabulary is learned
# from the training documents only and then reused for the test documents
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_combined)
X_test_tfidf = tfidf.transform(X_test_combined)

# Check the shapes of the transformed data
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Shape of X_test_tfidf:", X_test_tfidf.shape)




In [None]:
# Check the first few rows of the cleaned data
print(data.head())

# Check the shape of X and y
# (X and y are the full, pre-split feature frame and target defined before the split)
print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Candidate classifiers to compare
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": MultinomialNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit every candidate on the TF-IDF training matrix and record its accuracy on
# the TF-IDF test matrix, keyed by model name
results = {}
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    results[model_name] = accuracy_score(y_test, y_pred)

# Report each accuracy
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

# Name of the highest-scoring model (the first one wins on ties)
best_model = max(results, key=lambda name: results[name])
print(f"\nBest Model: {best_model} with Accuracy: {results[best_model]:.4f}")


'''
Logistic Regression: 0.9816
Random Forest: 0.9739
SVM: 0.9903
KNN: 0.6783
Naive Bayes: 0.9517
Gradient Boosting: 0.9575

Best Model: SVM with Accuracy: 0.9903
'''

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the SVM, given as one sub-grid per kernel.
# 'gamma' is only searched for the RBF kernel: the linear kernel ignores it,
# so crossing the two fitted every linear candidate twice for no gain.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validated search on the training matrix, scored by accuracy
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best hyperparameters for SVM:", best_params)
# Result recorded from an earlier run: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test set with the best estimator found by the grid search
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_tfidf)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Accuracy of the grid search's best estimator on both splits
best_estimator = grid_search.best_estimator_
train_accuracy = best_estimator.score(X_train_tfidf, y_train)
test_accuracy = best_estimator.score(X_test_tfidf, y_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")


In [None]:
# Hyperparameters selected by the grid search
best_params = grid_search.best_params_

# Build the final SVC from those hyperparameters and fit it on the full training matrix.
# NOTE(review): class_weight='balanced' was not part of the grid search, so the
# hyperparameters were selected for a differently weighted model — confirm this is intended.
final_model = SVC(class_weight='balanced', random_state=42, **best_params)
final_model.fit(X_train_tfidf, y_train)

# Accuracy on the held-out test matrix
test_accuracy = final_model.score(X_test_tfidf, y_test)
print(f"Test Accuracy: {test_accuracy}")


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the final model configuration on the training matrix
cv_scores = cross_val_score(final_model, X_train_tfidf, y_train, cv=5)  # 5-fold cross-validation
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

# If the mean cross-validation score is close to the test accuracy printed for
# the final model, that suggests the model is not overfitting.

In [None]:
from sklearn.metrics import classification_report

# Test-set predictions from the final model
y_pred = final_model.predict(X_test_tfidf)

# Per-class precision, recall and F1
final_report = classification_report(y_test, y_pred)
print(final_report)


In [None]:
from sklearn.metrics import confusion_matrix
# Test-set predictions from the final model
y_pred = final_model.predict(X_test_tfidf)

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Annotated heatmap of the matrix
class_names = ["Non-Spam", "Spam"]
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
def clean_text(text):
    """Run `text` through the same cleaning steps, in the same order, as the training data."""
    cleaning_steps = (
        remove_carriage_and_newline,
        remove_symbols,
        convert_to_lowercase,
        remove_stopwords,
        remove_emails_and_urls,
        remove_numbers,
        correct_abbreviations,
        remove_extra_spaces,
    )
    for step in cleaning_steps:
        text = step(text)
    return text


# Example new data (replace with actual new data)
new_data = pd.DataFrame({
    'text': [
        """You have won a $1000 gift card! Claim it now by clicking the link below. Don't miss out on this exclusive offer!
Click here to claim your prize now: www.fake-link.com""",
        
        '''Hey, I wanted to follow up on our meeting from last week. Can we schedule a call to discuss the next steps for the project? Let me know what time works for you.''',
        
        '''I just wanted to let you know that we're offering a special discount of 20% off all our products for the next week. 
You can use the promo code "DISCOUNT20" at checkout to claim your savings. Let me know if you need any help with your order!''',

        '''Hello! We’re excited to offer you a special discount of 15% on all products this weekend. 
For example, you can get our latest smartwatch for only $149.99 (original price $179.99). 
Feel free to visit our store or check out our website for more details.
''',

        '''Hi [Name],

We hope you're enjoying your experience with [Product/Service]! 
We wanted to let you know that we've updated our offerings to give you even more value. 
Our regular membership is $99.99, but now it’s available for $89.99 price only for the next few weeks—designed to make it easier for you to continue enjoying all the features you love.

If you’re interested or have any questions, feel free to reach out! We’re here to help.

Best,  
[Your Company Name] Team
'''

    ]
})

# Clean the new emails exactly like the training text before vectorizing.
# The TF-IDF vocabulary was learned from cleaned text, so raw text would give
# the model features it was not trained on.
new_data_clean = new_data['text'].apply(clean_text)

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
new_data_tfidf = tfidf.transform(new_data_clean)

print("Shape of new data after TF-IDF transformation:", new_data_tfidf.shape)

# Predict with the final SVC fitted earlier in the notebook
predictions = final_model.predict(new_data_tfidf)

# Map predictions to human-readable labels (1 = spam)
prediction_labels = ['Spam' if pred == 1 else 'Not Spam' for pred in predictions]

# Show the results
for label in prediction_labels:
    print(f"Prediction: {label}\n")


In [None]:
import joblib

# Persist the fitted final SVC and the fitted TF-IDF vectorizer to disk.
# Both files are needed together to score new text later; the text-cleaning
# functions are not saved by these calls.
joblib.dump(final_model, 'model.joblib')
joblib.dump(tfidf, 'tfidf.joblib')