First things first: let's download and load our data. I picked this dataset from Kaggle (https://www.kaggle.com/datasets/venky73/spam-mails-dataset?resource=download) and loaded it into a pandas DataFrame.

In [1]:
import pandas as pd

# Relative path to the downloaded CSV; resolved against the current working directory.
file_path = 'spam_ham_dataset.csv'
data = pd.read_csv(file_path)

# Quick inspection: first rows, schema/dtypes, numeric summary, missing values per column.
print(data.head())
print(data.info())  # info() prints its own report and returns None, hence the "None" line in the output
print(data.describe())
print(data.isnull().sum())

   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
        Unnamed: 0    label_num
count  5171.000000  5171.00000

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


The dataset contains 4 columns: `Unnamed: 0`, an index column (can be dropped); `label`, the label of the email (`ham` for non-spam, `spam` for spam); `text`, the content of the email; and `label_num`, the numeric version of the label (0 for ham, 1 for spam). There are 5,171 entries with no missing values.

Then let's start preprocessing: clean the text by lowercasing it and removing punctuation, numbers, special characters, and stopwords, and tokenize it by splitting on whitespace. (Note: the code below does not apply stemming.)

In [1]:
import re
import string

# Hand-maintained English stopword set used by the text-cleaning step.
# NOTE: entries containing an apostrophe (e.g. "don't") can only match tokens
# that still contain the apostrophe; preprocess_text replaces punctuation with
# spaces before tokenizing, so there the split forms ('don', 't') do the work.
manual_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
    'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
    "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
    "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
}

In [None]:
# Compiled once so the patterns are not rebuilt for every email.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r'\d+')


def preprocess_text(text, stopwords=None):
    """Normalize raw text into a space-separated string of kept tokens.

    Steps, in order: lowercase, replace every ASCII punctuation character
    with a space, delete runs of digits, split on whitespace, drop stopwords.

    Args:
        text: Raw text. Any non-``str`` value (e.g. ``None`` or the float NaN
            pandas uses for missing cells) yields an empty string instead of
            raising.
        stopwords: Optional collection of tokens to drop. When omitted, the
            notebook-level ``manual_stopwords`` set is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    if stopwords is None:
        # Resolved at call time so the function can be defined before the set.
        stopwords = manual_stopwords

    text = text.lower()
    # Punctuation becomes a space (not ''), so "re:hello" splits into two tokens.
    text = _PUNCTUATION_RE.sub(" ", text)
    # Digits are deleted outright, so "abc123def" collapses to "abcdef".
    text = _DIGITS_RE.sub('', text)
    return " ".join(word for word in text.split() if word not in stopwords)

# Clean every email body and store the result next to the raw text in a new column.
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Raw vs. cleaned text for the first rows (a bare trailing expression is shown as the cell's output).
data[['text', 'cleaned_text']].head()

After preprocessing, convert the preprocessed text into numerical features using TF-IDF.

In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB 262.6 kB/s eta 0:00:42
   ---------------------------------------- 0.0/11.0 MB 262.6 kB/s eta 0:00:42
   ---------------------------------------- 0.0/11.0 MB 262.6 kB/s eta 0:00:42
   ---------------------------------------- 0.0/11.0 MB 262.6 kB/s eta 0:00:42
   ---------------------------------------- 0.0/11.0 MB 262.6 kB/s eta 0:00:42
 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer; max_features caps the vocabulary at 3000 terms.
tfidf = TfidfVectorizer(max_features=3000)

# Fit and transform the cleaned text.
# The result is kept as the SciPy sparse matrix that fit_transform returns:
# train_test_split, MultinomialNB and GridSearchCV all accept sparse input, so
# materializing a dense (n_emails x 3000) array only costs memory.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split in the next cell, so the vocabulary and IDF weights are
# computed with the test emails included. For a leak-free estimate, split the
# text first and fit the vectorizer on the training portion only.
X = tfidf.fit_transform(data['cleaned_text'])

# Labels (spam or not spam): 0 = ham, 1 = spam
y = data['label_num']

The next step is model training: train a Naive Bayes classifier and evaluate it on a held-out test set.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create the Naive Bayes classifier and fit it on the training portion
# (fit returns the estimator itself, so the two steps can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out samples
y_pred = model.predict(X_test)

# Report overall accuracy, the confusion matrix and the per-class report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Now, let's tune the smoothing parameter with a cross-validated grid search and evaluate the tuned model on the test set.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values for alpha, the Naive Bayes smoothing parameter
param_grid_nb = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

# 5-fold cross-validated search over the grid, scored by accuracy, using all
# CPU cores; fit returns the search object itself, so it is chained here.
grid_nb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_nb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning setting and its mean cross-validation accuracy
print("Best Parameters for Naive Bayes:", grid_nb.best_params_)
print("Best Cross-Validation Accuracy for Naive Bayes:", grid_nb.best_score_)

# Score the refitted best estimator on the held-out test set
nb_best_model = grid_nb.best_estimator_
y_pred_nb = nb_best_model.predict(X_test)

# Same three reports as for the untuned model
print("Test Accuracy for Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

## 1. Data Cleaning and Preprocessing

In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset (relative path; resolved against the current working directory)
df = pd.read_csv('customer_churn_data.csv')

# Handle missing values: drop every row that has at least one missing cell
df = df.dropna()  # Simple strategy, depending on the data

# Encode categorical variables: 'gender' is replaced in place by integer codes
# NOTE(review): only 'gender' is encoded here; any other non-numeric column
# would still be non-numeric afterwards — confirm against the CSV schema.
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])

# Scale numerical features: 'age' and 'income' are standardized in place
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so its statistics include the test rows.
scaler = StandardScaler()
df[['age', 'income']] = scaler.fit_transform(df[['age', 'income']])


## 2. Exploratory Data Analysis (EDA)

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Plot distributions
# ('age' was standardized in the preprocessing cell, so the x-axis is in z-score units)
sns.histplot(df['age'], kde=True)
plt.show()

# Correlation matrix
# numeric_only=True restricts the computation to numeric columns; since
# pandas 2.0 the default raises if the frame still holds a non-numeric column.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Churn distribution: number of rows per value of the target column
sns.countplot(x='churn', data=df)
plt.show()


## 3. Feature Engineering

In [None]:

from sklearn.feature_selection import mutual_info_classif

# Example: Create a new feature
# A tenure of 0 would make the plain division produce inf (or NaN for 0/0),
# which scikit-learn estimators reject. Zero tenure is therefore masked before
# dividing and the rate for those rows is set to 0.
# NOTE(review): 0 is a placeholder for "no tenure yet" — confirm it suits the data.
df['interaction_rate'] = (
    df['num_interactions']
    .div(df['tenure'].replace(0, float('nan')))
    .fillna(0.0)
)

# Feature importance
X = df.drop('churn', axis=1)
y = df['churn']
# mutual_info_classif is randomized; the seed (same value as the train/test
# split uses) makes the scores, and hence the selected columns, repeatable.
feature_importance = mutual_info_classif(X, y, random_state=42)
important_features = X.columns[feature_importance > 0.01]  # Choose a threshold based on analysis

## 4. Model Selection and Training

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Split data: 30% held out for testing, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression
# max_iter is raised from the default of 100 because only 'age' and 'income'
# were standardized; the remaining raw-scale columns can slow convergence.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Random Forest
# Seeded like the split above so repeated runs give the same forest and metrics.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

## 5. Model Evaluation and Fine-Tuning

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Evaluate model
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
# ROC-AUC measures how well the model *ranks* samples, so it needs a continuous
# score: the predicted probability of the positive class (column 1 of
# predict_proba). Passing hard 0/1 predictions collapses the curve to a single
# operating point and misreports the score.
# NOTE(review): assumes 'churn' is binary — confirm against the data.
print("ROC-AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# Hyperparameter tuning: 5-fold cross-validated search over forest size and depth
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

## 6. Implementing the Model in a Real-World System

In [None]:
import joblib
from flask import Flask, request, jsonify

# Save the model
joblib.dump(grid_search.best_estimator_, 'churn_model.pkl')

# Load the persisted model at module level so it is also available when the
# app is imported by a WSGI server, not only when this file runs as a script.
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load('churn_model.pkl')

# Flask app for serving predictions
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Return the churn prediction for one customer.

    Expects a JSON object mapping feature names to values and responds with
    ``{"churn": <label>}``. Responds with HTTP 400 when the body is not a JSON
    object or when a feature the model was trained on is missing.

    NOTE(review): the model was trained on a label-encoded 'gender', on
    standardized 'age'/'income' and on the derived 'interaction_rate' column;
    none of that is recomputed here, so callers must send already-transformed
    values.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'expected a JSON object of feature values'}), 400

    features = pd.DataFrame([payload])

    # Estimators fitted on a DataFrame remember their column names; use them to
    # reject incomplete payloads and to put the columns in training order.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None:
        missing = [str(name) for name in expected if name not in features.columns]
        if missing:
            return jsonify({'error': 'missing features', 'missing': missing}), 400
        features = features[list(expected)]

    prediction = model.predict(features)
    # tolist() converts the NumPy scalar to a native Python value; jsonify
    # cannot serialize numpy.int64 and friends.
    return jsonify({'churn': prediction.tolist()[0]})

if __name__ == '__main__':
    # debug=True would enable the interactive debugger, which allows arbitrary
    # code execution and must not be exposed by a deployed service.
    app.run(debug=False)