# Stage 0: Environment setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides pandas / scikit-learn deprecation and convergence
# warnings, so problems they would report will not be visible in the outputs below.
warnings.filterwarnings('ignore')

# Task 1:
Define the expected structure of the input dataset, clearly identifying feature columns and the binary target label, and implement logic to load training and validation data.



In [None]:
# Load the training and validation CSVs from their Google Drive share links.


def _drive_download_url(share_url):
    """Turn a Drive ".../file/d/<id>/view" share link into a direct-download URL."""
    # The file id is the second-to-last path segment of the share link.
    file_id = share_url.split("/")[-2]
    return "https://drive.google.com/uc?export=download&id=" + file_id


share_url_train = "https://drive.google.com/file/d/15TxsVm_Vd2Hs70tZc-WB6AnsyKlB1kx8/view"
share_url_val = "https://drive.google.com/file/d/1AwyrhlWVV4B9Xt98vtEhxh4jnss7NHfa/view"

download_url_train = _drive_download_url(share_url_train)
download_url_val = _drive_download_url(share_url_val)

# Training data is fetched first, then validation data.
df_train = pd.read_csv(download_url_train)
df_val = pd.read_csv(download_url_val)

# Quick sanity check of what was loaded.
print(f"\nTraining Data Shape: {df_train.shape}")
print(f"Validation Data Shape: {df_val.shape}")
print(f"\nTraining Data Columns: {list(df_train.columns)}")


Training Data Shape: (640, 5)
Validation Data Shape: (160, 5)

Training Data Columns: ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'label']


In [None]:
# Preview the first rows of the training frame (DataFrame.head defaults to five rows).
df_train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,label
0,0.51407,5.142371,0.185182,5,0
1,2.384125,5.212283,0.644438,0,0
2,-0.26024,5.986589,0.521944,0,0
3,-1.983636,3.219447,0.920631,2,0
4,-1.130475,7.367578,0.068803,6,0


In [None]:
# Show the dtype pandas inferred for each column of the training frame.
df_train.dtypes

Unnamed: 0,0
feature_1,float64
feature_2,float64
feature_3,float64
feature_4,int64
label,int64


# Task 2:
Perform an initial data inspection to understand feature ranges, class balance, and potential data quality issues that may affect model training.

In [None]:
# Understand feature ranges, class balance, and data quality issues

# Per-column count of missing entries in each split.
train_missing_counts = df_train.isna().sum()
val_missing_counts = df_val.isna().sum()

print("Data Inspection and Quality Analysis")
print("\nMissing Values:")
# NOTE: the "Traing" spelling is kept so the printed text matches the recorded output.
print(f"\nTraing set:\n{train_missing_counts}")
print(f"\nValidation set:\n{val_missing_counts}")

Data Inspection and Quality Analysis

Missing Values:

Traing set:
feature_1    0
feature_2    0
feature_3    0
feature_4    0
label        0
dtype: int64

Validation set:
feature_1    0
feature_2    0
feature_3    0
feature_4    0
label        0
dtype: int64


In [None]:
# Count rows that are exact duplicates of an earlier row, per split.
n_duplicates_train = df_train.duplicated().sum()
n_duplicates_val = df_val.duplicated().sum()

print("\nDuplicate Messages:")
print(f"\nTraining duplicates: {n_duplicates_train}")
print(f"\nValidation duplicates: {n_duplicates_val}")


Duplicate Messages:

Training duplicates: 0

Validation duplicates: 0


In [None]:
# Name the target first, then treat every other training column as a feature.
target_col = 'label'
feature_cols = [name for name in df_train.columns if name != target_col]

print("Identified Columns")
print(f"Feature columns: {feature_cols}")
print(f"Target column: '{target_col}'")

Identified Columns
Feature columns: ['feature_1', 'feature_2', 'feature_3', 'feature_4']
Target column: 'label'


In [None]:
# Class balance


def _report_class_balance(labels, header):
    """Print the class distribution of one split's label column.

    Parameters
    ----------
    labels : pd.Series
        Target column of a single data split; classes are compared against 0 and 1.
    header : str
        Line printed before the distribution (e.g. "Training set:").

    Returns
    -------
    tuple
        ``(class_dist, ratio)``: the index-sorted ``value_counts`` Series and
        count(class 1) / count(class 0).
    """
    class_dist = labels.value_counts().sort_index()

    # Count each class directly so that a missing class yields 0 instead of a
    # KeyError from indexing the value_counts result.
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())

    if n_positive == 0:
        ratio = 0
    elif n_negative == 0:
        # Only positives present: the 1/0 ratio is unbounded rather than an error.
        ratio = float("inf")
    else:
        ratio = n_positive / n_negative

    print(header)
    print(class_dist)
    print(f"Class ratio (1/0): {ratio:.4f}")
    if n_positive:
        print(f"Percentage of class 1: {n_positive / len(labels) * 100:.2f}%")
    else:
        print("No class 1")
    return class_dist, ratio


print("Class Balance")
train_class_dist, train_ratio = _report_class_balance(df_train[target_col], "Training set:")
val_class_dist, val_ratio = _report_class_balance(df_val[target_col], "\nValidation set:")

Class Balance
Training set:
label
0    452
1    188
Name: count, dtype: int64
Class ratio (1/0): 0.4159
Percentage of class 1: 29.38%

Validation set:
label
0    113
1     47
Name: count, dtype: int64
Class ratio (1/0): 0.4159
Percentage of class 1: 29.38%


In [None]:
# Descriptive statistics of every feature, split by target class
print("Feature Statistics by Class")
grouped_by_label = df_train.groupby(target_col)
for feature in feature_cols:
    print(f"\n{feature}:")
    print(grouped_by_label[feature].describe())

# Flag values lying outside the 1.5 * IQR fences (IQR method)
print("\nOutlier Detection (IQR Method)")
n_rows = len(df_train)
for feature in feature_cols:
    values = df_train[feature]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    # A value is an outlier when it falls below the lower or above the upper fence.
    is_outlier = (values < q1 - fence) | (values > q3 + fence)
    n_outliers = int(is_outlier.sum())
    print(f"{feature}: {n_outliers} outliers ({n_outliers / n_rows * 100:.2f}%)")

Feature Statistics by Class

feature_1:
       count      mean       std       min       25%       50%       75%  \
label                                                                      
0      452.0 -0.214098  0.915073 -3.204401 -0.812668 -0.217525  0.463115   
1      188.0  0.533266  0.842562 -2.395572 -0.117387  0.474292  1.104832   

            max  
label            
0      2.679910  
1      2.465325  

feature_2:
       count      mean       std       min       25%       50%       75%  \
label                                                                      
0      452.0  5.609801  1.778778  0.333652  4.376415  5.620639  6.759124   
1      188.0  3.499046  1.487541 -1.635338  2.526186  3.600933  4.507545   

             max  
label             
0      10.586332  
1       6.675274  

feature_3:
       count      mean       std       min       25%       50%       75%  \
label                                                                      
0      452.0  0.454609  0.

# Task 3:
Implement a preprocessing pipeline that prepares raw features for modeling, including handling missing values and transforming features where necessary.

In [None]:
# Handling missing values and preparation of features for modeling

# Work on copies so the raw frames stay untouched and this cell can be re-run.
df_train_clean = df_train.copy()
df_val_clean = df_val.copy()

# Imputation values are learned from the training split only, so no
# validation information leaks into preprocessing.
train_medians = df_train_clean[feature_cols].median()

n_missing_train = int(df_train_clean[feature_cols].isnull().sum().sum())
n_missing_val = int(df_val_clean[feature_cols].isnull().sum().sum())

if n_missing_train or n_missing_val:
    # Fill BOTH splits whenever either contains gaps: a column can be complete
    # in training yet have missing entries in validation.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which does not reliably modify the parent frame.
    df_train_clean[feature_cols] = df_train_clean[feature_cols].fillna(train_medians)
    df_val_clean[feature_cols] = df_val_clean[feature_cols].fillna(train_medians)
    print(
        f"Imputed {n_missing_train} training and {n_missing_val} validation "
        "missing values with training medians"
    )
else:
    print("No missing values found")

No missing values found


In [None]:
# Split each cleaned frame into a feature matrix and a target vector
X_train, y_train = df_train_clean[feature_cols], df_train_clean[target_col]
X_val, y_val = df_val_clean[feature_cols], df_val_clean[target_col]

print("\nDataset Shapes Before Scaling")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

# Standardise the features: the scaler is fitted on the training split and
# then applied unchanged to the validation split. Results are wrapped back
# into DataFrames so column names and row indices are preserved.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_cols,
    index=X_train.index,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val),
    columns=feature_cols,
    index=X_val.index,
)

print("\nScaled feature statistics (training set):")
print(X_train_scaled.describe())



Dataset Shapes Before Scaling
X_train: (640, 4), y_train: (640,)
X_val: (160, 4), y_val: (160,)

Scaled feature statistics (training set):
          feature_1     feature_2     feature_3     feature_4
count  6.400000e+02  6.400000e+02  6.400000e+02  6.400000e+02
mean  -2.220446e-17 -1.776357e-16 -2.012279e-16 -1.332268e-16
std    1.000782e+00  1.000782e+00  1.000782e+00  1.000782e+00
min   -3.358594e+00 -3.398378e+00 -1.611731e+00 -1.509281e+00
25%   -6.499410e-01 -6.533778e-01 -9.080694e-01 -8.363868e-01
50%   -7.972491e-03  4.580766e-03 -1.400543e-02 -1.634923e-01
75%    6.438613e-01  6.661838e-01  8.510156e-01  8.458493e-01
max    2.798413e+00  2.870784e+00  1.753837e+00  1.518744e+00


# Task 4:
Train an initial binary classification model that serves as a baseline and record its performance on the validation set.

In [None]:
# Baseline binary classifier: logistic regression on the standardised features

lr_params = {"max_iter": 1000, "C": 1.0, "solver": "lbfgs", "random_state": 42}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train_scaled, y_train)

# Hard class predictions for both splits, plus P(class 1) on validation
y_train_pred_lr = lr_model.predict(X_train_scaled)
y_val_pred_lr = lr_model.predict(X_val_scaled)
y_val_pred_proba_lr = lr_model.predict_proba(X_val_scaled)[:, 1]

# Accuracy on both splits; the remaining metrics are validation-only
lr_train_accuracy = accuracy_score(y_train, y_train_pred_lr)
lr_val_accuracy = accuracy_score(y_val, y_val_pred_lr)
lr_precision = precision_score(y_val, y_val_pred_lr, zero_division=0)
lr_recall = recall_score(y_val, y_val_pred_lr, zero_division=0)
lr_f1 = f1_score(y_val, y_val_pred_lr, zero_division=0)
if len(np.unique(y_val)) > 1:
    lr_roc_auc = roc_auc_score(y_val, y_val_pred_proba_lr)
else:
    # Only one class present in y_val: skip the ROC-AUC call and record 0
    lr_roc_auc = 0

print("Logistic Regression Performance Metrics")
# Captions carry their own trailing padding so the values line up as before.
for caption, score in (
    ("Training Accuracy: ", lr_train_accuracy),
    ("Validation Accuracy: ", lr_val_accuracy),
    ("Precision: ", lr_precision),
    ("Recall:    ", lr_recall),
    ("F1-Score:  ", lr_f1),
    ("ROC-AUC:   ", lr_roc_auc),
):
    print(f"{caption}{score:.4f}")

print("\nFeature Coefficients")
# One coefficient per feature, listed from most positive to most negative
coefficient_table = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': lr_model.coef_[0],
})
feature_importance_lr = coefficient_table.sort_values('Coefficient', ascending=False)
print(feature_importance_lr)

print("\nClassification Report")
print(classification_report(y_val, y_val_pred_lr, zero_division=0))

print("\nConfusion Matrix")
cm_lr = confusion_matrix(y_val, y_val_pred_lr)
print(cm_lr)

Logistic Regression Performance Metrics
Training Accuracy: 0.9953
Validation Accuracy: 0.9875
Precision: 0.9592
Recall:    1.0000
F1-Score:  0.9792
ROC-AUC:   1.0000

Feature Coefficients
     Feature  Coefficient
3  feature_4     3.649963
0  feature_1     3.100477
2  feature_3     1.534360
1  feature_2    -4.665562

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       113
           1       0.96      1.00      0.98        47

    accuracy                           0.99       160
   macro avg       0.98      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160


Confusion Matrix
[[111   2]
 [  0  47]]


# Task 5:
Train at least one additional binary classification model with a different learning approach or configuration.

In [None]:
from sklearn.svm import SVC

# Second model: support vector classifier with a linear kernel.
# probability=True makes predict_proba available on the fitted model.
svm_params = dict(kernel='linear', C=1.0, probability=True, random_state=42)
svm_model = SVC(**svm_params)

# Left as the last expression so the fitted estimator is displayed.
svm_model.fit(X_train_scaled, y_train)

In [None]:
# Hard class predictions of the SVM on the training and validation splits
y_train_pred_svm, y_val_pred_svm = (
    svm_model.predict(split) for split in (X_train_scaled, X_val_scaled)
)

# Column 1 of predict_proba is taken as the class-1 probability
svm_val_proba = svm_model.predict_proba(X_val_scaled)
y_val_pred_proba_svm = svm_val_proba[:, 1]

In [None]:
# Accuracy of the SVM on both splits
svm_train_accuracy = accuracy_score(y_train, y_train_pred_svm)
svm_val_accuracy = accuracy_score(y_val, y_val_pred_svm)

# Precision, recall and F1 on the validation split, computed in that order
svm_precision, svm_recall, svm_f1 = (
    metric(y_val, y_val_pred_svm, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

if len(np.unique(y_val)) > 1:
    svm_roc_auc = roc_auc_score(y_val, y_val_pred_proba_svm)
else:
    # Only one class present in y_val: skip the ROC-AUC call and record 0
    svm_roc_auc = 0


In [None]:
print("Support Vector Machine Performance Metrics")
# Captions carry their own trailing padding so the values line up as before.
for caption, score in (
    ("Training Accuracy: ", svm_train_accuracy),
    ("Validation Accuracy: ", svm_val_accuracy),
    ("Precision: ", svm_precision),
    ("Recall:    ", svm_recall),
    ("F1-Score:  ", svm_f1),
    ("ROC-AUC:   ", svm_roc_auc),
):
    print(f"{caption}{score:.4f}")

# Per-class precision / recall / F1 on the validation split
print("\nClassification Report")
svm_report_text = classification_report(y_val, y_val_pred_svm, zero_division=0)
print(svm_report_text)

print("\nConfusion Matrix")
cm_svm = confusion_matrix(y_val, y_val_pred_svm)
print(cm_svm)


Support Vector Machine Performance Metrics
Training Accuracy: 0.9891
Validation Accuracy: 0.9688
Precision: 0.9038
Recall:    1.0000
F1-Score:  0.9495
ROC-AUC:   1.0000

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       113
           1       0.90      1.00      0.95        47

    accuracy                           0.97       160
   macro avg       0.95      0.98      0.96       160
weighted avg       0.97      0.97      0.97       160


Confusion Matrix
[[108   5]
 [  0  47]]


# Task 6:
Implement a consistent evaluation process using appropriate binary classification metrics and compute results separately for each trained model.

In [None]:
# Side-by-side metric table: one row per model, one column per metric.
# Each tuple is ordered (logistic regression, SVM) to match the "Model" column.
metric_columns = {
    "Train Accuracy": (lr_train_accuracy, svm_train_accuracy),
    "Validation Accuracy": (lr_val_accuracy, svm_val_accuracy),
    "Precision": (lr_precision, svm_precision),
    "Recall": (lr_recall, svm_recall),
    "F1-score": (lr_f1, svm_f1),
    "ROC-AUC": (lr_roc_auc, svm_roc_auc),
}

comparison_df = pd.DataFrame({"Model": ["Logistic Regression", "SVM (Linear)"]})
for metric_name, scores in metric_columns.items():
    comparison_df[metric_name] = list(scores)

comparison_df

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Precision,Recall,F1-score,ROC-AUC
0,Logistic Regression,0.995313,0.9875,0.959184,1.0,0.979167,1.0
1,SVM (Linear),0.989062,0.96875,0.903846,1.0,0.949495,1.0


# Task 7:
Compare model performances and clearly decide which model should be selected based on evaluation results, not intuition.

### Model Selection

Based on the validation results, Logistic Regression was selected as the final model.
It achieved higher validation accuracy (0.9875), precision (0.9592), and F1-score (0.9792)
compared to the SVM model, while both models achieved perfect recall and ROC-AUC.
Since precision and F1-score are critical in spam detection to minimize false positives,
Logistic Regression was chosen based strictly on evaluation metrics rather than intuition.

In [None]:
# Rank the models by F1-score (highest first) and keep the top row
ranked_models = comparison_df.sort_values(by="F1-score", ascending=False)
best_model = ranked_models.iloc[0]

print("Selected Model Based on Validation Metrics:")
print(best_model)

Selected Model Based on Validation Metrics:
Model                  Logistic Regression
Train Accuracy                    0.995313
Validation Accuracy                 0.9875
Precision                         0.959184
Recall                                 1.0
F1-score                          0.979167
ROC-AUC                                1.0
Name: 0, dtype: object


# Task 8:
Save the selected model together with all preprocessing steps so that predictions can be made later without retraining.

In [None]:
# Missing-value handling is re-expressed as a fitted SimpleImputer so that the
# training medians are stored inside the saved artifact.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)

# The pipeline reuses the scaler and logistic regression fitted in earlier
# cells; nothing is retrained here.
# NOTE(review): those steps were fitted individually rather than via
# pipeline.fit — confirm that is the intended way to assemble the artifact.
inference_steps = [
    ('imputer', imputer),
    ('scaler', scaler),
    ('model', lr_model),
]
pipeline = Pipeline(inference_steps)

# Left as the last expression so the written file name is displayed.
joblib.dump(pipeline, 'logistic_model_pipeline.pkl')

['logistic_model_pipeline.pkl']

# Task 9:
Reload the saved model and verify that it produces valid predictions on unseen validation data.

In [None]:
# Reload the persisted pipeline and score it on the raw (unscaled) validation data.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
pipeline = joblib.load('logistic_model_pipeline.pkl')

# Select inputs through the same column lists used for training, so the
# pipeline receives exactly the training features in the training order.
X = df_val[feature_cols]
y = df_val[target_col]
y_pred = pipeline.predict(X)

# Calculate metrics
pipeline_accuracy = accuracy_score(y, y_pred)
pipeline_precision = precision_score(y, y_pred, zero_division=0)
pipeline_recall = recall_score(y, y_pred, zero_division=0)
pipeline_f1 = f1_score(y, y_pred, zero_division=0)

print("Loaded Model Performance Metrics")
print(f"Validation Accuracy: {pipeline_accuracy:.4f}")
print(f"Precision: {pipeline_precision:.4f}")
print(f"Recall:    {pipeline_recall:.4f}")
print(f"F1-Score:  {pipeline_f1:.4f}")
print()
print("Original Model Performance Metrics")
print(f"Validation Accuracy: {lr_val_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1-Score:  {lr_f1:.4f}")

# The reloaded pipeline must reproduce the in-memory model's predictions
# exactly; check this programmatically instead of comparing printed scores by eye.
assert np.array_equal(y_pred, y_val_pred_lr), (
    "Reloaded pipeline predictions differ from the original model's predictions"
)

Loaded Model Performance Metrics
Validation Accuracy: 0.9875
Precision: 0.9592
Recall:    1.0000
F1-Score:  0.9792

Original Model Performance Metrics
Validation Accuracy: 0.9875
Precision: 0.9592
Recall:    1.0000
F1-Score:  0.9792
