In [24]:
import pandas as pd
import openpyxl
import os
import tslearn

file_path = os.path.join("..", "data", "2021 Full Merch Coords and Fuel Price with Flags.csv")
data = pd.read_csv(file_path, low_memory=False)

In [9]:
data.columns

Index(['Transaction Date', 'REG_NUM', 'Merchant Name', 'Purchase Category',
       'No. of Litres', 'Transaction Amount', 'Make', 'Model',
       'Make and Model', 'Site', 'District', 'Category', 'Rental',
       'Merchant Lat', 'Merchant Long', 'Site Lat', 'Site Long', 'Fuel Type',
       'Actual Fuel Price', 'Actual Fuel Price Inland',
       'Estimated Price Per Litre', 'Month Name', 'Weekday Name', 'YearMonth',
       'AggClusterLabels', 'TransKmeansCluster', 'Average_Category_Amount',
       'Transaction_Amount_Flag', 'Days_Between_Transactions',
       'Transaction_Frequency_Flag', 'Fuel_Price_Flag', 'Flag', 'Reason'],
      dtype='object')

In [25]:
data['Reason'].value_counts()

Reason
PPL                         108517
Clear                        55274
PPL + Frequency              25199
PPL + Amount                  4077
Frequency                     2464
PPL + Frequency + Amount      1338
Amount                         255
Frequency + Amount              52
Name: count, dtype: int64

# Create the train and test sets

In [26]:
# Select the columns from the data that we want to use
data = data[['Transaction Amount', 'No. of Litres', 'District', 'Make and Model', 'Fuel Type', 'Flag', 'Category']]

# Example of replacing spaces with underscores in the 'Make' column
data['Make and Model'] = data['Make and Model'].str.replace(' ', '_')

In [27]:
data['Flag'].value_counts()

Flag
True     141902
False     55274
Name: count, dtype: int64

# Linear Support Vector Machine

In [28]:
# 1. Import Libraries
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# 2. Data Preprocessing
# Define categorical and numerical features
categorical_features = ['District', 'Make and Model', 'Fuel Type', 'Category']
numerical_features = ['Transaction Amount', 'No. of Litres']

# Define transformations for categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split data into features and target variable
X = data.drop(['Flag'], axis=1)
y = data['Flag']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define LinearSVC with specific parameters
linear_svc = LinearSVC(max_iter=10000, dual="auto", class_weight='balanced', random_state=1)

# Create a calibrated classifier with LinearSVC
calibrated_svc = CalibratedClassifierCV(estimator=linear_svc, method='sigmoid', cv=5)

# Create a pipeline with the calibrated classifier
pipeline_svm = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', calibrated_svc)
])

# Fit the model
pipeline_svm.fit(X_train, y_train)

# 4. Model Evaluation
# Predictions
y_pred = pipeline_svm.predict(X_test)


In [38]:

# Evaluation
print("Classification Report for Linear SVM:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report for Linear SVM:
               precision    recall  f1-score   support

       False       0.95      0.97      0.96     10900
        True       0.99      0.98      0.98     28536

    accuracy                           0.98     39436
   macro avg       0.97      0.98      0.97     39436
weighted avg       0.98      0.98      0.98     39436

Confusion Matrix:
 [[10574   326]
 [  528 28008]]


# Naive Bayes Classifier

In [30]:
# 1. Import Libraries for Naive Bayes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import TransformerMixin

# Define a custom transformer to convert sparse matrix to dense
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.toarray()


# Preprocessors for numerical and categorical features for Naive Bayes
numeric_transformer_nb = Pipeline(steps=[
    ('scaler_nb', StandardScaler())
])

categorical_transformer_nb = Pipeline(steps=[
    ('onehot_nb', OneHotEncoder(handle_unknown='ignore')),
    ('to_dense_nb', DenseTransformer())  # Convert to dense
])

# Combine preprocessors for Naive Bayes
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('num_nb', numeric_transformer_nb, numerical_features),
        ('cat_nb', categorical_transformer_nb, categorical_features)
    ])

# Split data into features and target variable for Naive Bayes
X_nb = data.drop('Flag', axis=1)
y_nb = data['Flag']

# Split data into training and testing sets for Naive Bayes
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_nb, y_nb, test_size=0.2, random_state=1)

# 3. Model Training for Naive Bayes
# Create a pipeline for Naive Bayes - switch out to use CategoricalNB for categorical features (more efficient)
pipeline_nb = make_pipeline(
    preprocessor_nb,
    # GaussianNB()  # Using GaussianNB for numerical features (swap out for CategoricalNB since categorical features)
)

# Fit the model for Naive Bayes
pipeline_nb.fit(X_train_nb, y_train_nb)

# 4. Model Evaluation for Naive Bayes
# Predictions for Naive Bayes
y_pred_nb = pipeline_nb.predict(X_test_nb)

# Evaluation metrics for Naive Bayes
print("Classification Report for Naive Bayes:\n", classification_report(y_test_nb, y_pred_nb))
print("Confusion Matrix for Naive Bayes:\n", confusion_matrix(y_test_nb, y_pred_nb))


Classification Report for Naive Bayes:
               precision    recall  f1-score   support

       False       0.95      0.97      0.96     10900
        True       0.99      0.98      0.98     28536

    accuracy                           0.98     39436
   macro avg       0.97      0.98      0.97     39436
weighted avg       0.98      0.98      0.98     39436

Confusion Matrix for Naive Bayes:
 [[10563   337]
 [  531 28005]]


# XGBoost Classifier

In [31]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Define transformations for categorical and numerical features
preprocessor_xgb = ColumnTransformer(
    transformers=[
        ('num_xgb', StandardScaler(), numerical_features),
        ('cat_xgb', OneHotEncoder(), categorical_features)
    ])

# Split data into features and target variable
X_xgb = data.drop('Flag', axis=1)
y_xgb = data['Flag']

# Split data into training and testing sets
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X_xgb, y_xgb, test_size=0.2, random_state=1)

# Calculate the scale_pos_weight value
scale_pos_weight = sum(y_train_xgb == 0) / sum(y_test_xgb == 1)

# 3. Model Training and Hyperparameter Tuning
# Create a pipeline
pipeline_xgb = Pipeline([
    ('preprocessor_xgb', preprocessor_xgb),
    ('classifier_xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight))
])

# Optional: Define parameters for GridSearchCV
param_grid_xgb = {
    'classifier_xgb__n_estimators': [50, 100, 150, 200, 300],  # Number of trees
    'classifier_xgb__learning_rate': [0.001, 0.01, 0.1, 0.5],  # Learning rate
}

# Optional: Create GridSearchCV object
grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, verbose=1, n_jobs=-1)

# Fit the model (use grid_search_xgb.fit(X_train_xgb, y_train_xgb) if using GridSearchCV)
pipeline_xgb.fit(X_train_xgb, y_train_xgb)

# 4. Model Evaluation
# Predictions (use grid_search_xgb.predict(X_test_xgb) if using GridSearchCV)
y_pred_xgb = pipeline_xgb.predict(X_test_xgb)

# Evaluation metrics
print("Classification Report for XGBoost:\n", classification_report(y_test_xgb, y_pred_xgb))
print("Confusion Matrix for XGBoost:\n", confusion_matrix(y_test_xgb, y_pred_xgb))

Classification Report for XGBoost:
               precision    recall  f1-score   support

       False       0.96      0.97      0.96     10900
        True       0.99      0.98      0.99     28536

    accuracy                           0.98     39436
   macro avg       0.97      0.98      0.98     39436
weighted avg       0.98      0.98      0.98     39436

Confusion Matrix for XGBoost:
 [[10565   335]
 [  454 28082]]


# Logistic Regression

In [32]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Define transformations for categorical and numerical features
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num_lr', StandardScaler(), numerical_features),
        ('cat_lr', OneHotEncoder(), categorical_features)
    ])

# Split data into features and target variable
X_lr = data.drop('Flag', axis=1)
y_lr = data['Flag']

# Split data into training and testing sets
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_lr, test_size=0.2, random_state=1)

# 3. Model Training
# Create a pipeline with an increased max_iter
pipeline_lr = Pipeline([
    ('preprocessor_lr', preprocessor_lr),
    ('classifier_lr', LogisticRegression(max_iter=1000, class_weight='balanced'))  # Increased max_iter
])

# Fit the model
pipeline_lr.fit(X_train_lr, y_train_lr)

# 4. Model Evaluation
# Predictions
y_pred_lr = pipeline_lr.predict(X_test_lr)

# Evaluation metrics
print("Classification Report for Logistic Regression:\n", classification_report(y_test_lr, y_pred_lr))
print("Confusion Matrix for Logistic Regression:\n", confusion_matrix(y_test_lr, y_pred_lr))


Classification Report for Logistic Regression:
               precision    recall  f1-score   support

       False       0.95      0.97      0.96     10900
        True       0.99      0.98      0.99     28536

    accuracy                           0.98     39436
   macro avg       0.97      0.98      0.97     39436
weighted avg       0.98      0.98      0.98     39436

Confusion Matrix for Logistic Regression:
 [[10575   325]
 [  522 28014]]


# Generating figures

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [33]:
def plot_confusion_matrix(y_true, y_pred, model_name, dpi=300):
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='viridis')
    #plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted Label', size=13)
    plt.ylabel('True Label', size=13)
    plt.savefig(f'../final_plots/modelling/{model_name}_confusion_matrix.pdf', format='pdf', dpi=dpi)
    plt.close()

def plot_roc_curve(y_true, y_scores, model_name, dpi=300):
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', size=13)
    plt.ylabel('True Positive Rate', size=13)
    #plt.title(f'ROC Curve for {model_name}')
    plt.legend(loc="lower right")
    plt.savefig(f'../final_plots/modelling/{model_name}_roc_curve.pdf', format='pdf', dpi=dpi)
    plt.close()



In [34]:
y_scores_lr = pipeline_lr.predict_proba(X_test_lr)[:, 1]
plot_confusion_matrix(y_test, y_pred_lr, "Logistic_Regression")
plot_roc_curve(y_test, y_scores_lr, "Logistic_Regression")

In [37]:
y_scores_nb = pipeline_nb.predict_proba(X_test_nb)[:, 1]
plot_confusion_matrix(y_test, y_pred_nb, "Naive Bayes")
plot_roc_curve(y_test, y_scores_nb, "Naive Bayes")

In [35]:
y_scores_svm = pipeline_svm.predict_proba(X_test)[:, 1]
plot_confusion_matrix(y_test, y_pred, "Linear_SVM")
plot_roc_curve(y_test, y_scores_svm, "Linear_SVM")

In [36]:
y_scores_xgb = pipeline_xgb.predict_proba(X_test_xgb)[:, 1]
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
plot_roc_curve(y_test, y_scores_xgb, "XGBoost")