In [7]:
import pandas as pd
import openpyxl
import os
import tslearn

# Read the flagged fuel-transaction export stored under ../data (relative to this notebook).
# low_memory=False asks pandas to parse the file in a single pass instead of in chunks.
file_path = os.path.join(*("..", "data", "Final Transactions With Flags.csv"))
data_in = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

In [8]:
# Inspect the column names available in the loaded transactions frame
data_in.columns

Index(['Transaction Date', 'REG_NUM', 'Merchant Name', 'No. of Litres',
       'Transaction Amount', 'VEHICLE MAKE', 'MODEL DERIVATIVE', 'DEPARTMENT',
       'RATE CARD CATEGORY', 'Site', 'District', 'Site Lat', 'Site Long',
       'Merchant Lat', 'Merchant Long', 'Fuel Type',
       'Estimated Price Per Litre', 'Coastal Petrol', 'Inland Petrol',
       'Coastal Diesel', 'Inland Diesel', 'Month Name', 'Weekday Name',
       'Average_Category_Amount', 'Transaction_Amount_Flag',
       'Days_Between_Transactions', 'Transaction_Frequency_Flag',
       'Coastal Diesel Adjusted', 'Price Difference', 'Fuel_Price_Flag',
       'Number_of_Flags'],
      dtype='object')

In [9]:
# Count how many transactions carry each number of flags
# ('Number_of_Flags' is the column used as the modelling target further down)
data_in['Number_of_Flags'].value_counts()

Number_of_Flags
0    140110
1     33707
2      2103
3        13
Name: count, dtype: int64

# Select the modelling columns (the train and test sets are created per model below)

In [10]:
# Restrict the frame to the model inputs plus the target column ('Number_of_Flags')
data = data_in[[
    'Transaction Amount',
    'No. of Litres',
    'District',
    'VEHICLE MAKE',
    'Fuel Type',
    'Number_of_Flags',
    'RATE CARD CATEGORY',
]]

# Linear Support Vector Machine

In [11]:
# 1. Import Libraries
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

# 2. Data Preprocessing
# Define categorical and numerical features
categorical_features = ['District', 'VEHICLE MAKE', 'Fuel Type', 'RATE CARD CATEGORY']
numerical_features = ['Transaction Amount', 'No. of Litres']

# Define transformations for categorical and numerical features.
# handle_unknown='ignore' encodes a category that was not present when the encoder
# was fitted as an all-zero row; the default ('error') would raise at predict time
# if the test split contains a value that never occurs in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into features and target variable
X = data.drop(columns=['Number_of_Flags'])
y = data['Number_of_Flags']

# Split data into training and testing sets using stratified sampling
# (a single 80/20 split with a fixed seed so the split is reproducible)
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_index, test_index = next(splitter.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# 3. Model Training
# Define LinearSVC with specific parameters
linear_svc = LinearSVC(max_iter=10000, dual="auto", class_weight='balanced', random_state=1)

# Create a calibrated classifier with LinearSVC (sigmoid calibration, 5-fold CV)
calibrated_svc = CalibratedClassifierCV(estimator=linear_svc, method='sigmoid', cv=5)

# Create a pipeline with the calibrated classifier
pipeline_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', calibrated_svc)
])

# Fit the model
pipeline_svm.fit(X_train, y_train)

# 4. Model Evaluation
# Predictions
y_pred = pipeline_svm.predict(X_test)


In [12]:

# Evaluation
# zero_division=0 reports 0.0 for classes the model never predicts (the same value the
# default setting prints) without emitting an UndefinedMetricWarning for each of them.
print("Classification Report for Linear SVM:\n", classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report for Linear SVM:
               precision    recall  f1-score   support

           0       0.84      0.96      0.90     28022
           1       0.56      0.25      0.34      6741
           2       0.00      0.00      0.00       421
           3       0.00      0.00      0.00         3

    accuracy                           0.81     35187
   macro avg       0.35      0.30      0.31     35187
weighted avg       0.77      0.81      0.78     35187

Confusion Matrix:
 [[26961  1061     0     0]
 [ 5071  1670     0     0]
 [  177   244     0     0]
 [    1     2     0     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes Classifier

In [13]:
# 1. Import Libraries for Naive Bayes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import TransformerMixin

# Define a custom transformer to convert sparse matrix to dense
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Input that is already dense (i.e. has no ``toarray`` method) is passed
    through unchanged instead of raising ``AttributeError``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when ``X`` exposes ``toarray``; otherwise return ``X`` as is."""
        if hasattr(X, "toarray"):
            return X.toarray()
        return X


# Naive Bayes preprocessing: standardise the numerical columns; one-hot encode the
# categorical columns and convert the encoded matrix to a dense array.
numeric_transformer_nb = Pipeline([('scaler_nb', StandardScaler())])

categorical_transformer_nb = Pipeline([
    ('onehot_nb', OneHotEncoder(handle_unknown='ignore')),
    ('to_dense_nb', DenseTransformer()),  # Convert to dense
])

# One ColumnTransformer that routes each feature group to its own preprocessing pipeline
preprocessor_nb = ColumnTransformer([
    ('num_nb', numeric_transformer_nb, numerical_features),
    ('cat_nb', categorical_transformer_nb, categorical_features),
])

# Stratified 80/20 split with a fixed seed
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_index, test_index = next(splitter.split(X, y))
X_train_nb, y_train_nb = X.iloc[train_index], y.iloc[train_index]
X_test_nb, y_test_nb = X.iloc[test_index], y.iloc[test_index]

# 3. Model Training for Naive Bayes
# Preprocessing followed by a Gaussian Naive Bayes classifier.
# TODO: consider CategoricalNB for the categorical features (noted as more efficient).
pipeline_nb = make_pipeline(preprocessor_nb, GaussianNB())
pipeline_nb.fit(X_train_nb, y_train_nb)

# 4. Model Evaluation for Naive Bayes
# Predict on the held-out split, then print the per-class metrics and confusion matrix
y_pred_nb = pipeline_nb.predict(X_test_nb)

print("Classification Report for Naive Bayes:\n", classification_report(y_test_nb, y_pred_nb))
print("Confusion Matrix for Naive Bayes:\n", confusion_matrix(y_test_nb, y_pred_nb))


Classification Report for Naive Bayes:
               precision    recall  f1-score   support

           0       0.94      0.14      0.24     28022
           1       0.26      0.02      0.03      6741
           2       0.01      0.56      0.02       421
           3       0.00      1.00      0.00         3

    accuracy                           0.12     35187
   macro avg       0.30      0.43      0.07     35187
weighted avg       0.80      0.12      0.20     35187

Confusion Matrix for Naive Bayes:
 [[ 3893   345 19376  4408]
 [  252   120  4616  1753]
 [    4     3   236   178]
 [    0     0     0     3]]


# XGBoost Classifier

In [14]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Define transformations for categorical and numerical features.
# handle_unknown='ignore' encodes a category that was not present when the encoder
# was fitted as an all-zero row; the default ('error') would raise at predict time
# if the test split contains a value that never occurs in the training split.
preprocessor_xgb = ColumnTransformer(
    transformers=[
        ('num_xgb', StandardScaler(), numerical_features),
        ('cat_xgb', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data into training and testing sets using stratified sampling
# (a single 80/20 split with a fixed seed so the split is reproducible)
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_index, test_index = next(splitter.split(X, y))
X_train_xgb, X_test_xgb = X.iloc[train_index], X.iloc[test_index]
y_train_xgb, y_test_xgb = y.iloc[train_index], y.iloc[test_index]

# 3. Model Training and Hyperparameter Tuning
# Create a pipeline.
# The target has four classes, so the multiclass log-loss ('mlogloss') is the matching
# evaluation metric; 'logloss' is the binary-classification metric.
pipeline_xgb = Pipeline([
    ('preprocessor_xgb', preprocessor_xgb),
    ('classifier_xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Optional: Define parameters for GridSearchCV
param_grid_xgb = {
    'classifier_xgb__n_estimators': [50, 100, 150, 200, 300],  # Number of trees
    'classifier_xgb__learning_rate': [0.001, 0.01, 0.1, 0.5],  # Learning rate
}

# Optional: Create GridSearchCV object.
# NOTE: this object is only constructed here; it is not fitted unless the call below
# is swapped for grid_search_xgb.fit(...).
grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, verbose=1, n_jobs=-1)

# Fit the model (use grid_search_xgb.fit(X_train_xgb, y_train_xgb) if using GridSearchCV)
pipeline_xgb.fit(X_train_xgb, y_train_xgb)

# 4. Model Evaluation
# Predictions (use grid_search_xgb.predict(X_test_xgb) if using GridSearchCV)
y_pred_xgb = pipeline_xgb.predict(X_test_xgb)

# Evaluation metrics
# zero_division=0 reports 0.0 for classes the model never predicts (the same value the
# default setting prints) without emitting an UndefinedMetricWarning for each of them.
print("Classification Report for XGBoost:\n", classification_report(y_test_xgb, y_pred_xgb, zero_division=0))
print("Confusion Matrix for XGBoost:\n", confusion_matrix(y_test_xgb, y_pred_xgb))

Classification Report for XGBoost:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91     28022
           1       0.62      0.43      0.51      6741
           2       0.58      0.15      0.24       421
           3       0.00      0.00      0.00         3

    accuracy                           0.84     35187
   macro avg       0.52      0.38      0.41     35187
weighted avg       0.82      0.84      0.82     35187

Confusion Matrix for XGBoost:
 [[26533  1477    12     0]
 [ 3819  2891    31     0]
 [   98   261    62     0]
 [    0     2     1     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Generating figures

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [25]:
def plot_confusion_matrix(y_true, y_pred, model_name, dpi=300, show_title=False):
    """Save an annotated confusion-matrix heatmap for one model as a PDF.

    Each cell shows the raw count and, in brackets, that count as a percentage
    of all samples. The figure is written to
    ``../plots/modelling/{model_name}_confusion_matrix.pdf``; the output
    directory is created if it does not exist yet.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Used as the file-name prefix and, when ``show_title`` is true, in the title.
    dpi : int, default 300
        Resolution passed to ``savefig``.
    show_title : bool, default False
        When true, add a "Confusion Matrix for <model_name>" title to the figure.

    Returns
    -------
    None
        The figure is saved to disk and closed; nothing is displayed inline.
    """
    matrix = confusion_matrix(y_true, y_pred)

    # Express every cell as a share of all samples (not per-row / per-class)
    total = matrix.sum()
    matrix_percent = matrix / total * 100

    # Text shown in each cell: count on the first line, percentage on the second
    annot_matrix = [[f"{value}\n({percent:.2f}%)" for value, percent in zip(row, row_percent)]
                    for row, row_percent in zip(matrix, matrix_percent)]

    # Make sure the target directory exists so savefig does not fail on a fresh checkout
    output_dir = os.path.join('..', 'plots', 'modelling')
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{model_name}_confusion_matrix.pdf')

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        # fmt='' because the annotations are pre-formatted strings
        sns.heatmap(matrix, annot=annot_matrix, fmt='', cmap='cividis', square=True, cbar=False, ax=ax)

        ax.set_xlabel('Predicted Label', size=13)
        ax.set_ylabel('True Label', size=13)

        if show_title:
            ax.set_title(f'Confusion Matrix for {model_name}', size=16)

        # Save the plot as a PDF file
        fig.tight_layout()
        fig.savefig(output_path, format='pdf', dpi=dpi)
    finally:
        # Always release the figure, even if drawing or saving raised
        plt.close(fig)

In [22]:
# Pair the Naive Bayes predictions with the labels from the split that produced them
# (y_test_nb), rather than relying on the SVM cell's y_test happening to use the same seed.
plot_confusion_matrix(y_test_nb, y_pred_nb, "Naive Bayes")

In [23]:
# Confusion-matrix figure for the calibrated linear SVM predictions
plot_confusion_matrix(y_true=y_test, y_pred=y_pred, model_name="Linear_SVM")

In [24]:
# Pair the XGBoost predictions with the labels from the split that produced them
# (y_test_xgb), rather than relying on the SVM cell's y_test happening to use the same seed.
plot_confusion_matrix(y_test_xgb, y_pred_xgb, "XGBoost")