### Import packages and load the raw dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the raw Q1 2015 orders export; the file is decoded as ISO-8859-1 (Latin-1).
supplyChain = pd.read_csv(
    "../../data/raw/Q1_2015.csv",
    encoding='ISO-8859-1',
)
# future_data = pd.read_csv("../../data/raw/future_data.csv", encoding='ISO-8859-1')

### Data Preprocessing

In [3]:
# Parse the raw timestamp columns into proper datetime columns.
supplyChain['order date'] = pd.to_datetime(supplyChain['order date (DateOrders)'])
supplyChain['shipping date'] = pd.to_datetime(supplyChain['shipping date (DateOrders)'])

# Expand each timestamp into "<prefix> year/month/day/hour/minute" columns.
# Order columns are created first, then shipping columns.
for prefix in ('order', 'shipping'):
    timestamps = supplyChain[f'{prefix} date'].dt
    for part in ('year', 'month', 'day', 'hour', 'minute'):
        supplyChain[f'{prefix} {part}'] = getattr(timestamps, part)

### Rebalance Dataset

In [4]:
from sklearn.preprocessing import LabelEncoder

# Columns kept for modelling. 'Order Status' is the raw target and is binarised below.
model_columns = [
    'Type', 'Days for shipment (scheduled)',
    'order year', 'order month', 'order day', 'order hour', 'order minute',
    'Benefit per order', 'Category Name', 'Latitude', 'Longitude',
    'Customer Segment', 'Department Name', 'Market', 'Order City', 'Order Country',
    'Order Item Discount', 'Order Item Product Price', 'Order Item Quantity',
    'Order Item Total', 'Order State', 'Product Name',
    'shipping year', 'shipping month', 'shipping day', 'shipping hour', 'shipping minute',
    'Shipping Mode', 'Late_delivery_risk', 'Order Status',
]

# Explicit copy: the assignments below must modify an independent frame, never a
# selection that pandas may still associate with `supplyChain`.
data_n = supplyChain.loc[:, model_columns].copy()
# data_n.info()

# Binary target: 1 for 'SUSPECTED_FRAUD', 0 for every other status (vectorised).
data_n['Order Status'] = (data_n['Order Status'] == 'SUSPECTED_FRAUD').astype('int64')

# Integer-encode every object-dtype column. A separate fitted encoder is kept per
# column in `label_encoders` so the integer codes can be mapped back to the original
# labels later (a single shared encoder only remembers the last column it was fit on).
label_encoders = {}
for i in data_n.columns:
    if data_n[i].dtype == 'object':
        enc = LabelEncoder()
        data_n[i] = enc.fit_transform(data_n[i])
        label_encoders[i] = enc

In [5]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Separate the binary target from the (already integer-encoded) features.
# `X` holds the UNSCALED features; scaling is applied per split below.
y = data_n['Order Status']
X = data_n.drop(['Order Status'], axis=1)
name = X.columns

# Split BEFORE scaling. Fitting the scaler on the full dataset would leak the test
# set's mean/variance into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to both
# splits. The original row index is preserved so features stay aligned with
# `y_train` / `y_test` for the index-based merges later in the notebook.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=name, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=name, index=X_test.index)

# Apply SMOTE to generate synthetic samples to balance the dataset.
# Only the training split is resampled; the test split keeps its original class ratio.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [6]:
# y_resampled.value_counts()

### Feature Selection

In [7]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
# from sklearn.model_selection import train_test_split

mlflow.set_experiment('Feature_Selection_with_RFE')

with mlflow.start_run():
    # Recursive feature elimination driven by a random forest: one feature is
    # dropped per iteration until 15 remain.
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=15, step=1)
    rfe.fit(X_train, y_train)

    # Reduce both splits to the columns RFE kept.
    X_train_rfe, X_test_rfe = (rfe.transform(split) for split in (X_train, X_test))

    # Fit the forest on the reduced training data and measure test accuracy.
    rf_classifier.fit(X_train_rfe, y_train)
    score = rf_classifier.score(X_test_rfe, y_test)
    print(f"Model score after RFE: {score:.4f}")

    # Record the run: hyper-parameters are read back from the estimator objects,
    # followed by the accuracy metric and the fitted model artifact.
    mlflow.log_param("n_estimators", rf_classifier.n_estimators)
    mlflow.log_param("n_features_to_select", rfe.n_features_to_select)
    mlflow.log_metric("accuracy", score)
    mlflow.sklearn.log_model(rf_classifier, "model")

    # Per-feature elimination rank (1 = selected), printed and stored with the run.
    ranking = rfe.ranking_
    print(f"Feature ranking: {ranking}")
    mlflow.log_param("feature_ranking", ranking.tolist())

# To view the experiments, run the MLflow UI in terminal:
# mlflow ui


Model score after RFE: 0.9961
Feature ranking: [ 1  4 15 10  1  1  1  1  7  1  1  9 11 14  1  1  3  6 12  2  1  8 13  5
  1  1  1  1  1]




In [8]:
# Column names whose RFE support mask is True, i.e. the features RFE retained.
selected_features = name[rfe.get_support()]

In [26]:
# Display the names of the features retained by RFE.
selected_features

Index(['Type', 'order day', 'order hour', 'order minute', 'Benefit per order',
       'Latitude', 'Longitude', 'Order City', 'Order Country', 'Order State',
       'shipping day', 'shipping hour', 'shipping minute', 'Shipping Mode',
       'Late_delivery_risk'],
      dtype='object')

In [10]:
# Restrict the SMOTE-resampled training features and the test features to the
# RFE-selected columns.
selected_columns = selected_features.tolist()
X_resampled_sel = X_resampled.loc[:, selected_columns]
X_test_sel = X_test.loc[:, selected_columns]

In [11]:
# Attach the resampled target to the resampled features, matching rows by index.
train_resampled = X_resampled_sel.merge(y_resampled, left_index=True, right_index=True)
# train_resampled

In [12]:
# Attach the test target to the selected test features, matching rows by index.
test_sel = X_test_sel.merge(y_test, left_index=True, right_index=True)
# test_sel

In [21]:
# Stack the resampled training rows on top of the test rows and store the target
# as a categorical column.
resampled_2015Q1 = (
    pd.concat([train_resampled, test_sel], axis=0)
    .astype({'Order Status': 'category'})
)

In [24]:
# resampled_2015Q1['Order Status'].value_counts()

Order Status
0    14878
1    11257
Name: count, dtype: int64

### Refactor the preprocessing into a reusable pipeline

In [4]:
from pathlib import Path
# Output directory for processed datasets, relative to the current working directory.
# Missing parent directories are created; an existing directory is left untouched.
DATA_PATH = Path("../../data/processed")
DATA_PATH.mkdir(exist_ok=True, parents=True)

def save_csv(data, filename, data_path=DATA_PATH, encoding='ISO-8859-1'):
    """Write ``data`` to ``data_path / filename`` as CSV, omitting the index column.

    Parameters
    ----------
    data : object exposing ``to_csv`` (e.g. a pandas DataFrame)
    filename : name of the output file inside ``data_path``
    data_path : target directory; defaults to ``DATA_PATH``
    encoding : text encoding passed to ``to_csv``
    """
    data.to_csv(data_path / filename, index=False, encoding=encoding)

In [5]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



class DateConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the two raw timestamp columns.

    Adds 'order date' and 'shipping date' datetime columns derived from
    'order date (DateOrders)' and 'shipping date (DateOrders)'.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the parsed datetime columns added."""
        converted = X.copy()
        column_pairs = (
            ('order date', 'order date (DateOrders)'),
            ('shipping date', 'shipping date (DateOrders)'),
        )
        for parsed_name, raw_name in column_pairs:
            converted[parsed_name] = pd.to_datetime(converted[raw_name])
        return converted

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives date-part columns and selects model columns.

    Expects 'order date' and 'shipping date' to already be datetime columns.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with date parts added, restricted to the model columns."""
        frame = X.copy()
        # Expand each timestamp into "<prefix> year/month/day/hour/minute".
        for prefix in ('order', 'shipping'):
            stamps = frame[f'{prefix} date'].dt
            for part in ('year', 'month', 'day', 'hour', 'minute'):
                frame[f'{prefix} {part}'] = getattr(stamps, part)

        # Columns passed downstream, in this exact order ('Order Status' included).
        kept_columns = [
            'Type', 'Days for shipment (scheduled)',
            'order year', 'order month', 'order day', 'order hour', 'order minute',
            'Benefit per order', 'Category Name', 'Latitude', 'Longitude',
            'Customer Segment', 'Department Name', 'Market', 'Order City', 'Order Country',
            'Order Item Discount', 'Order Item Product Price', 'Order Item Quantity',
            'Order Item Total', 'Order State', 'Product Name',
            'shipping year', 'shipping month', 'shipping day', 'shipping hour',
            'shipping minute', 'Shipping Mode', 'Late_delivery_risk', 'Order Status',
        ]
        return frame.loc[:, kept_columns]

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Label-encode every object-dtype column, keeping one fitted encoder per column.

    Attributes
    ----------
    encoders : dict mapping column name -> fitted ``LabelEncoder``; rebuilt on
        every call to ``fit``.
    """

    def __init__(self):
        # Kept for backward compatibility: the attribute exists before the first fit.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a fresh ``LabelEncoder`` for each object-dtype column of ``X``.

        Previously fitted encoders are discarded first, so refitting on different
        data cannot leave stale encoders behind for columns that are no longer
        object-dtype.
        """
        self.encoders = {}
        for col in X.columns:
            if X[col].dtype == 'object':
                self.encoders[col] = LabelEncoder().fit(X[col])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column replaced by integer codes.

        Columns seen during ``fit`` but absent from ``X`` are skipped.
        """
        X = X.copy()
        for col, encoder in self.encoders.items():
            if col in X.columns:
                X[col] = encoder.transform(X[col])
        return X
    
class DataFrameConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that wraps its input in a DataFrame with fixed column names."""

    def __init__(self, column_names):
        # Column labels applied to the output frame.
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Build and return ``pd.DataFrame(X)`` labelled with ``self.column_names``."""
        frame = pd.DataFrame(data=X, columns=self.column_names)
        return frame

def build_pipeline():
    """Assemble the preprocessing pipeline for the raw orders data.

    Steps, in order: parse timestamps, derive date parts and select model columns,
    label-encode object columns, standard-scale every column, and wrap the scaled
    array back into a DataFrame with the model column names.

    Returns
    -------
    An unfitted ``imblearn`` pipeline whose output frame still contains the
    'Order Status' column (encoded and scaled along with the features).
    """
    # Output column names; must match, in order, the columns selected by
    # FeatureEngineering, because the scaler's output carries no labels.
    all_cols = ['Type','Days for shipment (scheduled)','order year','order month','order day','order hour',
                        'order minute','Benefit per order','Category Name','Latitude','Longitude','Customer Segment',
                        'Department Name','Market','Order City','Order Country','Order Item Discount','Order Item Product Price',
                        'Order Item Quantity','Order Item Total','Order State','Product Name','shipping year','shipping month',
                        'shipping day','shipping hour','shipping minute','Shipping Mode','Late_delivery_risk','Order Status']

    pipeline = ImbPipeline(steps=[
        ('date_converter', DateConverter()),
        ('feature_engineering', FeatureEngineering()),
        ('encode_categorical', CategoricalEncoder()),
        ('scaler', StandardScaler()),
        ('to_dataframe', DataFrameConverter(column_names=all_cols))
    ])

    return pipeline

def apply_smote(X_train, y_train, random_state=42):
    """Oversample the training data with SMOTE.

    Parameters
    ----------
    X_train : training features
    y_train : training target
    random_state : seed passed to ``SMOTE``; defaults to 42, the value that was
        previously hard-coded, so existing callers get the same result.

    Returns
    -------
    (X_resampled, y_resampled) as returned by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

def select_and_concatenate_datasets(X_resampled, y_resampled, X_test, y_test, selected_features=None):
    """Keep the selected feature columns and stack train and test rows into one frame.

    Parameters
    ----------
    X_resampled, y_resampled : training features and target (after resampling)
    X_test, y_test : test features and target
    selected_features : optional list of feature column names to keep. When
        omitted, the 15 features hard-coded below are used, so existing
        four-argument calls behave as before.

    Returns
    -------
    DataFrame with the selected feature columns followed by the target column.
    Training rows come first, then test rows; the original indices are kept.

    Notes
    -----
    Features and targets are joined column-wise by index, so each X/y pair must
    share the same index.
    """
    if selected_features is None:
        selected_features = [
            'Type', 'order day', 'order hour', 'order minute', 'Benefit per order',
            'Latitude', 'Longitude', 'Order City', 'Order Country', 'Order State',
            'shipping day', 'shipping hour', 'shipping minute', 'Shipping Mode',
            'Late_delivery_risk',
        ]
    else:
        # Accept any iterable of names (e.g. a pandas Index) and select by list.
        selected_features = list(selected_features)

    train_resampled = pd.concat([X_resampled[selected_features], y_resampled], axis=1)
    test_sel = pd.concat([X_test[selected_features], y_test], axis=1)
    return pd.concat([train_resampled, test_sel])

def process_data_from_csv(filepath):
    """Turn a raw orders CSV into the fraud-modelling dataset.

    Reads ``filepath`` (ISO-8859-1), runs the preprocessing pipeline, derives the
    binary fraud target, splits 75/25, oversamples the training split with SMOTE,
    and returns the resampled training rows stacked on top of the test rows,
    restricted to the selected features plus 'Order Status'.
    """
    raw = pd.read_csv(filepath, encoding='ISO-8859-1')

    # NOTE(review): the pipeline (label encoders and scaler) is fitted on the whole
    # file before the train/test split below, so the test rows influence the
    # fitted statistics. Confirm whether that is acceptable for this dataset.
    processed = build_pipeline().fit_transform(raw)

    # Features: everything the pipeline produced except its transformed copy of the
    # target column. Target: 1 for 'SUSPECTED_FRAUD', 0 otherwise, taken from the raw file.
    features = processed.drop('Order Status', axis=1)
    target = raw['Order Status'].map(lambda status: 1 if status == 'SUSPECTED_FRAUD' else 0)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Only the training split is oversampled; the test split is left as-is.
    X_resampled, y_resampled = apply_smote(X_train, y_train)

    return select_and_concatenate_datasets(X_resampled, y_resampled, X_test, y_test)


In [6]:
# Process the raw Q1 2015 file and write the result to the processed-data directory.
q1_2015_fraud = process_data_from_csv("../../data/raw/Q1_2015.csv")
save_csv(q1_2015_fraud, 'Q1_2015_fraud.csv')

In [7]:
# final_dataset['Order Status'].value_counts()

Order Status
0    14878
1    11257
Name: count, dtype: int64

In [8]:
# Process the raw future-data file and write the result to the processed-data directory.
future_data_fraud = process_data_from_csv("../../data/raw/future_data.csv")
save_csv(future_data_fraud, 'future_data_fraud.csv')