# Isolation Forest 
On Walmart's weekly data

## Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import datetime 
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest


import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
base_folder = 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline'
dataset_name = 'Walmart_Weekly'
train_file = base_folder + '/train/' + dataset_name +'_train.csv'
inference_file = base_folder + '/inference/' + dataset_name +'_inference.csv'

current_date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_file_path = base_folder + '/xgboostADmodel_' + dataset_name + current_date_time + '.pkl'

inference_results_file = base_folder + '/inference/' + dataset_name +'_inference_results.csv'

target = 'Weekly_Sales'
# target = 'Daily_Sales'

# Group the data by 'State' and perform lag shifting within each group
groupby_cols=['State'] 

# Define lag columns to consider
lag_columns = ['Weekly_Sales', 'Temperature']#, 'Fuel_Price', 'CPI', 'Unemployment']

# Define the lag values to be used
lags = [1, 2, 4]  # Example lag values of 1 week and 2 weeks

state_encoder = 'label_encoder_State.pkl'

In [3]:
data_path = train_file
data_path

'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/train/Walmart_Weekly_train.csv'

## Load Dataset

In [4]:
def read(data_path, sheet_name = '', usecols = None):
    df = pd.DataFrame()
    if data_path.split('.')[-1] == 'xlsx':
        if sheet_name:
            df = pd.read_excel(data_path, sheet_name=sheet_name, usecols=usecols)
        else:
            df = pd.read_excel(data_path, usecols=usecols)
        print("Shape of the data in file {} is {}".format(data_path, df.shape))
    else:
        try:
            df = pd.read_csv(data_path)
            print("Shape of the data in file {} is {}".format(data_path, df.shape))
            if df.shape[0] == 0:
                print("No data in file {}".format(data_path))
        except Exception as e:
            print("Issue while reading data at {} \n{}".format(data_path, e))
    return df


def standardize_date_col(dataframe, date_col):
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], infer_datetime_format=True) #.fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce')) 
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%Y-%m-%d') # , errors='coerce')#.fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # Convert all dates to 'mm-dd-yyyy' format
    # dataframe[date_col] = dataframe[date_col].dt.strftime('%Y-%m-%d')
    return dataframe

     

In [5]:
# Read data from csv or excel, sheet_name is the sheet in excel that contians data 
data = read(data_path, sheet_name= 'RAW')
data = standardize_date_col(data, 'Date')
data.head(3)

Shape of the data in file C:\Geeta\learning\projects\AnomalyDetectionSXM\Notebooks\Datasets\Pipeline/train/Walmart_Weekly_train.csv is (710, 11)


Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State
0,2021-02-05,10397622.73,37.2,2.58,200.61,7.55,0,0,11573691.83,9513064.59,Florida
1,2021-02-12,10378496.65,36.72,2.55,200.74,7.55,1,0,11032180.28,8971553.04,Florida
2,2021-02-19,10060556.61,39.7,2.52,200.79,7.55,0,0,10763335.98,8702708.74,Florida


In [6]:
# Sort the data by the 'Date' column in ascending order
data = data.sort_values('Date')
data.reset_index(inplace=True, drop=True)

In [7]:
data.sample(3)

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State
393,2022-08-05,10133116.06,86.07,3.67,204.89,7.1,0,0,10538264.31,8477637.06,Florida
103,2021-06-25,11779127.14,74.17,2.84,158.09,7.85,0,0,13445998.4,10595130.62,Ohio
371,2022-07-08,7355230.03,83.76,3.6,166.62,9.29,0,0,7845405.58,6493961.11,Texas


## Preprocessing

group the dataset by the 'State' column. 
normalize the data and perform feature selection. 
create lagged features for 'Weekly_sales' and 
encoding the 'State' column

In [8]:
data.shape

(710, 11)

In [9]:
def preprocess_date(df):
    # Convert 'Date' to datetime format
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Extract year, month and day from 'Date'
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    # Sort DataFrame by date
    df.sort_values('Date', inplace=True)

    # Drop 'Date' column as it's no longer needed
    df = df.drop('Date', axis=1)
    return df

In [10]:

# Group by 'State' column
grouped = data.groupby('State')

# Initialize a scaler and an encoder
scaler = MinMaxScaler()
encoder = LabelEncoder()

processed_dfs = []

for name, group in grouped:
    # Separate target variable 'Anomaly'
    target = group['Anomaly']
    group = group.drop('Anomaly', axis=1)

    # Normalize the group
    numeric_cols = group.select_dtypes(include=['float64', 'int64']).columns
    group[numeric_cols] = scaler.fit_transform(group[numeric_cols])
    
    # Select K best features
    selector = SelectKBest(score_func=f_classif, k='all')
    selected_features = selector.fit_transform(group[numeric_cols], target)

    # Concatenate selected numeric features and rejected non-numeric features
    group = pd.concat([pd.DataFrame(selected_features, columns=numeric_cols, index=group.index), group.drop(columns=numeric_cols)], axis=1)
    
    
    #Re-assigning 'Anomaly' to group
    group['Anomaly'] = target
    
    # Create lagged features
    # for i in range(52, 0, -1):
    #     group['Weekly_sales_lag_'+str(i)] = group['Weekly_Sales'].shift(i)

    # # Handle any remaining NaN values
    # group = group.dropna()
        
    # Append the result to the list
    processed_dfs.append(group)

# Concatenate all processed dfs
data_preprocessed = pd.concat(processed_dfs)

# Apply label encoding to 'State' column
data_preprocessed['State'] = encoder.fit_transform(data['State'])

data_preprocessed = preprocess_date(data_preprocessed)

data_preprocessed.dropna(inplace=True)
data_preprocessed.reset_index(inplace=True, drop=True)

data_preprocessed.shape

(710, 13)

In [11]:
data_preprocessed.head(5)

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Sales_Amount_Upper,Sales_Amount_Lower,State,Anomaly,Year,Month,Day
0,0.268235,0.078761,0.031008,0.009421,1.0,0.0,0.574794,0.574794,1,0,2021,2,5
1,0.180209,0.109309,0.037594,0.029213,1.0,0.0,0.257969,0.257969,3,0,2021,2,5
2,0.23326,0.079661,0.043478,0.052101,1.0,0.0,0.514473,0.514473,0,0,2021,2,5
3,0.356518,0.098028,0.037037,0.019837,1.0,0.0,0.792051,0.792051,0,0,2021,2,5
4,0.24236,0.038777,0.058394,0.0,1.0,0.0,0.463823,0.463823,2,0,2021,2,5


In [12]:
data_preprocessed.columns

Index(['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag', 'Sales_Amount_Upper', 'Sales_Amount_Lower', 'State',
       'Anomaly', 'Year', 'Month', 'Day'],
      dtype='object')

In [13]:
data_preprocessed.drop(columns=['Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag'], inplace=True)
data_preprocessed.columns

Index(['Weekly_Sales', 'Sales_Amount_Upper', 'Sales_Amount_Lower', 'State',
       'Anomaly', 'Year', 'Month', 'Day'],
      dtype='object')

### Split Dataset

In [14]:
# Determine split point for 80%/20% train/test splits
split_point = int(len(data_preprocessed) * 0.7)

# Split the data into training and test sets
train = data_preprocessed.iloc[:split_point]
test = data_preprocessed.iloc[split_point:]

In [15]:
train.shape, test.shape, test['Anomaly'].value_counts()

((496, 8),
 (214, 8),
 Anomaly
 0    195
 1     19
 Name: count, dtype: int64)

## IsolationForest Model Training

### Forward Chaining or Rolling Window Split "Walk Forward Validation": 

in this method, we create multiple train-test splits then iterate over each. In each iteration, we 'roll' the train and test window. This is a realistic way of simulating the actual application of the model, and it provides a robust estimate of the model's performance.

In [16]:
# Initialize the model
model = IsolationForest(contamination=0.01)

# Define the number of splits
n_splits = 3

# Initialize TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Loop over each split and train-validation the model
for train_index, validation_index in tscv.split(train):
    X_train, X_validation = train.iloc[train_index], train.iloc[validation_index]
    print("train_index, validation_index",train_index, validation_index)
    print(X_train.shape, X_validation.shape)
    # Train the model
    model.fit(X_train.drop(['Anomaly'], axis=1))
    
    # Predict the anomalies on validation set
    y_pred = model.predict(X_validation.drop(['Anomaly'], axis=1))
    
    # Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
    y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]
    
    # Print the F1-score for each validation
    print("F1-score for each validation", f1_score(X_validation['Anomaly'], y_pred))

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = model.predict(test.drop(['Anomaly'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", f1_score(test['Anomaly'], y_pred_test))

train_index, validation_index [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123] [124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
 232 233 234 235 236 237 238

In [17]:
from sklearn.metrics import f1_score, make_scorer
from itertools import product

# Define the parameter grid
n_estimators = [50, 100, 200]
max_samples = ['auto', 0.5, 0.75]
contamination = [0.01, 0.05, 0.1]

# Create a product of all parameter values
params_product = list(product(n_estimators, max_samples, contamination))


In [18]:
from sklearn.metrics import classification_report, make_scorer
from itertools import product

# Placeholder for results
results = []

# Loop through all combinations and fit the model
for params in params_product:
    model = IsolationForest(n_estimators=params[0], max_samples=params[1], contamination=params[2])
    
    f1_scores = []
    classification_reports = []
    
    for train_index, validation_index in tscv.split(train):
        X_train, X_validation = train.iloc[train_index], train.iloc[validation_index]
        
        # Train the model
        model.fit(X_train.drop(['Anomaly'], axis=1))
        
        # Predict the anomalies on validation set and convert labels to 0 and 1
        y_pred = model.predict(X_validation.drop(['Anomaly'], axis=1))
        y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]
        
        f1_scores.append(f1_score(X_validation['Anomaly'], y_pred))
        
        classification_reports.append(classification_report(X_validation['Anomaly'], y_pred, target_names=['Normal', 'Anomaly'], labels=[0, 1]))

    avg_f1_score = np.mean(f1_scores)
    avg_classification_report = "\n\n".join(classification_reports)

    results.append((params, avg_f1_score, avg_classification_report))

# Sort and print results by F1 score
results.sort(key=lambda x: x[1], reverse=True)
for params, f1, report in results:
    print(f"Parameters: {params}, F1 score: {f1}")
    print(f"Classification Report: \n{report}")

Parameters: (50, 0.5, 0.05), F1 score: 0.5345710452093431
Classification Report: 
              precision    recall  f1-score   support

      Normal       0.87      0.79      0.83        91
     Anomaly       0.54      0.67      0.59        33

    accuracy                           0.76       124
   macro avg       0.70      0.73      0.71       124
weighted avg       0.78      0.76      0.77       124


              precision    recall  f1-score   support

      Normal       0.99      0.88      0.93       120
     Anomaly       0.18      0.75      0.29         4

    accuracy                           0.88       124
   macro avg       0.58      0.82      0.61       124
weighted avg       0.96      0.88      0.91       124


              precision    recall  f1-score   support

      Normal       0.97      0.90      0.94       104
     Anomaly       0.63      0.85      0.72        20

    accuracy                           0.90       124
   macro avg       0.80      0.88      0.83 

In [19]:
# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = model.predict(test.drop(['Anomaly'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score\n", classification_report(test['Anomaly'], y_pred_test))

Test F1-score
               precision    recall  f1-score   support

           0       0.93      0.87      0.90       195
           1       0.19      0.32      0.24        19

    accuracy                           0.82       214
   macro avg       0.56      0.59      0.57       214
weighted avg       0.86      0.82      0.84       214



## Train the best model

In [23]:
n_estimators = 50
max_samples = 0.5
contamination = 0.05

In [24]:
# Initialize the model
model = IsolationForest(contamination=0.01)

# Determine split point for 80%/20% train/test splits
split_point = int(len(train) * 0.8)

# Split the data into training and test sets
X_train, X_validation = train.iloc[:split_point], train.iloc[split_point:]

# Train the model
model.fit(X_train.drop(['Anomaly'], axis=1))

# Predict the anomalies on validation set
y_pred = model.predict(X_validation.drop(['Anomaly'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = model.predict(test.drop(['Anomaly'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", f1_score(test['Anomaly'], y_pred_test))
report = classification_report(test['Anomaly'], y_pred_test, target_names=['Normal', 'Anomaly'], labels=[0, 1])
print(f"Classification Report: \n{report}")

Test F1-score 0.0
Classification Report: 
              precision    recall  f1-score   support

      Normal       0.91      1.00      0.95       195
     Anomaly       0.00      0.00      0.00        19

    accuracy                           0.91       214
   macro avg       0.46      0.50      0.48       214
weighted avg       0.83      0.91      0.87       214



In [22]:

# Initialize the model
model = IsolationForest(contamination=0.01)

# Define the number of splits
n_splits = 3

# Initialize TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Loop over each split and train-validation the model
for train_index, validation_index in tscv.split(train):
    X_train, X_validation = train.iloc[train_index], train.iloc[validation_index]
    # print("train_index, validation_index",train_index, validation_index)
    # print(X_train.shape, X_validation.shape)
    # Train the model
    model.fit(X_train.drop(['Anomaly'], axis=1))
    
    # Predict the anomalies on validation set
    y_pred = model.predict(X_validation.drop(['Anomaly'], axis=1))
    
    # Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
    y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]
    
    # Print the F1-score for each validation
    print("F1-score for each validation", f1_score(X_validation['Anomaly'], y_pred))

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = model.predict(test.drop(['Anomaly'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", f1_score(test['Anomaly'], y_pred_test))
report = classification_report(test['Anomaly'], y_pred_test, target_names=['Normal', 'Anomaly'], labels=[0, 1])
print(f"Classification Report: \n{report}")

F1-score for each validation 0.6027397260273972
F1-score for each validation 0.14285714285714285
F1-score for each validation 0.7368421052631579
Test F1-score 0.0
Classification Report: 
              precision    recall  f1-score   support

      Normal       0.91      1.00      0.95       195
     Anomaly       0.00      0.00      0.00        19

    accuracy                           0.91       214
   macro avg       0.46      0.50      0.48       214
weighted avg       0.83      0.91      0.87       214

