*Training an XGBoost model on the Walmart Daily data to identify anomalies in it.*

# Start

## Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import datetime 
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest


import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
base_folder = 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline'
dataset_name = 'Walmart_Weekly'
train_file = base_folder + '/train/' + dataset_name +'_train.csv'
inference_file = base_folder + '/inference/' + dataset_name +'_inference.csv'

current_date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_file_path = base_folder + '/xgboostADmodel_' + dataset_name + current_date_time + '.pkl'

inference_results_file = base_folder + '/inference/' + dataset_name +'_inference_results.csv'

target = 'Weekly_Sales'
# target = 'Daily_Sales'

# Group the data by 'State' and perform lag shifting within each group
groupby_cols=['State'] 

# Define lag columns to consider
lag_columns = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

# Define the lag values to be used
lags = [1, 2, 4]  # Example lag values of 1 week and 2 weeks

state_encoder = 'label_encoder_State.pkl'

In [3]:
data_path = train_file

In [4]:
data_path

'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/train/Walmart_Weekly_train.csv'

## Load Dataset

In [5]:
def read(data_path, sheet_name = '', usecols = None):
    df = pd.DataFrame()
    if data_path.split('.')[-1] == 'xlsx':
        if sheet_name:
            df = pd.read_excel(data_path, sheet_name=sheet_name, usecols=usecols)
        else:
            df = pd.read_excel(data_path, usecols=usecols)
        print("Shape of the data in file {} is {}".format(data_path, df.shape))
    else:
        try:
            df = pd.read_csv(data_path)
            print("Shape of the data in file {} is {}".format(data_path, df.shape))
            if df.shape[0] == 0:
                print("No data in file {}".format(data_path))
        except Exception as e:
            print("Issue while reading data at {} \n{}".format(data_path, e))
    return df


def standardize_date_col(dataframe, date_col):
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], infer_datetime_format=True) #.fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%Y-%m-%d', errors='coerce')#.fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # Convert all dates to 'mm-dd-yyyy' format
    # dataframe[date_col] = dataframe[date_col].dt.strftime('%Y-%m-%d')
    return dataframe

     

In [6]:
# Read data from csv or excel, sheet_name is the sheet in excel that contians data 
data = read(data_path, sheet_name= 'RAW')
data = standardize_date_col(data, 'Date')
data.head(3)

Shape of the data in file C:\Geeta\learning\projects\AnomalyDetectionSXM\Notebooks\Datasets\Pipeline/train/Walmart_Weekly_train.csv is (710, 11)


Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State
0,2021-02-05,10397622.73,37.2,2.58,200.61,7.55,0,0,11573691.83,9513064.59,Florida
1,2021-02-12,10378496.65,36.72,2.55,200.74,7.55,1,0,11032180.28,8971553.04,Florida
2,2021-02-19,10060556.61,39.7,2.52,200.79,7.55,0,0,10763335.98,8702708.74,Florida


In [7]:
# Sort the data by the 'Date' column in ascending order
data = data.sort_values('Date')
data.reset_index(inplace=True, drop=True)

In [8]:
data.columns 

Index(['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag', 'Anomaly', 'Sales_Amount_Upper',
       'Sales_Amount_Lower', 'State'],
      dtype='object')

## Preprocessing data

In [9]:
lag_columns

['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

In [10]:
def get_lag_columns(data, groupby_cols, lag_columns):
    # Group the data by 'State' and perform lag shifting within each group
    grouped = data.groupby(groupby_cols)
    
    
    # Create lag features within each group
    for lag in lags:
        for col in lag_columns:
            data[f'{col}_lag_{lag}'] = grouped[col].shift(lag)
            data[f'{col}_lag_{lag}'] = data[f'{col}_lag_{lag}'].bfill()
    print("Before dropping NAs:", data.shape)
    data.dropna(inplace=True)
    print("After dropping NAs:", data.shape)
    return data

def get_lag_Weekly_Sales(data, groupby_cols, lag_columns):
    # Group the data by 'State' and perform lag shifting within each group
    group = data.groupby(groupby_cols)
    
    # Create lagged features
    for col in lag_columns:
        for i in range(52, 0, -1):
            data[col + '_' +str(i)] = group[col].shift(i)
    return data
        
    
# data = get_lag_columns(data, groupby_cols, lag_columns)
data = get_lag_Weekly_Sales(data, groupby_cols, lag_columns)
data.shape

(710, 271)

In [11]:
data.dropna(inplace=True)
data.shape

(450, 271)

In [12]:
data.columns

Index(['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag', 'Anomaly', 'Sales_Amount_Upper',
       'Sales_Amount_Lower',
       ...
       'Unemployment_10', 'Unemployment_9', 'Unemployment_8', 'Unemployment_7',
       'Unemployment_6', 'Unemployment_5', 'Unemployment_4', 'Unemployment_3',
       'Unemployment_2', 'Unemployment_1'],
      dtype='object', length=271)

In [13]:
def get_date_features(data, date_col):
    def week_of_month(date):
        # Get the first day of the month
        first_day = date.replace(day=1)
        # Calculate the adjusted day of the week (0=Monday, ..., 6=Sunday)
        adjusted_dom = (first_day.weekday() + 1) % 7
        # Calculate the week of the month
        week_of_month = (date.day + adjusted_dom - 1) // 7 + 1
        return week_of_month
    
    data['Week_Of_Month'] = data[date_col].map(week_of_month)
    # Create time-based features
    data[date_col] = pd.to_datetime(data[date_col])
    # data['Day_of_Week'] = data['Date'].dt.dayofweek  # Day of the week (0: Monday, 1: Tuesday, ..., 6: Sunday)
    data['Month'] = data[date_col].dt.month  # Month of the year (1 to 12)
    data['Quarter'] = data[date_col].dt.quarter  # Quarter of the year (1 to 4)
    data['Year'] = data[date_col].dt.year  # Year
    return data

data = get_date_features(data, 'Date')

In [14]:
data.sample(5)

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,...,Unemployment_6,Unemployment_5,Unemployment_4,Unemployment_3,Unemployment_2,Unemployment_1,Week_Of_Month,Month,Quarter,Year
466,2022-11-18,11153431.69,47.82,3.56,155.31,8.01,0,1,13506883.25,11425613.9,...,8.01,8.01,8.01,8.01,8.01,8.01,3,11,4,2022
562,2023-03-31,9908450.93,65.74,3.83,210.06,6.41,0,0,11640852.67,9580225.42,...,6.41,6.41,6.41,6.41,6.41,6.41,5,3,1,2023
549,2023-03-10,10291510.29,53.28,3.63,209.75,6.41,0,0,11368810.47,9308183.23,...,6.41,6.41,6.41,6.41,6.41,6.41,2,3,1,2023
556,2023-03-24,9664462.49,58.94,3.77,209.97,6.41,0,0,11291870.79,9231243.55,...,6.41,6.41,6.41,6.41,6.41,6.41,4,3,1,2023
349,2022-06-03,11421205.89,68.15,3.87,153.68,8.36,0,0,12020871.82,9939602.47,...,8.36,8.36,8.36,8.36,8.36,8.36,1,6,2,2022


In [15]:
# Encode categorical columns
encoder = LabelEncoder()
data['State'] = encoder.fit_transform(data['State'])

# Save the trained label encoder
joblib.dump(encoder, state_encoder)

['label_encoder_State.pkl']

### Feature selection

In [16]:
# drop_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
# data.drop(columns=drop_cols, inplace=True)

In [17]:
data.sample(5)

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,...,Unemployment_6,Unemployment_5,Unemployment_4,Unemployment_3,Unemployment_2,Unemployment_1,Week_Of_Month,Month,Quarter,Year
438,2022-10-07,7420671.08,66.94,3.5,167.79,8.92,0,0,7688144.87,6336700.4,...,9.29,9.29,9.29,9.29,9.29,9.29,2,10,4,2022
416,2022-09-09,9553921.17,73.17,3.55,205.24,7.1,1,0,10296654.3,8236027.05,...,7.1,7.1,7.1,7.1,7.1,7.1,2,9,3,2022
532,2023-02-17,11491173.47,33.07,3.74,164.23,7.2,0,0,12269936.54,9419068.75,...,7.2,7.2,7.2,7.2,7.2,7.2,3,2,1,2023
534,2023-02-17,11284939.9,42.37,3.46,209.22,6.41,0,1,11179403.79,9118776.54,...,6.41,6.41,6.41,6.41,6.41,6.41,3,2,1,2023
507,2023-01-13,9710842.8,36.15,3.54,163.71,7.2,0,1,12943328.33,10092460.55,...,7.28,7.28,7.28,7.28,7.28,7.2,2,1,1,2023


### Split Dataset

In [18]:
train_size = int(np.floor(data.shape[0]*0.8*0.7))
val_size = int(np.floor(data.shape[0]*0.7) - train_size)
test_size = int(data.shape[0] - train_size - val_size)

train_size, val_size, test_size, train_size + val_size + test_size, data.shape[0]

(251, 64, 135, 450, 450)

In [19]:
train_data = data.iloc[:train_size]
val_data = data.iloc[train_size:train_size+val_size]
test_data = data.iloc[:test_size]

train_data.shape, val_data.shape, test_data.shape, test_data['Anomaly'].value_counts()

((251, 275),
 (64, 275),
 (135, 275),
 Anomaly
 0    131
 1      4
 Name: count, dtype: int64)

In [20]:
def get_Xy(data, drop_cols, target_col):
    X = data.drop(drop_cols, axis=1)
    y = data['Anomaly']
    return X,y


    
drop_cols = ['Date', 'Anomaly']  #, 'Sales_Amount_Upper', 'Sales_Amount_Lower'

X_train, y_train = get_Xy(train_data, drop_cols, 'Anomaly')
X_val, y_val = get_Xy(val_data, drop_cols, 'Anomaly')
X_test, y_test = get_Xy(test_data, drop_cols, 'Anomaly')
X_test2, y_test2 = X_test.iloc[-20:], y_test[-20:]
X_test, y_test = X_test.iloc[:-20], y_test[:-20]

X_train.shape, X_val.shape, X_test.shape, X_test2.shape

((251, 273), (64, 273), (115, 273), (20, 273))

In [21]:
X_train.head(3)

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Sales_Amount_Upper,Sales_Amount_Lower,State,Weekly_Sales_52,...,Unemployment_6,Unemployment_5,Unemployment_4,Unemployment_3,Unemployment_2,Unemployment_1,Week_Of_Month,Month,Quarter,Year
260,7444909.07,34.24,3.13,164.55,9.49,0,7850346.28,6498901.81,3,8161946.14,...,9.67,9.67,9.49,9.49,9.49,9.49,1,2,1,2022
261,6705436.37,32.87,3.12,168.75,8.68,0,6946379.76,5725595.2,4,6575770.4,...,8.85,8.85,8.68,8.68,8.68,8.68,1,2,1,2022
262,9985425.58,33.06,2.99,202.16,7.2,0,10587907.7,8527280.46,1,10397622.73,...,7.4,7.4,7.2,7.2,7.2,7.2,1,2,1,2022


In [24]:
X_train.columns

Index(['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag', 'Sales_Amount_Upper', 'Sales_Amount_Lower', 'State',
       'Weekly_Sales_52',
       ...
       'Unemployment_6', 'Unemployment_5', 'Unemployment_4', 'Unemployment_3',
       'Unemployment_2', 'Unemployment_1', 'Week_Of_Month', 'Month', 'Quarter',
       'Year'],
      dtype='object', length=273)

## Training model

### Add more features:
* rolling statistics for weekly sales

In [25]:
# data[f'{col}_lag_{lag}'] = data[f'{col}_lag_{lag}'].bfill()
# # Calculate rolling statistics
# window = 4  # Window size for rolling statistics
# data['Rolling_Mean_Weekly_Sales'] = data['Weekly_Sales'].rolling(window=window).mean()  # Rolling mean of weekly sales
# data['Rolling_Mean_Weekly_Sales'] = data['Rolling_Mean_Weekly_Sales'].bfill() # Backfill NAs, TODO: Change this to see if results improve

# data['Rolling_Std_Weekly_Sales'] = data['Weekly_Sales'].rolling(window=window).std()  # Rolling standard deviation of weekly sales
# data['Rolling_Std_Weekly_Sales'] = data['Rolling_Std_Weekly_Sales'].bfill()

# data.head()

### Resampling technique

In [26]:
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import make_pipeline

# # Define the resampling strategy using a pipeline
# resample_pipeline = make_pipeline(SMOTE(random_state=42), RandomUnderSampler(random_state=42))

# # Apply the resampling strategy to the training data
# X_resampled, y_resampled = resample_pipeline.fit_resample(X_train, y_train)

### Grid Search

In [30]:
from sklearn.model_selection import GridSearchCV

# Define the XGBoost model
xgb_model = XGBClassifier(objective="binary:logistic", random_state=42)

# Define the hyperparameters grid for grid search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 0.8],
    'learning_rate': [0.1, 0.15, 0.2, 0.25],
    'colsample_bytree': [0.5, 0.8],
    # 'reg_alpha': [0, 0.5, 1],
    # 'reg_lambda': [0, 0.5, 1],
    # 'gamma': [0, 0.2],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data (do not run this line if you only want the code for setting up the grid search)
grid_search.fit(X_train, y_train)

# Display the grid search results
print("Grid Search Results:")
print(grid_search.cv_results_)

# Retrieve the best model from the grid search
best_model = grid_search.best_estimator_
print("\nBest Model:")
print(best_model)

Grid Search Results:
{'mean_fit_time': array([0.4366724 , 0.45224438, 0.71659408, 0.75938945, 0.46317549,
       0.49464908, 0.81478834, 0.91799836, 0.47080965, 0.51360831,
       0.75159535, 0.77537436, 0.44454899, 0.46144567, 0.7331737 ,
       0.72161007, 0.51987977, 0.5476645 , 0.71962919, 0.7726953 ,
       0.45983701, 0.4881989 , 0.72388659, 0.75936656, 0.43595705,
       0.45358233, 0.72016311, 0.72081623, 0.43546481, 0.4738338 ,
       0.71401262, 0.72120447, 0.44974961, 0.47253065, 0.71955295,
       0.7330193 , 0.42938695, 0.44912157, 0.70454116, 0.70615296,
       0.44814191, 0.45866804, 0.68513889, 0.73026466, 0.45199256,
       0.46545   , 0.69870267, 0.69842105, 0.50203819, 0.52455797,
       0.79374781, 0.82831573, 0.52136774, 0.55937233, 0.81590977,
       0.86355858, 0.50074782, 0.57075644, 0.79854522, 0.86159039,
       0.47962141, 0.49799161, 0.76699896, 0.78965559, 0.48068833,
       0.53090053, 0.75929346, 0.80938578, 0.47860675, 0.52487092,
       0.7694097 , 0.81

In [34]:
# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model performance on the validation set
validation_accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", validation_accuracy)
print("Validation Report:")
print(validation_report)

# Make predictions on the test set
y_test_pred = best_model.predict(X_test)
print("Test Report:\n", classification_report(y_test, y_test_pred))


Validation Accuracy: 0.6875
Validation Report:
              precision    recall  f1-score   support

           0       0.85      0.79      0.81        56
           1       0.00      0.00      0.00         8

    accuracy                           0.69        64
   macro avg       0.42      0.39      0.41        64
weighted avg       0.74      0.69      0.71        64

Test Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       113
           1       1.00      1.00      1.00         2

    accuracy                           1.00       115
   macro avg       1.00      1.00      1.00       115
weighted avg       1.00      1.00      1.00       115



Validation Accuracy: 0.6936936936936937

Validation Report:

              precision    recall  f1-score   support

           0       0.69      1.00      0.81        74
           1       1.00      0.08      0.15        37

    accuracy                           0.69       111
   macro avg       0.84      0.54      0.48       111
weighted avg       0.79      0.69      0.59       111

In [35]:
# print(best_model)
# Get the parameters of the trained model
params = best_model.get_params()

# Print all the parameters
for key, value in params.items():
    print(f"{key}: {value}")

objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.5
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: None
feature_types: None
gamma: None
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.2
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 3
max_leaves: None
min_child_weight: None
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 100
n_jobs: None
num_parallel_tree: None
random_state: 42
reg_alpha: None
reg_lambda: None
sampling_method: None
scale_pos_weight: None
subsample: 0.5
tree_method: None
validate_parameters: None
verbosity: None


### Train Model

In [27]:
X_train.columns

Index(['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag', 'Sales_Amount_Upper', 'Sales_Amount_Lower', 'State',
       'Weekly_Sales_52',
       ...
       'Unemployment_6', 'Unemployment_5', 'Unemployment_4', 'Unemployment_3',
       'Unemployment_2', 'Unemployment_1', 'Week_Of_Month', 'Month', 'Quarter',
       'Year'],
      dtype='object', length=273)

In [38]:
# Define a narrower set of hyperparameters
params = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': 0.2,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
}

# Train the XGBoost model with the reduced set of hyperparameters
model = XGBClassifier(**params, objective="binary:logistic", random_state=42)
mod el.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model performance on the validation set
validation_accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", validation_accuracy)
print("Validation Report:")
print(validation_report)

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = model.predict(X_test)

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]
print("###"*50)
# Print the F1-score on Test set
print("Test Report\n", classification_report(y_test, y_pred_test))


Validation Accuracy: 0.6875
Validation Report:
              precision    recall  f1-score   support

           0       0.85      0.79      0.81        56
           1       0.00      0.00      0.00         8

    accuracy                           0.69        64
   macro avg       0.42      0.39      0.41        64
weighted avg       0.74      0.69      0.71        64

######################################################################################################################################################
Test Report
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       113
           1       0.00      0.00      0.00         2

    accuracy                           0.98       115
   macro avg       0.49      0.50      0.50       115
weighted avg       0.97      0.98      0.97       115



Validation Accuracy: 0.8070175438596491
Validation Report:
              precision    recall  f1-score   support

           0       0.80      0.96      0.87        79
           1       0.84      0.46      0.59        35

    accuracy                           0.81       114
   macro avg       0.82      0.71      0.73       114
weighted avg       0.81      0.81      0.79       114

In [37]:
# print(best_model)
# Get the parameters of the trained model
params = model.get_params()

# Print all the parameters
for key, value in params.items():
    print(f"{key}: {value}")

objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.5
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: None
feature_types: None
gamma: None
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.2
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 3
max_leaves: None
min_child_weight: None
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 100
n_jobs: None
num_parallel_tree: None
random_state: 42
reg_alpha: None
reg_lambda: None
sampling_method: None
scale_pos_weight: None
subsample: 0.5
tree_method: None
validate_parameters: None
verbosity: None


### Forward Chaining or Rolling Window Split "Walk Forward Validation"

In [None]:
train_data.columns

In [None]:
# Define a narrower set of hyperparameters
params = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': 0.1,
    'subsample': 0.5,
    # 'colsample_bytree': 0.8,
}

# Define model
XGBmodel = XGBClassifier(**params, objective="binary:logistic", random_state=42)

# Define the number of splits
n_splits = 3

# Initialize TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Loop over each split and train-validation the model
for train_index, validation_index in tscv.split(train_data):
    X_train_split, X_validation_split = train_data.iloc[train_index], train_data.iloc[validation_index]
    
    # Train the model
    XGBmodel.fit(X_train_split.drop(['Anomaly','Date'], axis=1), X_train_split['Anomaly'])
    
    # Predict the anomalies on validation set
    y_pred = XGBmodel.predict(X_validation_split.drop(['Anomaly','Date'], axis=1))
    
    # Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
    y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]
    
    # Print the F1-score for each validation
    print("F1-score for each validation", f1_score(X_validation_split['Anomaly'], y_pred))

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = XGBmodel.predict(test_data.drop(['Anomaly','Date'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", classification_report(test_data['Anomaly'], y_pred_test))

## Train Isolation Forest

In [None]:
# Initialize the model
IFmodel = IsolationForest(contamination=0.01)

# Define the number of splits
n_splits = 3

# Initialize TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Loop over each split and train-validation the model
for train_index, validation_index in tscv.split(train_data):
    X_train_split, X_validation_split = train_data.iloc[train_index], train_data.iloc[validation_index]
    # print("train_index, validation_index",train_index, validation_index)
    # print(X_train.shape, X_validation.shape)
    # Train the model
    IFmodel.fit(X_train_split.drop(['Anomaly','Date'], axis=1))
    
    # Predict the anomalies on validation set
    y_pred = IFmodel.predict(X_validation_split.drop(['Anomaly','Date'], axis=1))
    
    # Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
    y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]
    
    # Print the F1-score for each validation
    print("F1-score for each validation", f1_score(X_validation_split['Anomaly'], y_pred))

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = IFmodel.predict(test_data.drop(['Anomaly','Date'], axis=1))

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", classification_report(test_data['Anomaly'], y_pred_test))
test_data['Anomaly_pred'] = y_pred_test

In [None]:
# Initialize the model
IFmodel = IsolationForest(contamination=0.01)

# X_train, y_train

IFmodel.fit(X_train)

# Predict the anomalies on validation set
y_pred = IFmodel.predict(X_val)

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred = [1 if prediction==-1 else 0 for prediction in y_pred]

# Print the F1-score for each validation
print("F1-score validation", f1_score(y_val, y_pred))

# As a final step we will use the best model from the validation step to predict the anomalies on test set.
y_pred_test = IFmodel.predict(X_test)

# Change the anomaly labels (from -1, 1) to (1, 0) similar to 'Anomaly' column
y_pred_test = [1 if prediction==-1 else 0 for prediction in y_pred_test]

# Print the F1-score on Test set
print("Test F1-score", classification_report(y_test, y_pred_test))

## Saving model

In [None]:
# Save the trained model to a file
joblib.dump(model, model_file_path)

## Load Model

In [None]:
# Load the saved model from file
loaded_model = joblib.load(model_file_path)


## Validation

In [None]:
# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Evaluate the model performance on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("\nTest Accuracy:", test_accuracy)
print("Test Report:")
print(test_report)

In [None]:
# Make predictions on the test set
y_test_pred2 = model.predict(X_test2)

# Evaluate the model performance on the test set
test_accuracy = accuracy_score(y_test2, y_test_pred2)
test_report = classification_report(y_test2, y_test_pred2)

print("\nTest Accuracy:", test_accuracy)
print("Test Report:")
print(test_report)

In [None]:

y_test_pred = model.predict(X_test2)

# Evaluate the model performance on the test set
test_accuracy = accuracy_score(y_test2, y_test_pred)
test_report = classification_report(y_test2, y_test_pred)

print("\nTest Accuracy:", test_accuracy)
print("Test Report:")
print(test_report)

In [None]:
y_test2.value_counts()

In [None]:
test_data.shape, len(y_test_pred)

In [None]:
# test_data['Anomaly_pred'] = y_test_pred
test_data[test_data['Anomaly']==1]

In [None]:
# test_df = X_test
# test_df['Anomaly'] = y_test
# test_df['Anomaly_pred'] = y_test_pred
test_data[(test_data['Anomaly']==1) & (test_data['State']==2)]

In [None]:
import plotly.graph_objects as go
for state in test_data['State'].unique():
    # Create a Figure for the current state
    fig = go.Figure()
    
    # Filter the DataFrame for the current state
    state_df = test_data[test_data['State'] == state]
    state_df.reset_index(inplace=True)
    
    # Create a Scatter plot for the target values
    fig.add_trace(go.Scatter(
        x=state_df.index,
        y=state_df[target],
        mode='lines+markers',
        name=target,
        line=dict(color='black', dash='dash')
    ))
    
    # Filter the data where anomalies are present (Anomaly == 1)
    anomalies = state_df[state_df['Anomaly'] == 1]
    
    # Add a scatter trace for the anomaly points
    fig.add_trace(go.Scatter(
        x=anomalies.index,
        y=anomalies[target],
        mode='markers',
        name='Anomaly',
        marker=dict(color='red', size=10, symbol='x')
    ))
    
    # Update layout settings
    fig.update_layout(
        title=f"Anomalies for {state}",
        xaxis_title="Date",
        yaxis_title=target,
        template="plotly_white",
        autosize=False,
        width=1200,
        height=400
    )
    
    # Show the plot
    fig.show()


     # Create a Figure for the current state
    fig = go.Figure()
    
    # Filter the DataFrame for the current state
    state_df = test_data[test_data['State'] == state]
    state_df.reset_index(inplace=True)
    
    # Create a Scatter plot for the target values
    fig.add_trace(go.Scatter(
        x=state_df.index,
        y=state_df[target],
        mode='lines+markers',
        name=target,
        line=dict(color='black', dash='dash')
    ))
    
    # Filter the data where anomalies are present (Anomaly == 1)
    anomalies = state_df[state_df['Anomaly_pred'] == 1]
    
    # Add a scatter trace for the anomaly points
    fig.add_trace(go.Scatter(
        x=anomalies.index,
        y=anomalies[target],
        mode='markers',
        name='Anomaly Pred',
        marker=dict(color='red', size=10, symbol='x')
    ))
    
    # Update layout settings
    fig.update_layout(
        title=f"Predicted Anomalies for {state}",
        xaxis_title="Date",
        yaxis_title=target,
        template="plotly_white",
        autosize=False,
        width=1200,
        height=400
    )
    
    # Show the plot
    fig.show()

    print("####"*30)

## Inference

In [None]:
# Load the new dataset for inference
inference_data = pd.read_csv(inference_file, usecols=['Date','Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag','State'])

# Convert the 'Date' column to datetime format
# inference_data['Date'] = pd.to_datetime(inference_data['Date']).dt.strftime('%Y-%m-%d')
inference_data = standardize_date_col(inference_data, 'Date')

# Get the saved Encoder for State column 
encoder = joblib.load(state_encoder)
inference_data['State'] = encoder.fit_transform(inference_data['State'])

In [None]:
data['Date'].max(), inference_data['Date'].min(), inference_data['Date'].isna().sum()


In [None]:
cols = inference_data.columns

In [None]:
# Calculate the next four Fridays after 2023-10-27
start_date = pd.to_datetime('2023-10-27')
next_fridays = [start_date + pd.DateOffset(weeks=i) for i in range(1, 5)]

# Function to generate synthetic data with a fluctuation of 5%
def generate_synthetic_data(base_data, date):
    # Copy the base data to start with
    synthetic_data = base_data.copy()
    # Update the date to the current date
    synthetic_data['Date'] = date.strftime('%Y-%m-%d')
    # Apply a fluctuation of ±5% to the numeric columns
    fluctuation = np.random.uniform(-0.05, 0.05)
    synthetic_data['Weekly_Sales'] *= (1 + np.abs(fluctuation*100))
    synthetic_data['Temperature'] *= (1 + fluctuation)
    synthetic_data['Fuel_Price'] *= (1 + fluctuation)
    synthetic_data['CPI'] *= (1 + fluctuation)
    synthetic_data['Unemployment'] *= (1 + fluctuation)
    # Return the synthetic data
    return synthetic_data

# List to hold synthetic data
synthetic_data_list = inference_data

# Generate synthetic data for each next Friday
for date in next_fridays:
    synthetic_data = generate_synthetic_data(synthetic_data_list, date)
    inference_data = pd.concat([inference_data, synthetic_data])

# inference_data = synthetic_data_list
# del synthetic_data_list

# clean up the data:
# Identify numeric columns
numeric_columns = inference_data.select_dtypes(include=[int, float]).columns

# Round the numeric values in these columns to 2 decimal places
inference_data[numeric_columns] = inference_data[numeric_columns].round(2)

inference_data.reset_index(inplace=True, drop=True)
# Print the synthetic DataFrame
inference_data


In [None]:
inference_data = standardize_date_col(inference_data, 'Date')
inference_data_all = pd.concat([data[cols],inference_data])
inference_data_all

In [None]:

# Sort the data by the 'Date' column in ascending order
inference_data_all = inference_data_all.sort_values('Date')

# Create lag features for the relevant columns
print(groupby_cols, lag_columns)
inference_data_all = get_lag_columns(inference_data_all, groupby_cols, lag_columns)

inference_data_all

In [None]:
inference_data_all[inference_data_all['Date']>='2023-10-27']

In [None]:
# Get the inference data back with lag values:
inference_data = inference_data_all[inference_data_all['Date']>='2023-10-27']
# Get week of the month value too before passing it to model
inference_data = get_date_features(inference_data, 'Date')
# inference_data['Week_Of_Month'] = inference_data['Date'].map(week_of_month)
# Display the updated new dataset with lag features
inference_data.head()

In [None]:
inference_data.columns

In [None]:
# 1112863943

inference_data['Weekly_Sales'] = 100
inference_data

In [None]:
# Perform inference using the loaded model
predictions = model.predict(inference_data.drop(columns=['Date']))

# Display the predictions
predictions

## Saving inference

# End