*Training an XGBoost model on the Walmart Daily data to identify anomalies in it.*

# Start

## Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import datetime 

In [2]:
base_folder = 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline'
dataset_name = 'Walmart_Weekly'
train_file = base_folder + '/train/' + dataset_name +'_train.csv'
inference_file = base_folder + '/inference/' + dataset_name +'_inference.csv'

current_date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_file_path = base_folder + '/xgboostADmodel_' + dataset_name + current_date_time + '.pkl'

inference_results_file = base_folder + '/inference/' + dataset_name +'_inference_results.csv'

target = 'Weekly_Sales'
# target = 'Daily_Sales'

# Group the data by 'State' and perform lag shifting within each group
groupby_cols=['State'] 

# Define lag columns to consider
lag_columns = ['Weekly_Sales', 'Temperature']#, 'Fuel_Price', 'CPI', 'Unemployment']

# Define the lag values to be used
lags = [1, 2, 4]  # Example lag values of 1 week and 2 weeks

state_encoder = 'label_encoder_State.pkl'

In [3]:
data_path = train_file

In [4]:
data_path

'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/train/Walmart_Weekly_train.csv'

## Load Dataset

In [5]:
def read(data_path, sheet_name = '', usecols = None):
    df = pd.DataFrame()
    if data_path.split('.')[-1] == 'xlsx':
        if sheet_name:
            df = pd.read_excel(data_path, sheet_name=sheet_name, usecols=usecols)
        else:
            df = pd.read_excel(data_path, usecols=usecols)
        print("Shape of the data in file {} is {}".format(data_path, df.shape))
    else:
        try:
            df = pd.read_csv(data_path)
            print("Shape of the data in file {} is {}".format(data_path, df.shape))
            if df.shape[0] == 0:
                print("No data in file {}".format(data_path))
        except Exception as e:
            print("Issue while reading data at {} \n{}".format(data_path, e))
    return df


def standardize_date_col(dataframe, date_col):
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%d-%m-%Y', errors='coerce').fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # Convert all dates to 'mm-dd-yyyy' format
    dataframe[date_col] = dataframe[date_col].dt.strftime('%Y-%m-%d')
    return dataframe

     

In [6]:
# Read data from csv or excel, sheet_name is the sheet in excel that contians data 
data = read(data_path, sheet_name= 'RAW')
data.head(3)

Shape of the data in file C:\Geeta\learning\projects\AnomalyDetectionSXM\Notebooks\Datasets\Pipeline/train/Walmart_Weekly_train.csv is (710, 11)


Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State
0,2021-02-05,10397622.73,37.2,2.58,200.61,7.55,0,0,11573691.83,9513064.59,Florida
1,2021-02-12,10378496.65,36.72,2.55,200.74,7.55,1,0,11032180.28,8971553.04,Florida
2,2021-02-19,10060556.61,39.7,2.52,200.79,7.55,0,0,10763335.98,8702708.74,Florida


In [7]:
# Sort the data by the 'Date' column in ascending order
data = data.sort_values('Date')

In [8]:
data.columns 

Index(['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag', 'Anomaly', 'Sales_Amount_Upper',
       'Sales_Amount_Lower', 'State'],
      dtype='object')

## Preprocessing data

In [9]:
# Group the data by 'State' and perform lag shifting within each group
grouped = data.groupby(groupby_cols)


# Create lag features within each group
for lag in lags:
    for col in lag_columns:
        data[f'{col}_lag_{lag}'] = grouped[col].shift(lag)



In [10]:
data.columns

Index(['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag', 'Anomaly', 'Sales_Amount_Upper',
       'Sales_Amount_Lower', 'State', 'Weekly_Sales_lag_1',
       'Temperature_lag_1', 'Weekly_Sales_lag_2', 'Temperature_lag_2',
       'Weekly_Sales_lag_4', 'Temperature_lag_4'],
      dtype='object')

In [11]:
def week_of_month(date):
    year, month, day = map(int, date.split('-'))
    first_day = datetime.date(year, month, 1)
    adjusted_dom = first_day.weekday() + 1
    return (day + adjusted_dom - 1) // 7 + 1

data['Week_Of_Month'] = data['Date'].map(week_of_month)
data.tail(5)

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State,Weekly_Sales_lag_1,Temperature_lag_1,Weekly_Sales_lag_2,Temperature_lag_2,Weekly_Sales_lag_4,Temperature_lag_4,Week_Of_Month
567,2023-10-20,7010766.86,64.35,3.89,171.39,7.76,0,0,7673165.54,6321721.07,Texas,7145936.23,62.08,7374009.41,69.81,6862360.28,71.96,4
283,2023-10-20,10675053.63,57.47,3.99,157.86,7.23,0,0,11633769.63,9552500.28,California,10637010.88,57.27,10980696.74,65.01,10377252.17,67.48,4
141,2023-10-20,9586825.29,65.2,3.61,211.89,5.67,0,0,10619124.44,8558497.2,Florida,9791831.37,59.96,10272747.97,65.9,9524678.34,68.14,4
425,2023-10-20,10839916.65,53.99,4.0,166.27,7.14,0,0,12511091.5,9660223.71,Ohio,11404108.95,50.59,11584279.26,60.68,10739024.57,61.21,4
709,2023-10-20,7009848.14,62.51,3.91,175.85,6.96,0,0,7584779.9,6363995.33,Virginia,7149626.82,61.82,7354905.93,68.47,6851231.75,70.84,4


In [12]:
# Encode categorical columns
encoder = LabelEncoder()
data['State'] = encoder.fit_transform(data['State'])

# Save the trained label encoder
joblib.dump(encoder, state_encoder)

# Split the data into features and target variable
X = data.drop(['Date', 'Anomaly', 'Sales_Amount_Upper', 'Sales_Amount_Lower'], axis=1)
y = data['Anomaly']

# Perform train-test split (80:20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Display the shapes of the datasets after splitting
X_train.shape, X_val.shape, X_test.shape


((454, 14), (114, 14), (142, 14))

In [13]:
X_train.head(3)

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Weekly_Sales_lag_1,Temperature_lag_1,Weekly_Sales_lag_2,Temperature_lag_2,Weekly_Sales_lag_4,Temperature_lag_4,Week_Of_Month
287,10153090.97,33.68,2.78,157.52,8.03,0,2,11858906.81,29.19,11616960.55,24.84,,,4
469,7591679.4,44.73,2.86,163.72,9.67,0,3,10608806.41,51.17,7059958.57,50.76,7362872.44,56.86,1
22,9494667.32,77.36,2.65,200.6,7.32,0,1,9781355.29,77.3,9380473.56,81.13,9712162.84,79.6,2


In [14]:
X_train.columns

Index(['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag', 'State', 'Weekly_Sales_lag_1', 'Temperature_lag_1',
       'Weekly_Sales_lag_2', 'Temperature_lag_2', 'Weekly_Sales_lag_4',
       'Temperature_lag_4', 'Week_Of_Month'],
      dtype='object')

## Training model

In [15]:
# Define a narrower set of hyperparameters
params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train the XGBoost model with the reduced set of hyperparameters
model = XGBClassifier(**params, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model performance on the validation set
validation_accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", validation_accuracy)
print("Validation Report:")
print(validation_report)

# Make predictions on the test set
y_test_pred = model.predict(X_test)


Validation Accuracy: 0.9210526315789473
Validation Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       102
           1       0.64      0.58      0.61        12

    accuracy                           0.92       114
   macro avg       0.79      0.77      0.78       114
weighted avg       0.92      0.92      0.92       114



## Saving model

In [16]:
# Save the trained model to a file
joblib.dump(model, model_file_path)

['C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/xgboostADmodel_Walmart_Weekly2024-04-12_23-14-36.pkl']

## Load Model

In [17]:
# Load the saved model from file
loaded_model = joblib.load(model_file_path)


## Validation

In [18]:
# Evaluate the model performance on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("\nTest Accuracy:", test_accuracy)
print("Test Report:")
print(test_report)


Test Accuracy: 0.9577464788732394
Test Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       127
           1       1.00      0.60      0.75        15

    accuracy                           0.96       142
   macro avg       0.98      0.80      0.86       142
weighted avg       0.96      0.96      0.95       142



## Inference

In [20]:
# Load the new dataset for inference
inference_data = pd.read_csv(inference_file, usecols=['Date','Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'Holiday_Flag','State'])

# Convert the 'Date' column to datetime format
inference_data['Date'] = pd.to_datetime(inference_data['Date']).dt.strftime('%Y-%m-%d')

# Get the saved Encoder for State column 
encoder = joblib.load(state_encoder)
inference_data['State'] = encoder.fit_transform(inference_data['State'])

In [21]:
data['Date'].max(), inference_data['Date'].min(), inference_data['Date'].isna().sum()


('2023-10-20', '2023-10-27', 0)

In [22]:
cols = inference_data.columns

In [23]:
inference_data_all = pd.concat([data[cols],inference_data])
inference_data_all

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State
0,2021-02-05,10397622.73,37.20,2.58,200.61,7.55,0,1
568,2021-02-05,6575770.40,38.33,2.72,167.21,8.99,0,4
142,2021-02-05,12535669.50,32.53,2.77,150.50,8.84,0,0
426,2021-02-05,8161946.14,39.26,2.71,162.99,9.68,0,3
284,2021-02-05,12079731.73,22.87,2.82,157.34,8.03,0,2
...,...,...,...,...,...,...,...,...
0,2023-10-27,9571068.57,66.45,3.53,211.92,5.67,0,1
1,2023-10-27,10764548.66,56.59,3.91,157.86,7.23,0,0
2,2023-10-27,11128639.43,57.00,3.91,166.20,7.14,0,2
3,2023-10-27,7096813.58,64.09,3.79,171.40,7.76,0,3


In [24]:

# Sort the data by the 'Date' column in ascending order
inference_data_all = inference_data_all.sort_values('Date')

# Create lag features for the relevant columns
for lag in lags:
    for col in lag_columns:
        inference_data_all[f'{col}_lag_{lag}'] = inference_data_all[col].shift(lag)

inference_data_all

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Weekly_Sales_lag_1,Temperature_lag_1,Weekly_Sales_lag_2,Temperature_lag_2,Weekly_Sales_lag_4,Temperature_lag_4
0,2021-02-05,10397622.73,37.20,2.58,200.61,7.55,0,1,,,,,,
568,2021-02-05,6575770.40,38.33,2.72,167.21,8.99,0,4,10397622.73,37.20,,,,
142,2021-02-05,12535669.50,32.53,2.77,150.50,8.84,0,0,6575770.40,38.33,10397622.73,37.20,,
426,2021-02-05,8161946.14,39.26,2.71,162.99,9.68,0,3,12535669.50,32.53,6575770.40,38.33,,
284,2021-02-05,12079731.73,22.87,2.82,157.34,8.03,0,2,8161946.14,39.26,12535669.50,32.53,10397622.73,37.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,2023-10-27,11128639.43,57.00,3.91,166.20,7.14,0,2,10839916.65,53.99,9586825.29,65.20,7010766.86,64.35
3,2023-10-27,7096813.58,64.09,3.79,171.40,7.76,0,3,11128639.43,57.00,10839916.65,53.99,10675053.63,57.47
0,2023-10-27,9571068.57,66.45,3.53,211.92,5.67,0,1,7096813.58,64.09,11128639.43,57.00,9586825.29,65.20
1,2023-10-27,10764548.66,56.59,3.91,157.86,7.23,0,0,9571068.57,66.45,7096813.58,64.09,10839916.65,53.99


In [26]:
inference_data_all[inference_data_all['Date']=='2023-10-27']

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Weekly_Sales_lag_1,Temperature_lag_1,Weekly_Sales_lag_2,Temperature_lag_2,Weekly_Sales_lag_4,Temperature_lag_4
2,2023-10-27,11128639.43,57.0,3.91,166.2,7.14,0,2,10839916.65,53.99,9586825.29,65.2,7010766.86,64.35
3,2023-10-27,7096813.58,64.09,3.79,171.4,7.76,0,3,11128639.43,57.0,10839916.65,53.99,10675053.63,57.47
0,2023-10-27,9571068.57,66.45,3.53,211.92,5.67,0,1,7096813.58,64.09,11128639.43,57.0,9586825.29,65.2
1,2023-10-27,10764548.66,56.59,3.91,157.86,7.23,0,0,9571068.57,66.45,7096813.58,64.09,10839916.65,53.99
4,2023-10-27,6983046.05,61.13,3.82,175.87,6.96,0,4,10764548.66,56.59,9571068.57,66.45,11128639.43,57.0


In [30]:
# Get the inference data back with lag values:
inference_data = inference_data_all[inference_data_all['Date']=='2023-10-27']
# Get week of the month value too before passing it to model
inference_data['Week_Of_Month'] = inference_data['Date'].map(week_of_month)
# Display the updated new dataset with lag features
inference_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inference_data['Week_Of_Month'] = inference_data['Date'].map(week_of_month)


Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Weekly_Sales_lag_1,Temperature_lag_1,Weekly_Sales_lag_2,Temperature_lag_2,Weekly_Sales_lag_4,Temperature_lag_4,Week_Of_Month
2,2023-10-27,11128639.43,57.0,3.91,166.2,7.14,0,2,10839916.65,53.99,9586825.29,65.2,7010766.86,64.35,5
3,2023-10-27,7096813.58,64.09,3.79,171.4,7.76,0,3,11128639.43,57.0,10839916.65,53.99,10675053.63,57.47,5
0,2023-10-27,9571068.57,66.45,3.53,211.92,5.67,0,1,7096813.58,64.09,11128639.43,57.0,9586825.29,65.2,5
1,2023-10-27,10764548.66,56.59,3.91,157.86,7.23,0,0,9571068.57,66.45,7096813.58,64.09,10839916.65,53.99,5
4,2023-10-27,6983046.05,61.13,3.82,175.87,6.96,0,4,10764548.66,56.59,9571068.57,66.45,11128639.43,57.0,5


In [43]:
# 1112863943

inference_data['Weekly_Sales'].iloc[0] = 111286394300

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  inference_data['Weekly_Sales'].iloc[0] = 111286394300
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inference

In [41]:
inference_data.iloc[0]

Date                    2023-10-27
Weekly_Sales          1112863943.0
Temperature                   57.0
Fuel_Price                    3.91
CPI                          166.2
Unemployment                  7.14
Holiday_Flag                     0
State                            2
Weekly_Sales_lag_1     10839916.65
Temperature_lag_1            53.99
Weekly_Sales_lag_2      9586825.29
Temperature_lag_2             65.2
Weekly_Sales_lag_4      7010766.86
Temperature_lag_4            64.35
Week_Of_Month                    5
Name: 2, dtype: object

In [38]:
inference_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 2 to 4
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                5 non-null      object 
 1   Weekly_Sales        5 non-null      float64
 2   Temperature         5 non-null      float64
 3   Fuel_Price          5 non-null      float64
 4   CPI                 5 non-null      float64
 5   Unemployment        5 non-null      float64
 6   Holiday_Flag        5 non-null      int64  
 7   State               5 non-null      int32  
 8   Weekly_Sales_lag_1  5 non-null      float64
 9   Temperature_lag_1   5 non-null      float64
 10  Weekly_Sales_lag_2  5 non-null      float64
 11  Temperature_lag_2   5 non-null      float64
 12  Weekly_Sales_lag_4  5 non-null      float64
 13  Temperature_lag_4   5 non-null      float64
 14  Week_Of_Month       5 non-null      int64  
dtypes: float64(11), int32(1), int64(2), object(1)
memory usage: 620.0+

In [44]:
# Perform inference using the loaded model
predictions = loaded_model.predict(inference_data.drop(columns=['Date']))

# Display the predictions
predictions

array([0, 0, 0, 0, 0])

## Saving inference

# End