# Market Direction Prediction

## 1. Data Loading and Preprocessing

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('../dataset/SriLanka_Supermarket_Sales.csv')

# Clean currency columns
currency_cols = ['Unit price', 'Tax 5%', 'Sales', 'cogs', 'gross income']
for col in currency_cols:
    df[col] = df[col].replace({'LKR ': '', ',': ''}, regex=True).astype(float)

# Convert 'Date' to datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# Set 'Date' as the index
df.set_index('Date', inplace=True)

# Display the first few rows
df.head()

Unnamed: 0_level_0,Invoice ID,Branch,City,Customer type,Gender,Product line,Product Code,Unit price,Quantity,Tax 5%,Sales,Time,Payment,cogs,gross margin percentage,gross income,Rating,Inventory Level,Promotion,Restock Interval (Days)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2025-06-12,278-37-5752,Matara,Matara,Member,Male,Meat,P009,302.33,4,60.47,1269.79,6:26:21 AM,,1209.32,4.761905,60.47,8.0,118,Yes,7
2024-10-28,483-53-5622,Galle,Galle,Normal,Female,Beverages,P002,234.51,10,117.25,2462.35,5:58:43 AM,Mobile Pay,2345.1,4.761905,117.25,8.9,104,No,10
2025-07-20,562-74-1493,Colombo,Colombo,Normal,Male,Frozen Foods,P006,201.06,8,80.42,1688.9,4:30:06 PM,Credit Card,1608.48,4.761905,80.42,9.0,159,No,10
2024-08-21,904-20-6783,Kandy,Kandy,Member,Male,Household,P003,279.93,6,83.98,1763.56,4:10:05 PM,E-wallet,1679.58,4.761905,83.98,8.8,101,No,10
2025-05-18,382-87-2928,Colombo,Colombo,Normal,Male,Household,P003,414.51,2,41.45,870.47,1:51:09 AM,E-wallet,829.02,4.761905,41.45,9.2,179,No,7


## 2. Aggregate Sales Data

In [2]:
# Resample the data to get monthly sales
monthly_sales = df['Sales'].resample('M').sum().to_frame()

# Display the first few rows
monthly_sales.head()

Unnamed: 0_level_0,Sales
Date,Unnamed: 1_level_1
2024-07-31,186679.52
2024-08-31,571467.57
2024-09-30,593946.6
2024-10-31,712862.29
2024-11-30,611343.42


## 3. Feature Engineering

In [3]:
# Create lagged features
for i in range(1, 4):
    monthly_sales[f'lag_{i}'] = monthly_sales['Sales'].shift(i)

# Create rolling mean
monthly_sales['rolling_mean_3'] = monthly_sales['Sales'].rolling(window=3).mean().shift(1)

# Display the dataframe with new features
monthly_sales.head()

Unnamed: 0_level_0,Sales,lag_1,lag_2,lag_3,rolling_mean_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-07-31,186679.52,,,,
2024-08-31,571467.57,186679.52,,,
2024-09-30,593946.6,571467.57,186679.52,,
2024-10-31,712862.29,593946.6,571467.57,186679.52,450697.896667
2024-11-30,611343.42,712862.29,593946.6,571467.57,626092.153333


## 4. Create Target Variable

In [4]:
# Create the target variable 'Direction'
monthly_sales['Direction'] = (monthly_sales['Sales'] > monthly_sales['Sales'].shift(1)).astype(int)

# Drop rows with NaN values created by lagging and rolling windows
monthly_sales.dropna(inplace=True)

# Display the final dataframe
monthly_sales.head()

Unnamed: 0_level_0,Sales,lag_1,lag_2,lag_3,rolling_mean_3,Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-10-31,712862.29,593946.6,571467.57,186679.52,450697.896667,1
2024-11-30,611343.42,712862.29,593946.6,571467.57,626092.153333,0
2024-12-31,599413.33,611343.42,712862.29,593946.6,639384.103333,0
2025-01-31,638159.2,599413.33,611343.42,712862.29,641206.346667,1
2025-02-28,536084.0,638159.2,599413.33,611343.42,616305.316667,0


## 5. Model Training

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define features (X) and target (y)
X = monthly_sales.drop(['Sales', 'Direction'], axis=1)
y = monthly_sales['Direction']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

## 6. Model Evaluation

In [6]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



## 7. Predict Next Month's Direction

In [7]:
# Get the last row of the feature data to predict the next month\n
last_data_point = X.iloc[-1:].values
# Predict the direction\n
prediction = model.predict(last_data_point)
if prediction[0] == 1:
    print('The model predicts that the market direction for the next month will be UP.')
else:
    print('The model predicts that the market direction for the next month will be DOWN.')

The model predicts that the market direction for the next month will be DOWN.


  "X does not have valid feature names, but"


## 8. Save the Model

In [8]:
import joblib
# Save the model to a file\n
joblib.dump(model, '../deployment/market_direction_model.pkl')

['../deployment/market_direction_model.pkl']