In [2]:
# model_training.ipynb
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# Load the data
def load_data(file_path):
    """Read the transactions workbook and normalize its identifier columns.

    Parameters
    ----------
    file_path
        Location of the Excel file; passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet contents with ``StockCode`` cast to ``str`` and
        ``CustomerID`` cast to ``int`` after missing IDs are replaced by ``0``.
    """
    raw = pd.read_excel(file_path)
    return raw.assign(
        # Stock codes can mix numeric and alphanumeric values; keep them all as text.
        StockCode=lambda frame: frame['StockCode'].astype(str),
        # Missing customer IDs become the placeholder 0 so the column can be integer-typed.
        CustomerID=lambda frame: frame['CustomerID'].fillna(0).astype(int),
    )

# Train the model
def train_model(data, n_clusters=4, random_state=42, model_dir='.'):
    """Build per-customer RFM features, cluster them, and fit a classifier on the clusters.

    The transactions are aggregated per ``CustomerID`` into Recency (days
    between the latest invoice in ``data`` and the customer's latest invoice),
    Frequency (number of distinct ``InvoiceNo``) and Monetary (sum of
    ``Quantity * UnitPrice``). The three features are standardized and
    clustered with KMeans; a RandomForestClassifier is then trained to predict
    the cluster label from the unscaled RFM values. The fitted classifier,
    scaler and KMeans model are written with ``joblib.dump`` to
    ``random_forest_model.pkl``, ``scaler.pkl`` and ``kmeans_model.pkl``
    inside ``model_dir``.

    ``data`` itself is left unmodified: the date conversion and line totals
    are computed on a separate working frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions with the columns ``CustomerID``, ``InvoiceDate``,
        ``InvoiceNo``, ``Quantity`` and ``UnitPrice``.
    n_clusters : int, default 4
        Number of KMeans clusters.
    random_state : int, default 42
        Seed passed to KMeans, ``train_test_split`` and the random forest.
    model_dir : str or path-like, default '.'
        Directory that receives the three ``.pkl`` files; created if missing.

    Returns
    -------
    tuple
        ``(rfm, clf, scaler, kmeans)`` where ``rfm`` has the columns
        ``CustomerID``, ``Recency``, ``Frequency``, ``Monetary`` and ``Cluster``.

    Raises
    ------
    KeyError
        If any required column is absent from ``data``.
    """
    from pathlib import Path  # local import: not provided by the notebook's import cell

    required = ['CustomerID', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'UnitPrice']
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"train_model: missing required column(s): {missing}")

    # Derive everything on a working frame so the caller's DataFrame is never mutated.
    # The line total is computed once, vectorized, instead of looking UnitPrice up
    # again inside every group (which also breaks on duplicate index labels).
    transactions = data[['CustomerID', 'InvoiceNo']].assign(
        InvoiceDate=pd.to_datetime(data['InvoiceDate']),
        LineTotal=data['Quantity'] * data['UnitPrice'],
    )
    reference_date = transactions['InvoiceDate'].max()

    # Data processing for RFM (Recency, Frequency, Monetary)
    per_customer = transactions.groupby('CustomerID').agg(
        LastPurchase=('InvoiceDate', 'max'),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('LineTotal', 'sum'),
    )
    per_customer['Recency'] = (reference_date - per_customer['LastPurchase']).dt.days

    feature_cols = ['Recency', 'Frequency', 'Monetary']
    rfm = per_customer[feature_cols].reset_index()

    # Scale features
    scaler = StandardScaler()
    scaled_rfm = scaler.fit_transform(rfm[feature_cols])

    # Train KMeans clustering. n_init is pinned because its default differs
    # between scikit-learn releases, which would change the saved clusters.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    rfm['Cluster'] = kmeans.fit_predict(scaled_rfm)

    # Train RandomForest for supervised classification on 80% of the customers.
    # The remaining 20% are withheld from training but are not scored here.
    X_train, _, y_train, _ = train_test_split(
        rfm[feature_cols], rfm['Cluster'], test_size=0.2, random_state=random_state
    )
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    # Save the models
    out_dir = Path(model_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(clf, out_dir / 'random_forest_model.pkl')
    joblib.dump(scaler, out_dir / 'scaler.pkl')
    joblib.dump(kmeans, out_dir / 'kmeans_model.pkl')

    return rfm, clf, scaler, kmeans

# Example usage: load the transactions workbook, then train and persist the models.
file_path = 'data.xlsx'  # Change to your file path
data = load_data(file_path)
# train_model returns the per-customer RFM table (including a Cluster column) together
# with the fitted classifier, scaler and KMeans model; it also writes three .pkl files to disk.
rfm, clf, scaler, kmeans = train_model(data)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Step 1: Read the full transaction table from the Excel workbook
data = pd.read_excel('data.xlsx')

# Step 2: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Make sure both output folders exist before anything is written
for out_dir in ('train', 'test'):
    os.makedirs(out_dir, exist_ok=True)

# Step 4: Write each partition to its own Excel file, leaving out the index column
for subset, target in (
    (train_data, 'train/train_data.xlsx'),
    (test_data, 'test/test_data.xlsx'),
):
    subset.to_excel(target, index=False)

print("Data split into training and testing sets successfully.")
