In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os
import seaborn as sns
import matplotlib.pyplot as plt
import json

from sklearn.cluster import KMeans

In [None]:
# Load data
# Read the raw transactions file: comma-separated, first row is the header,
# bytes decoded as ISO-8859-1.
retail = pd.read_csv(
    'OnlineRetail.csv',
    header=0,
    sep=",",
    encoding="ISO-8859-1",
)


In [None]:
# Build the per-customer RFM table (Recency, Frequency, Amount) from `retail`.

# Drop rows that have no CustomerID *before* casting to str: astype(str) turns
# NaN into the literal string 'nan', which would otherwise be grouped as one
# bogus "customer" aggregating every anonymous transaction.
# .copy() makes the filtered frame independent so the column assignments below
# do not trigger chained-assignment warnings.
retail = retail.dropna(subset=['CustomerID']).copy()
retail['CustomerID'] = retail['CustomerID'].astype(str)

# Line-level amount = quantity * unit price.
retail['Amount'] = retail['Quantity'] * retail['UnitPrice']

# Amount: total amount per customer.
rfm_m = retail.groupby('CustomerID')['Amount'].sum().reset_index()

# Frequency: count of non-null InvoiceNo rows per customer.
rfm_f = retail.groupby('CustomerID')['InvoiceNo'].count().reset_index()
rfm_f.columns = ['CustomerID', 'Frequency']

# Recency: whole days between each customer's most recent row and the most
# recent row in the whole data set.
retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'], format='%d-%m-%Y %H:%M')
# Series.max() is vectorised and skips NaT, unlike the builtin max().
max_date = retail['InvoiceDate'].max()
retail['Diff'] = max_date - retail['InvoiceDate']
rfm_p = retail.groupby('CustomerID')['Diff'].min().reset_index()
rfm_p['Diff'] = rfm_p['Diff'].dt.days

# Combine the three metrics; inner joins keep only customers present in all of them.
rfm = pd.merge(rfm_m, rfm_f, on='CustomerID', how='inner')
rfm = pd.merge(rfm, rfm_p, on='CustomerID', how='inner')
rfm.columns = ['CustomerID', 'Amount', 'Frequency', 'Recency']


In [None]:
# Remove outliers
# Keep only customers whose Amount, Frequency and Recency all lie inside
# [q05 - 1.5 * spread, q95 + 1.5 * spread], where spread = q95 - q05.
# (Q1/Q3/IQR are the 5th/95th percentiles and their difference here, not true quartiles.)
rfm_features = ['Amount', 'Frequency', 'Recency']

# Quantiles are taken over the numeric feature columns only: `rfm` also holds
# the string CustomerID column, and DataFrame.quantile raises on non-numeric
# columns in pandas >= 2.0 (numeric_only defaults to False).
Q1 = rfm[rfm_features].quantile(0.05)
Q3 = rfm[rfm_features].quantile(0.95)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Look bounds up by column label instead of integer position (Q1[0], Q1[1], ...):
# positional fallback on a label-indexed Series is deprecated in pandas.
# All bounds come from the unfiltered frame, so AND-ing the per-column masks is
# equivalent to filtering one column after another.
in_range = pd.Series(True, index=rfm.index)
for col in rfm_features:
    in_range &= (rfm[col] >= lower_fence[col]) & (rfm[col] <= upper_fence[col])
rfm = rfm[in_range]

# Feature matrix used for scaling and clustering (CustomerID excluded).
rfm_df = rfm[['Amount', 'Frequency', 'Recency']]

In [None]:
# Standardise the RFM features with StandardScaler and wrap the resulting
# array back into a DataFrame carrying the original feature names.
# Note: the new frame gets a fresh default index, not rfm_df's index.
scaler = StandardScaler()
rfm_df_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_df),
    columns=['Amount', 'Frequency', 'Recency'],
)

In [None]:
# Cluster the standardised RFM features into 3 groups with KMeans.
# random_state is pinned so the centroid initialisation is repeatable.
kmeans = KMeans(
    random_state=42,
    n_clusters=3,
)
kmeans.fit(rfm_df_scaled)

In [None]:
# Save the KMeans model
# Write through a context manager so the file handle is flushed and closed
# even if pickling fails (a bare open() inside the call leaks the handle).
# NOTE(review): `kmeans` was fit on StandardScaler-transformed features and the
# fitted `scaler` is not persisted in the cells visible here -- anyone loading
# this model must apply the same scaling before calling predict().
with open('kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

In [None]:
# Generate a companion notebook ('model.ipynb') containing a single code cell
# that sketches how to load the pickled model and predict on new data.
import nbformat as nbf

# Source text of the generated cell.
# NOTE(review): it calls preprocess_data(), which the generated code does not
# define -- the reader of model.ipynb has to supply it.
example_usage_source = "# Example usage:\n\nimport pickle\nimport pandas as pd\n\nkmeans_model = pickle.load(open('kmeans_model.pkl', 'rb'))\n\n# Load your new data\nnew_data = pd.read_csv('new_data.csv')\n\n# Preprocess your new data\ndata_preprocessed = preprocess_data(new_data)\n\n# Predict clusters for new data\nclusters = kmeans_model.predict(data_preprocessed)\n\n# Do something with the predicted clusters...\n"

code_cells = [nbf.v4.new_code_cell(example_usage_source)]

# Assemble an empty v4 notebook, attach the cell list, and write it to disk.
nb = nbf.v4.new_notebook()
nb['cells'] = code_cells
nbf.write(nb, 'model.ipynb')

In [None]:
import json

# Example mapping: three '*_img' keys, each paired with an RFM feature name.
data = dict(
    amount_img='Amount',
    freq_img='Frequency',
    recency_img='Recency',
)

# Serialise the mapping to a JSON string and echo it.
json_data = json.dumps(data)
print(json_data)

In [None]:

import pandas as pd

# Read the retail CSV into a separate frame (comma-separated, header on the
# first row, decoded as ISO-8859-1).
dataset = pd.read_csv(
    'OnlineRetail.csv',
    header=0,
    sep=",",
    encoding="ISO-8859-1",
)

# Serialise the whole frame as a JSON array with one object per row.
# Note: this rebinds `json_data`, replacing any earlier value of that name.
json_data = dataset.to_json(orient='records')