<a href="https://colab.research.google.com/github/GuanRuLai/Python-Project-DeepFM-Commodity-Recommendation-Model/blob/main/deepfm_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download dataset from Kaggle

## Set API key

In [None]:
import os
from getpass import getpass

# Kaggle credentials must never be committed to the notebook.
# They are taken from the environment when already set (e.g. Colab secrets or
# a local shell export); otherwise the user is prompted, and the API key is
# read with getpass so it is not echoed into the cell output.
# NOTE(review): a key that was previously hard-coded in this cell is exposed in
# the repository history and should be revoked and regenerated on Kaggle.
if not os.environ.get("KAGGLE_USERNAME"):
    os.environ["KAGGLE_USERNAME"] = input("Kaggle username: ").strip()

if not os.environ.get("KAGGLE_KEY"):
    os.environ["KAGGLE_KEY"] = getpass("Kaggle API key: ").strip()

## Download dataset

In [None]:
# Download the Flipkart mobiles dataset archive with the Kaggle CLI.
# The CLI authenticates with the KAGGLE_USERNAME / KAGGLE_KEY environment variables set in the previous cell.
!kaggle datasets download -d devsubhash/flipkart-mobiles-dataset

Dataset URL: https://www.kaggle.com/datasets/devsubhash/flipkart-mobiles-dataset
License(s): CC0-1.0
Downloading flipkart-mobiles-dataset.zip to /content
  0% 0.00/54.4k [00:00<?, ?B/s]
100% 54.4k/54.4k [00:00<00:00, 9.56MB/s]


## Uncompress the dataset

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the local "data" directory.
archive = zipfile.ZipFile("flipkart-mobiles-dataset.zip", mode="r")
with archive:
    archive.extractall(path="data")

# Install necessary libraries

In [None]:
# Install DeepCTR-Torch, PyTorch, scikit-learn and TensorFlow, which the import cell below relies on.
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
!pip install deepctr-torch torch scikit-learn tensorflow

Collecting deepctr-torch
  Downloading deepctr_torch-0.2.9-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
C

# Import libraries

In [None]:
import torch
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from torch.optim import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
import pickle

# Read data

In [None]:
# Load the extracted CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="/content/data/Flipkart_Mobiles.csv")
print(df.head(n=10))

  Brand Model            Color Memory Storage  Rating  Selling Price  \
0  OPPO   A53  Moonlight Black   4 GB   64 GB     4.5          11990   
1  OPPO   A53       Mint Cream   4 GB   64 GB     4.5          11990   
2  OPPO   A53  Moonlight Black   6 GB  128 GB     4.3          13990   
3  OPPO   A53       Mint Cream   6 GB  128 GB     4.3          13990   
4  OPPO   A53   Electric Black   4 GB   64 GB     4.5          11990   
5  OPPO   A53   Electric Black   6 GB  128 GB     4.3          13990   
6  OPPO   A12        Deep Blue   4 GB   64 GB     4.4          10490   
7  OPPO   A12            Black   3 GB   32 GB     4.4           9490   
8  OPPO   A12             Blue   3 GB   32 GB     4.4           9490   
9  OPPO   A12   Flowing Silver   3 GB   32 GB     4.4           9490   

   Original Price  
0           15990  
1           15990  
2           17990  
3           17990  
4           15990  
5           17990  
6           11990  
7           10990  
8           10990  
9      

# Adjust data

In [None]:
# Handle missing values: report the per-column null counts, then drop incomplete rows.
print(df.isnull().sum())
df = df.dropna().reset_index(drop=True)

# Add "Price Difference" column to replace the "Original Price" column
df["Price Difference"] = df["Original Price"] - df["Selling Price"]
df = df.drop(columns=["Original Price"])

# Modify "Memory" and "Storage" columns: keep only the numeric part of each value.
df = df.rename(columns={"Memory": "Memory(GB/MB)", "Storage": "Storage(GB)"})
# Keep the decimal point for memory so that e.g. "1.5 GB" parses as 1.5 instead of 15.
df["Memory(GB/MB)"] = df["Memory(GB/MB)"].str.replace(r"[^\d.]", "", regex=True).astype(float)
df["Storage(GB)"] = df["Storage(GB)"].str.replace(r"\D", "", regex=True).astype(int)

# The customer IDs and click labels below are synthetic; a dedicated seeded
# generator makes them identical on every Restart & Run All.
rng = np.random.default_rng(42)

# Add "Customer ID" column
num_rows = len(df)

num_customers = -(-num_rows // 3)  # ceil(num_rows / 3): each customer ID is used at most three times
customer_ids = np.repeat(np.arange(1, num_customers + 1), 3)
rng.shuffle(customer_ids)
df.insert(0, "Customer ID", customer_ids[:num_rows])  # Insert leftmost column

# Add "Clicked" column (random 0/1 label)
df["Clicked"] = rng.integers(0, 2, size=num_rows)

print(df.head())
print(df.dtypes)

Brand               0
Model               0
Color               0
Memory             43
Storage            39
Rating            144
Selling Price       0
Original Price      0
dtype: int64
   Customer ID Brand Model            Color  Memory(GB/MB)  Storage(GB)  \
0          702  OPPO   A53  Moonlight Black            4.0           64   
1           83  OPPO   A53       Mint Cream            4.0           64   
2          219  OPPO   A53  Moonlight Black            6.0          128   
3          662  OPPO   A53       Mint Cream            6.0          128   
4          919  OPPO   A53   Electric Black            4.0           64   

   Rating  Selling Price  Price Difference  Clicked  
0     4.5          11990              4000        1  
1     4.5          11990              4000        1  
2     4.3          13990              4000        1  
3     4.3          13990              4000        1  
4     4.5          11990              4000        1  
Customer ID           int64
Brand   

# Data preprocessing

In [None]:
# Define feature categories
emb_features = ["Customer ID", "Brand", "Model", "Color", "Memory(GB/MB)", "Storage(GB)"]
dense_features_standard = ["Selling Price", "Price Difference"]
dense_features_minmax = ["Rating"]

# Label Encoding for categorical features.
# The encoders are fitted on the full frame so that every category present in
# either split receives an id.
label_encoders = {}

for feat in emb_features:
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat]) + 1  # Make sure Label Encoding starts from 1 (0 is reserved for padding)
    label_encoders[feat] = lbe

# Store the fitted encoders so the ids can be decoded later
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

# Split data into training and testing set BEFORE fitting the scalers, so that
# test-set statistics do not leak into the training features.
# NOTE: the dense columns of `df` itself are left unscaled; only the two splits are scaled.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_data.copy()
test_data = test_data.copy()

# Standard scaling (fit on the training split only)
scaler1 = StandardScaler()
train_data[dense_features_standard] = scaler1.fit_transform(train_data[dense_features_standard])
test_data[dense_features_standard] = scaler1.transform(test_data[dense_features_standard])

# Min-max scaling for the rating column (fit on the training split only)
scaler2 = MinMaxScaler()
train_data[dense_features_minmax] = scaler2.fit_transform(train_data[dense_features_minmax])
test_data[dense_features_minmax] = scaler2.transform(test_data[dense_features_minmax])

# Store the fitted scalers so the original units can be restored later
with open("scalers.pkl", "wb") as f:
    pickle.dump({"standard": scaler1, "minmax": scaler2}, f)

# Create independent variables required to build the DeepFM model
# Embedding is used for category features, and numerical features are input directly.
feature_columns = (
    [SparseFeat(feat, df[feat].nunique() + 2, embedding_dim=16) for feat in emb_features] +
    [DenseFeat(feat, 1) for feat in dense_features_standard] +
    [DenseFeat(feat, 1) for feat in dense_features_minmax]
)
feature_names = get_feature_names(feature_columns)


def _build_model_input(frame):
    """Return {feature name: array} for `frame`: int32 for embedding features, float32 otherwise."""
    return {
        name: np.array(frame[name].values, dtype=np.int32 if name in emb_features else np.float32)
        for name in feature_names
    }


train_model_input = _build_model_input(train_data)
test_model_input = _build_model_input(test_data)

print(train_model_input)
print("-" * 100)

# Create dependent variables
y_train_ctr = np.array(train_data['Clicked'].values, dtype=np.float32)
print(y_train_ctr)
print("-" * 100)

y_test_ctr = np.array(test_data['Clicked'].values, dtype=np.float32)

{'Customer ID': array([ 95, 649, 146, ..., 193, 331, 653], dtype=int32), 'Brand': array([12, 10,  4, ...,  2,  2, 11], dtype=int32), 'Model': array([123, 263,  41, ..., 866, 856,   5], dtype=int32), 'Color': array([352, 209, 390, ..., 526, 244,  46], dtype=int32), 'Memory(GB/MB)': array([3, 3, 4, ..., 1, 4, 4], dtype=int32), 'Storage(GB)': array([ 8,  7,  8, ...,  6, 15,  3], dtype=int32), 'Selling Price': array([-0.36795014, -0.44345537,  1.7561316 , ...,  0.5804727 ,
        3.6636717 , -0.87991416], dtype=float32), 'Price Difference': array([-0.35683632, -0.35683632, -0.35683632, ..., -0.35683632,
       -0.35683632, -0.35683632], dtype=float32), 'Rating': array([0.7777778, 0.6666667, 0.8148148, ..., 0.8148148, 0.8518519,
       0.7037037], dtype=float32)}
----------------------------------------------------------------------------------------------------
[1. 1. 1. ... 0. 0. 1.]
----------------------------------------------------------------------------------------------------


# Define DeepFM model

In [None]:
# Train on the first CUDA device when one is available; otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# DeepFM for click prediction: the same feature columns feed both the
# linear/FM part and the DNN part of the model.
model_ctr = DeepFM(
    linear_feature_columns=feature_columns,
    dnn_feature_columns=feature_columns,
    task="binary",
    device=device
)

# Model compiling

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, AUC as the reported metric.
model_ctr.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["auc"],
)

# Model training

In [None]:
# Train for 30 epochs in batches of 256, holding out 20% of the training rows for validation.
model_ctr.fit(
    x=train_model_input,
    y=y_train_ctr,
    batch_size=256,
    epochs=30,
    verbose=2,
    validation_split=0.2,
)

cpu
Train on 1853 samples, validate on 464 samples, 8 steps per epoch
Epoch 1/30
0s - loss:  0.6934 - auc:  0.5106 - val_auc:  0.5055
Epoch 2/30
0s - loss:  0.6898 - auc:  0.7705 - val_auc:  0.5097
Epoch 3/30
0s - loss:  0.6856 - auc:  0.8387 - val_auc:  0.5094
Epoch 4/30
0s - loss:  0.6787 - auc:  0.8710 - val_auc:  0.5122
Epoch 5/30
0s - loss:  0.6659 - auc:  0.8971 - val_auc:  0.5155
Epoch 6/30
0s - loss:  0.6399 - auc:  0.9231 - val_auc:  0.5176
Epoch 7/30
0s - loss:  0.5945 - auc:  0.9377 - val_auc:  0.5202
Epoch 8/30
0s - loss:  0.5191 - auc:  0.9566 - val_auc:  0.5165
Epoch 9/30
0s - loss:  0.4250 - auc:  0.9677 - val_auc:  0.5145
Epoch 10/30
0s - loss:  0.3373 - auc:  0.9758 - val_auc:  0.5122
Epoch 11/30
0s - loss:  0.2630 - auc:  0.9872 - val_auc:  0.5139
Epoch 12/30
0s - loss:  0.2012 - auc:  0.9924 - val_auc:  0.5064
Epoch 13/30
0s - loss:  0.1492 - auc:  0.9957 - val_auc:  0.5126
Epoch 14/30
0s - loss:  0.1106 - auc:  0.9972 - val_auc:  0.5040
Epoch 15/30
0s - loss:  0.083

<tensorflow.python.keras.callbacks.History at 0x7f7f745b5cd0>

# Model prediction

In [None]:
# Predict on the test inputs and flatten the predictions into a 1-D array.
pred_ctr = model_ctr.predict(test_model_input, batch_size=256).flatten()

# Pair each test row's (encoded) customer id with its predicted CTR.
result_df = test_data[["Customer ID"]].assign(pred_CTR=pred_ctr)

result_df.head(10)

Unnamed: 0,Customer ID,pred_CTR
141,484,2.110983e-05
2477,377,0.002845686
2781,111,0.9969607
1710,768,0.9783269
1926,73,2.400805e-09
544,14,8.148954e-10
2404,796,1.921398e-06
1149,630,0.008720703
1133,940,2.992912e-05
322,809,0.9949163


# Confusion matrix & model evaluation

In [None]:
# Turn probabilities into hard 0/1 labels at a 0.5 threshold, then report
# the confusion matrix and overall accuracy on the test split.
pred_labels = np.greater(pred_ctr, 0.5).astype(int)
cm = confusion_matrix(y_true=y_test_ctr, y_pred=pred_labels)
print(cm)
print(accuracy_score(y_true=y_test_ctr, y_pred=pred_labels))

[[165 121]
 [163 131]]
0.5103448275862069


# Application: For each user, the 5 products with the highest predicted CTR are selected as recommended results.

In [None]:
# Work on an explicit copy so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning on the frame returned by train_test_split.
test_data = test_data.copy()
test_data["pred_CTR"] = pred_ctr.flatten()

# Restore label encoding
# (only unpickle files produced by this notebook; pickle is unsafe on untrusted data)
with open("label_encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

for feat in emb_features:
    codes = test_data[feat].to_numpy()
    known = codes > 0  # encoded ids start at 1; 0 is the padding / unknown id
    if known.all():
        # One vectorised call per column instead of one inverse_transform per row.
        test_data[feat] = label_encoders[feat].inverse_transform(codes - 1)
    else:
        decoded = np.full(len(codes), "Unknown", dtype=object)
        decoded[known] = label_encoders[feat].inverse_transform(codes[known] - 1)
        test_data[feat] = decoded

# Restore scaling
with open("scalers.pkl", "rb") as f:
    scalers = pickle.load(f)

test_data[dense_features_standard] = scalers["standard"].inverse_transform(test_data[dense_features_standard].values)
test_data[dense_features_minmax] = scalers["minmax"].inverse_transform(test_data[dense_features_minmax].values)

# For every customer keep the 5 rows with the highest predicted CTR.
# Sorting first and taking the head of each group avoids the deprecated
# groupby.apply-on-grouping-columns behaviour.
top_recommendations = (
    test_data
    .sort_values(["Customer ID", "pred_CTR"], ascending=[True, False])
    .groupby("Customer ID")
    .head(5)
    .reset_index(drop=True)
)
print(top_recommendations[["Customer ID", "Brand", "Model", "Color", "Memory(GB/MB)", "Storage(GB)", "Selling Price", "pred_CTR"]].head(20))

    Customer ID    Brand                   Model          Color  \
0             2     vivo                 Y83 Pro  Nebula Purple   
1             4    Apple          iPhone 7 Plus       Rose Gold   
2             7  Infinix               Hot 7 Pro    Mocha Brown   
3             8   realme                      9i    Prism Black   
4             9   Xiaomi              11 Lite NE  Tuscany Coral   
5            12    Apple         iPhone 12 Mini           Green   
6            12  SAMSUNG         Galaxy J2 Core            Blue   
7            13     OPPO                     A15  Dynamic Black   
8            13    Nokia                  XPlus    Bright Green   
9            14   realme        Narzo 30 Pro 5G    Blade Silver   
10           15  SAMSUNG        Galaxy S6 Edge+   Gold Platinum   
11           17   GIONEE                   P5_W           Black   
12           18    Apple      iPhone 11 Pro Max      Space Grey   
13           18    Nokia  RM-1172 / Nokia 230 DS          Blac

  top_recommendations = test_data.groupby("Customer ID").apply(lambda x: x.nlargest(5, "pred_CTR")).reset_index(drop=True)
