In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time

In [2]:
import matplotlib.pyplot as plt

In [3]:
from functools import *
import json

In [4]:
dat = pd.read_csv("/Users/stevetran/Downloads/VinIDRecruitChallenge/VinIDRecruitChallenge_MLTrack_DataSet.csv")

In [5]:
dat["date"] = pd.to_datetime(dat["date"])

In [6]:
dat["day_of_week"] = dat["date"].map(lambda d: d.day_name())

In [7]:
dat.sort_values(by=["date"],inplace=True)

In [8]:
df_flatten = pd.read_csv("flatten.csv",date_parser=["date"])

In [9]:
df_flatten["date"] = pd.to_datetime(df_flatten["date"])

In [10]:
df_flatten.sort_values(by=["date"],ascending=True,inplace=True)

In [11]:
df_flatten["salesquantity"] = df_flatten["salesquantity"].map(np.ceil).astype('uint')

# Problem:
Predict which customers make at least 1 purchase in a given month using features generated from the 2 previous months

We can rephrase this problem into ML: The probability of repeatable buyers in next month.
We limit the problem that are looked back at least two months data

# Dataset

We have dataset from Feb to July, so we split the dataset
- Training: From (Feb, March, April), (March, April, May)
- Testing: (April,May,(June,July))

## Metric
We evaluate the model using F1 score, False Alarm score

In [12]:
train_dat = df_flatten[df_flatten["date"]<="2018-06-01"].copy(deep=True)
test_dat = df_flatten[df_flatten["date"]>"2018-06-01"].copy(deep=True)

In [13]:
len(train_dat["csn"].unique())

18032

In [14]:
len(test_dat["csn"].unique())

9002

In [15]:
def calculate_prior_items(data):
    last_purchases_user_items = {}
    items_priors_day = np.zeros(len(data),dtype="uint")
    for user_id in data["csn"].unique():
        last_purchases_user_items[user_id] = {}
        
    idx = 0
    for _, row in data.sort_values(by=["date"],ascending=True).iterrows():
        user_id = row["csn"]
        item_id = row["article"]
        timestamp = row["date"]
        delta = 0
        if item_id in last_purchases_user_items[user_id]:        
            last_time_purchase = last_purchases_user_items[user_id][item_id]
            delta = (timestamp - last_time_purchase).days
        last_purchases_user_items[user_id][item_id] = timestamp
        items_priors_day[idx] = delta
        idx+=1
    return items_priors_day

In [16]:
def calculate_dow_features(data):
    features = {}
    for name, group in data.groupby(["csn"]):
        dow_cnt = np.zeros(7,dtype=np.uint)
        for idx in group["date"].map(lambda d: d.dayofweek).unique():
            dow_cnt[idx] += 1        
        
        features[name] = dow_cnt
    return pd.DataFrame(features).T

In [17]:
def calculate_reorder(data):
    features = {}
    for name, group in data.groupby(["csn"]):
        features[name] = {}
        sample = group.copy(deep=True)
        sample["prior_items"] = calculate_prior_items(sample)        
        reorder_products = sample.query("prior_items>0")["article"].nunique()
        total_products = sample["article"].nunique()
        ratio = 1.0 * reorder_products / total_products
        new_products = total_products - reorder_products
        
        features[name]["reorder_rate"] = ratio
        features[name]["new_product"] = int(new_products)
    return pd.DataFrame(features).T

In [18]:
agg_func = ['min','max','sum','std','mean','median']
def compute_last_n_days(data,time_max,lookbacks=[1,2,3,7,14,21,30]):    
    features = {}
    for user_id, group in data.groupby(["csn"]):
        features[user_id] = {}
        tmp = {}
        
#         s1 = time.time()
        group = group.set_index(["date"])
#         s2 = time.time()
        
        for lookback in lookbacks:
            timestamp = time_max - timedelta(days=lookback)
            stats = {
                    "unique_products":0,
                    "total_quantity": 0,
                    "sales": 0
            }
            if timestamp in group.index:
                df_filter = group.loc[timestamp:timestamp]
                stats = {
                    "unique_products":df_filter["article"].nunique(),
                    "total_quantity": df_filter["salesquantity"].sum(),
                    "sales": df_filter["price"].sum()
                }
            for k, v in stats.items():
                k_features = "last_{days}_days_{metric}".format(days=lookback,metric=k)
                features[user_id][k_features] = int(v)
        
#         s3 = time.time()
#         print("Delta: ",(s2-s1),(s3-s2))
        group = group.reset_index()
        
        last_purchase_date = group["date"].max()
        delta_days = (time_max - last_purchase_date).days
        features[user_id]["user_last_purchase_date"] = int(delta_days)
        
        features[user_id]["avg_products_in_cart"] = group.groupby(["date"])["article"].count().mean()
        features[user_id]["number_of_products"] = group.groupby(["date"])["article"].count().sum()
        
        stats = group.groupby(["date"]).agg(
        {
            "salesquantity":agg_func,
            "price":agg_func
        }).mean().reset_index().values
        
        for i in range(len(stats)):
            k1, k2, val = stats[i]
            k = k1+"_"+k2
            features[user_id][k] = val        
            
    return pd.DataFrame(features).T

In [19]:
def get_features_from_df(df_train,df_test,time_max):
    df_last_purchase = compute_last_n_days(df_train,time_max=time_max)
    df_dow_features = calculate_dow_features(df_train)
    df_reorder_features = calculate_reorder(df_train)

    df_reorder_features["new_product"] = df_reorder_features["new_product"].astype('uint')
    df_features = df_last_purchase.join(df_reorder_features).join(df_dow_features)\
        .copy(deep=True)\
        .reset_index().rename({"index":"csn"},axis=1)
    test_csn = set(df_test["csn"].unique())
    df_features["label"] = df_features["csn"].map(lambda d: 1 if d in test_csn else 0)
    print(df_features["label"].value_counts())
    return df_features

## User Features

In [20]:
train_times = ["2018-02-01","2018-04-01","2018-06-01"]

In [21]:
X_total = []
for i in range(len(train_times)-1):
    print("Processing ", train_times[i])
    start = train_times[i]
    end = train_times[i+1]
    train_df = df_flatten[(df_flatten["date"]>=start)&(df_flatten["date"]<end)]
    test_df = df_flatten[df_flatten["date"]==end]
    X_total.append(get_features_from_df(train_df,test_df,time_max=datetime.strptime(end,"%Y-%m-%d")))

Processing  2018-02-01
0    12344
1      940
Name: label, dtype: int64
Processing  2018-04-01
0    11930
1      783
Name: label, dtype: int64


In [22]:
df_features = pd.concat(X_total)

In [26]:
df_features["label"].value_counts()

0    24274
1     1723
Name: label, dtype: int64

In [28]:
d = set(df_flatten[df_flatten["date"]<"2018-04-01"]["csn"].unique())
e = set(df_flatten[df_flatten["date"]=="2018-04-01"]["csn"].unique())

In [32]:
df_features.to_csv("df_features.csv",index=None)

In [209]:
df_features_nan = df_features.fillna(0)

In [210]:
from sklearn.ensemble import RandomForestClassifier

In [211]:
from sklearn.model_selection import cross_val_score

In [212]:
X = df_features_nan.drop(["csn","label"],axis=1).values
y = df_features["label"].values

In [220]:
np.mean(cross_val_score(estimator=RandomForestClassifier(n_estimators=200),X=X,y=y,cv=5,scoring='f1'))

0.7140785194014518

In [254]:
np.mean(cross_val_score(estimator=lightgbm.LGBMClassifier(n_estimators=200,
                                                          subsample=0.5,                                                          
                                                          learning_rate=0.01)
                        ,X=X,y=y,cv=5,scoring='f1'))


0.7173650737412289

In [227]:
from sklearn.linear_model import LogisticRegression

In [235]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures

In [239]:
X_transform = PolynomialFeatures(degree=2,include_bias=True).fit_transform(MinMaxScaler().fit_transform(X))

In [253]:
np.mean(cross_val_score(estimator=LogisticRegression(max_iter=10000),X=X_transform,y=y,cv=5,scoring='f1'))

0.7223754353136805

In [241]:
from sklearn.svm import LinearSVC, SVC

In [251]:
np.mean(cross_val_score(estimator=LinearSVC(verbose=1,max_iter=10000),X=X_transform,y=y,cv=5,scoring='f1'))

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

0.7226427842227222

In [244]:
rf = RandomForestClassifier().fit(X,y)

In [249]:
sorted(list(zip(df_features.drop(["csn","label"],axis=1).columns,rf.feature_importances_))
       ,key=lambda d: d[-1],reverse=True)

[('user_last_purchase_date', 0.09270921295688098),
 ('total_price', 0.08040146923220215),
 ('new_product', 0.07458550040192596),
 ('reorder_rate', 0.069687772341471),
 ('total_quantity', 0.06851739927885359),
 ('avg_products_in_cart', 0.05517071891527061),
 ('min_price', 0.053651661224843425),
 ('max_price', 0.051624950769237433),
 ('median_price', 0.051318421020260636),
 ('std_price', 0.05086377695754538),
 ('avg_price', 0.050302122472754396),
 ('std_quantity', 0.04068498468958489),
 ('avg_quantity', 0.039990677890787904),
 ('max_quantity', 0.026872590759219127),
 (4, 0.018710782829700585),
 (2, 0.01823570638784496),
 (5, 0.016400606374980926),
 (0, 0.014440114683809247),
 (1, 0.012550201819167332),
 (3, 0.012118820674511845),
 (6, 0.009807469623870146),
 ('median_quantity', 0.007873396913385313),
 ('last_7_days_sales', 0.004471364986592645),
 ('last_21_days_unique_products', 0.004403468603888407),
 ('last_35_days_sales', 0.004399784241720509),
 ('last_21_days_sales', 0.00430863436485