In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import mlflow
import matplotlib.pyplot as plt

In [2]:
path = "./data/"

In [3]:
train = "train.csv"
test = "test.csv"

In [4]:
df = pd.read_csv(path + train)

In [5]:
df.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,01-01-2021,1,2,2.0,24-09-2020,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2,01-01-2021,2,1,2.0,19-09-2020,1,0,1,0,0,0,1,0,0,0,0,0,0
2,3,01-01-2021,9,3,3.0,11-08-2021,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,01-01-2021,6,7,2.0,04-10-2017,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,01-01-2021,4,6,,08-06-2020,0,0,0,0,0,0,1,0,0,0,1,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39161 entries, 0 to 39160
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    39161 non-null  int64  
 1   created_at            39161 non-null  object 
 2   campaign_var_1        39161 non-null  int64  
 3   campaign_var_2        39161 non-null  int64  
 4   products_purchased    18250 non-null  float64
 5   signup_date           24048 non-null  object 
 6   user_activity_var_1   39161 non-null  int64  
 7   user_activity_var_2   39161 non-null  int64  
 8   user_activity_var_3   39161 non-null  int64  
 9   user_activity_var_4   39161 non-null  int64  
 10  user_activity_var_5   39161 non-null  int64  
 11  user_activity_var_6   39161 non-null  int64  
 12  user_activity_var_7   39161 non-null  int64  
 13  user_activity_var_8   39161 non-null  int64  
 14  user_activity_var_9   39161 non-null  int64  
 15  user_activity_var_1

In [7]:
df["products_purchased"].fillna(value = 0, inplace = True)

In [8]:
df["signup"] = df["signup_date"].apply(lambda x : 0 if pd.isna(x) else 1 )

In [9]:
df.tail()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy,signup
39156,39157,31-12-2021,11,11,2.0,19-10-2017,1,0,0,0,1,1,1,0,0,0,0,0,0,1
39157,39158,31-12-2021,3,9,3.0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39158,39159,31-12-2021,8,7,2.0,,1,0,0,0,1,0,1,0,0,0,0,0,0,0
39159,39160,31-12-2021,7,12,2.0,,0,0,0,0,0,1,0,0,0,0,1,0,0,0
39160,39161,31-12-2021,2,5,0.0,11-08-2019,1,0,0,0,0,0,1,0,0,0,0,0,0,1


In [10]:
df.columns

Index(['id', 'created_at', 'campaign_var_1', 'campaign_var_2',
       'products_purchased', 'signup_date', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12', 'buy', 'signup'],
      dtype='object')

In [11]:
cols = ['campaign_var_1', 'campaign_var_2','products_purchased','user_activity_var_1','user_activity_var_2', 
        'user_activity_var_3', 'user_activity_var_4','user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
        'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10','user_activity_var_11', 'user_activity_var_12', 
        'signup', 'buy']

In [12]:
df = df[cols]

In [13]:
df.head()

Unnamed: 0,campaign_var_1,campaign_var_2,products_purchased,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,signup,buy
0,1,2,2.0,0,0,0,0,0,0,1,1,0,0,0,0,1,0
1,2,1,2.0,1,0,1,0,0,0,1,0,0,0,0,0,1,0
2,9,3,3.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
3,6,7,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,4,6,0.0,0,0,0,0,0,0,1,0,0,0,1,0,1,0


In [14]:
df["campaign_var_1"].unique()

array([ 1,  2,  9,  6,  4,  3,  5,  7, 13,  8, 12, 10, 14, 11, 16, 15],
      dtype=int64)

In [15]:
df["campaign_var_2"].unique()

array([ 2,  1,  3,  7,  6,  4, 10,  5,  8,  9, 12, 11, 13, 14, 15],
      dtype=int64)

In [16]:
df["products_purchased"].value_counts()

0.0    20911
2.0     8867
3.0     5024
1.0     3643
4.0      716
Name: products_purchased, dtype: int64

In [17]:
df.iloc[:,:-1]

Unnamed: 0,campaign_var_1,campaign_var_2,products_purchased,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,signup
0,1,2,2.0,0,0,0,0,0,0,1,1,0,0,0,0,1
1,2,1,2.0,1,0,1,0,0,0,1,0,0,0,0,0,1
2,9,3,3.0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,6,7,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,4,6,0.0,0,0,0,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39156,11,11,2.0,1,0,0,0,1,1,1,0,0,0,0,0,1
39157,3,9,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0
39158,8,7,2.0,1,0,0,0,1,0,1,0,0,0,0,0,0
39159,7,12,2.0,0,0,0,0,0,1,0,0,0,0,1,0,0


### Preprocessing

In [18]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,shuffle=True, random_state = 42)

In [20]:
dv = DictVectorizer()

train_dicts = X_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = X_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [21]:
y_train = y_train.values
y_val = y_val.values

In [22]:
import xgboost as xgb 


In [23]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [24]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [25]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("leads")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='leads', tags={}>

In [32]:
def objective(params):

    with mlflow.start_run():

        classifier = xgb.XGBClassifier(n_estimators = params['n_estimators'],
                            max_depth = int(params['max_depth']),
                            learning_rate = params['learning_rate'],
                            gamma = params['gamma'],
                            min_child_weight = params['min_child_weight'],
                            subsample = params['subsample'],
                            colsample_bytree = params['colsample_bytree'],
                            seed= 42,
                            reg_alpha = params['reg_alpha'],
                            reg_lambda = params["reg_lambda"],
                            objective = params["objective"],
                            )

        evaluation = [( X_train, y_train), ( X_val, y_val)]

        classifier.fit(X_train, y_train, eval_set=evaluation,
            early_stopping_rounds=50,verbose=False)

        # Applying k-Fold Cross Validation
        pred = classifier.predict(X_val)
        accuracy = f1_score(y_val, pred)
        print ("SCORE:", accuracy)

        mlflow.xgboost.autolog()
        mlflow.log_metric('cross_val_mean', accuracy)

    return { 'loss': accuracy,'status': STATUS_OK}


In [33]:
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'n_estimators' : hp.choice('n_estimators', range(20, 205, 5)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'objective': 'binary:logistic',
    'seed': 42,
    }

In [None]:
best_result = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)


print("Best: ", best_result)

In [47]:
df_test = pd.read_csv(path +test)

In [48]:
df_test.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12
0,39162,2022-01-01,2,2,,2021-08-17,1,1,0,0,0,1,0,0,0,0,1,0
1,39163,2022-01-01,4,7,3.0,2020-05-21,1,0,0,0,0,0,0,1,0,0,0,0
2,39164,2022-01-01,8,7,,,0,0,0,0,1,1,0,0,0,0,0,0
3,39165,2022-01-01,9,8,2.0,2020-06-22,0,0,0,0,1,1,1,0,0,0,2,0
4,39166,2022-01-01,4,5,2.0,2021-03-10,1,0,0,0,0,0,0,0,0,0,0,0


In [49]:
df_test["products_purchased"].fillna(value = 0, inplace = True)

In [50]:
df_test["signup"] = df_test["signup_date"].apply(lambda x : 0 if pd.isna(x) else 1 )

In [51]:
train_cols = ['campaign_var_1', 'campaign_var_2','products_purchased','user_activity_var_1','user_activity_var_2', 
        'user_activity_var_3', 'user_activity_var_4','user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
        'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10','user_activity_var_11', 'user_activity_var_12', 
        'signup']

In [52]:
X_test = df_test[train_cols].values

In [53]:
params =  {'colsample_bytree': 0.14, 'gamma': 0.37, 'learning_rate': 0.06, 'max_depth': 79.0, 'min_child_weight': 4.0, 
 'n_estimators': 17, 
 'reg_alpha': 0.1683108171968693, 
 'reg_lambda': 0.007616715202973897, 
 'subsample': 0.47000000000000003}

In [54]:
classifier = xgb.XGBClassifier(n_estimators = params['n_estimators'],
                            max_depth = int(params['max_depth']),
                            learning_rate = params['learning_rate'],
                            gamma = params['gamma'],
                            min_child_weight = params['min_child_weight'],
                            subsample = params['subsample'],
                            colsample_bytree = params['colsample_bytree'],
                            seed= 42,
                            reg_alpha = params['reg_alpha'],
                            reg_lambda = params["reg_lambda"],
                            )

classifier.fit(X_train, y_train)

2022/06/03 17:40:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '86c1d7a99a514fe8a98c079adaa9fc36', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


In [55]:
y_pred = classifier.predict(X_test)

In [56]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [59]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [60]:
df_test["buy"] = y_pred

In [62]:
df_test["buy"]

0    12876
1      308
Name: buy, dtype: int64

In [65]:
df_test.to_csv("df_test.csv", index = False)

In [68]:
submission = pd.read_csv(path + "submission.csv")

In [70]:
submission["buy"]= y_pred

In [72]:
submission.to_csv("submission.csv", index = False)