<h3 align='center'> FAI Group F2 Purchase Prediction </h3>

## 1. Data Collection 

### 1.1 Import necessary packages 

In [2]:
import os
import sys
import argparse
import time
import datetime
import warnings
import logging
import numpy as np
import pandas as pd
import nltk
import joblib
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

### 1.2 Read the data 

In [3]:
data=pd.read_csv('data\data.csv',encoding='unicode-escape')
print(data.shape)
data.sample(5)

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
495459,578305,22784,LANTERN CREAM GAZEBO,18,11/23/2011 15:44,4.25,14088.0,United Kingdom
205785,554837,22784,LANTERN CREAM GAZEBO,2,5/26/2011 16:30,4.95,14584.0,United Kingdom
330472,565919,22117,METAL SIGN HER DINNER IS SERVED,2,9/7/2011 16:48,5.79,,United Kingdom
86389,543546,22268,EASTER DECORATION SITTING BUNNY,1,2/9/2011 16:52,1.63,,United Kingdom
214074,555564,82581,TOILET METAL SIGN,3,6/5/2011 15:01,0.55,15005.0,United Kingdom


In [4]:
# More information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
data['InvoiceNo'].unique(), data['StockCode'].unique()

(array(['536365', '536366', '536367', ..., '581585', '581586', '581587'],
       dtype=object),
 array(['85123A', '71053', '84406B', ..., '90214U', '47591b', '23843'],
       dtype=object))

- The Online Retail file has 541909 entries and 8 columns. There 4 object datatypes,`InvoiceNo`, `StockCode`, `Description` and `Country`, 1 interger `Quantity` and 2 floats `UnitPrice` and `CustomerID`. There are missing values in the `CustomerID` column.

### 2. Data Wrangling and Cleaning

In [6]:
# Declaring the path where the Models will be saved and loaded from
rfc_model = 'model/rfc_model.joblib'
knn_model = 'model/knn_model.joblib'
dtc_model = 'model/dtc_model.joblib'

In [7]:
# Hyperparamter tuning, saving & loading of Machine Learning models
def hyperparameter_tuning(algorithm, param_grid, kFold):
    "Hyper parameter tuning for the given algorithms"
    gcv = GridSearchCV(algorithm, param_grid=parameters, cv=kFold, verbose=10, n_jobs=-1)
    model = gcv.fit(X_train,Y_train)
    estimator = gcv.best_estimator_
    return estimator, model

def save_model(estimator, modelname):
    print("Saving the model ...")
    try:
        dump(estimator, modelname)
    except Exception as e:
        # Exception handler, alert the user
        raise IOError("Error saving model data to disk: {}".format(str(e))) from e

def load_model(modelname):
    try:
        print("Model loading ...")
        loaded_model = load(modelname)
    except Exception as e:
        raise IOError("Error loading model data from disk: {}".format(str(e))) from e
    return loaded_model

def iter_minibatches(chunksize):
    # Provide chunks one by one
    chunkstartmarker = 0
    #while chunkstartmarker < numtrainingpoints:
    while chunkstartmarker < len(X_train):
        #chunkrows = range(chunkstartmarker,chunkstartmarker+chunksize)
        chunkrows = chunkstartmarker+chunksize
        X_chunk, Y_chunk = X_train[chunkstartmarker:chunkrows], Y_train[chunkstartmarker:chunkrows]
        yield X_chunk, Y_chunk
        chunkstartmarker += chunksize

In [8]:
## helper class definition ???
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-rtd',
                        '--raw_train_data',
                        type=str,
                        required=False,
                        default='data/data.csv',
                        help='raw data csv file, if this parameter is specified then it will only perform the data preparation part')
    parser.add_argument('-daf',
                        '--data_aug_factor',
                        type=int,
                        required=False,
                        default='0',
                        help='data augmentation/multiplication factor, requires --raw-train-data parameter')
    parser.add_argument('-ftd',
                        '--final_train_data',
                        type=str,
                        required=False,
                        default='0',
                        help='final filtered data csv file, if this parameter is specified then it will skip the data preparation part')
    parser.add_argument('-t',
                        '--tuning',
                        type=str,
                        required=False,
                        default='0',
                        help='hyper parameter tuning (0/1). Along with Hyperparamter tuning, the model is saved ')
    parser.add_argument('-s',
                        '--stock',
                        type=str,
                        required=False,
                        default='0',
                        help='Use Stock Python (0/1)')
    parser.add_argument('-alg',
                        '--algorithm',
                        type=str,
                        required=False,
                        default='knn',
                        help='scikit learn classifier algorithm to be used (knn,dtc,rfc) \
                        - knn=KNearestNeighborClassifier, dtc=DecisionTreeClassifier, rfc=RandomForestClassifier')
    parser.add_argument('-b',
                        '--batch_size',
                        type=int,
                        required=False,
                        default=None,
                        help='hyper parameter tuning (0/1). Along with Hyperparamter tuning, the model is saved ')
    parser.add_argument('-inf',
                        '--inference',
                        type=str,
                        required=False,
                        default='0',
                        help='Perform Inference on the saved models.Specify the model file with path i.e knn_model or rfc_model or dtc_model')

In [9]:
# variables definitions ???
FLAGS = parser.parse_args()
    # pkg = FLAGS.package
finaltraindata = FLAGS.final_train_data
dataaugfactor = FLAGS.data_aug_factor
rawtraindata = FLAGS.raw_train_data
algorithm = FLAGS.algorithm
inference = FLAGS.inference
tuning = True if FLAGS.tuning == '1' else False
stock = True if FLAGS.stock == '1' else False
batch_size = FLAGS.batch_size

prgstime = time.time()

In [10]:
# import sklearn and intel packages

if stock is False:
    from sklearnex import patch_sklearn  # pylint: disable=import-error
    patch_sklearn()
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score
    from sklearn import model_selection
    from sklearn import metrics
    from sklearn.model_selection import GridSearchCV
    from sklearn import neighbors
    from sklearn import linear_model

    from sklearn import tree
    from sklearn import ensemble
    from sklearn.decomposition import PCA
    from joblib import load, dump

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [12]:
# read the csv file

df_initial = pd.read_csv('data/data.csv', encoding="ISO-8859-1",
                                     dtype={'CustomerID': str, 'InvoiceID': str})
    
print('Dataframe dimensions:', df_initial.shape)    
    

Dataframe dimensions: (541909, 8)


In [13]:
# check for null values

df_initial['InvoiceDate'] = pd.to_datetime(df_initial['InvoiceDate'])
        # ____________________________________________________________
        # gives some infos on columns types and numer of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}))
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T.
                                   rename(index={0: 'null values (%)'}))
df_initial.dropna(axis=0, subset=['CustomerID'], inplace=True)
print('Dataframe dimensions:', df_initial.shape)

Dataframe dimensions: (406829, 8)


In [14]:
# gives some information on columns types and number of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}))
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T.
                                   rename(index={0: 'null values (%)'}))
print('Duplicate entries: {}'.format(df_initial.duplicated().sum()))
df_initial.drop_duplicates(inplace=True)

Duplicate entries: 5225


In [15]:
temp = df_initial[['CustomerID', 'InvoiceNo', 'Country']].groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
temp = temp.reset_index(drop=False)
countries = temp['Country'].value_counts()
countries

United Kingdom          19857
Germany                   603
France                    458
EIRE                      319
Belgium                   119
Spain                     105
Netherlands               101
Switzerland                71
Portugal                   70
Australia                  69
Italy                      55
Finland                    48
Sweden                     46
Norway                     40
Channel Islands            33
Japan                      28
Poland                     24
Denmark                    21
Cyprus                     20
Austria                    19
Singapore                  10
Malta                      10
Unspecified                 8
USA                         7
Iceland                     7
Israel                      6
Canada                      6
Greece                      6
Czech Republic              5
European Community          5
Lithuania                   4
United Arab Emirates        3
Saudi Arabia                2
Bahrain   

- In the data, there are 36 countries and 1 unspecified entry. Most of customers are from the UK.

### 2.1 Feature Engineering

In [16]:
# make temporary df showing CustomerID, InvoiceNo, and InvoiceDate by count
pd.DataFrame([{'products': len(df_initial['StockCode'].value_counts()),
                       'transactions': len(df_initial['InvoiceNo'].value_counts()),
                       'customers': len(df_initial['CustomerID'].value_counts()),
                       }], columns=['products', 'transactions', 'customers'], index=['quantity']
                     )
temp = df_initial.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()

temp[:5] # first 5 entries of temp df

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate
0,12346,541431,1
1,12346,C541433,1
2,12347,537626,31
3,12347,542237,29
4,12347,549222,24


In [17]:
# from the invoice date get the number of cancelled orders
nb_products_per_basket = temp.rename(columns={'InvoiceDate': 'Number of products'})
nb_products_per_basket[:10].sort_values('CustomerID')

nb_products_per_basket['order_canceled'] = nb_products_per_basket['InvoiceNo'].apply(lambda x: int('C' in x))

n1 = nb_products_per_basket['order_canceled'].sum()
n2 = nb_products_per_basket.shape[0]

print(n1, n2) # number of orders cancelled vs ordered in the basket

3654 22190


In [18]:
# make df_check to check the quantity that was less than 0
df_check = df_initial[df_initial['Quantity'] < 0][['CustomerID', 'Quantity',
                                                           'StockCode', 'Description', 'UnitPrice']]
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col[0]) & (df_initial['Quantity'] == -col[1]) &
                          (df_initial['Description'] == col[2])].shape[0] == 0:
        break

df_check = df_initial[(df_initial['Quantity'] < 0) & (df_initial['Description'] != 'Discount')][
                                         ['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]

df_check[:5] # 1st five

Unnamed: 0,CustomerID,Quantity,StockCode,Description,UnitPrice
154,15311,-1,35004C,SET OF 3 COLOURED FLYING DUCKS,4.65
235,17548,-12,22556,PLASTERS IN TIN CIRCUS PARADE,1.65
236,17548,-24,21984,PACK OF 12 PINK PAISLEY TISSUES,0.29
237,17548,-24,21983,PACK OF 12 BLUE PAISLEY TISSUES,0.29
238,17548,-24,21980,PACK OF 12 RED RETROSPOT TISSUES,0.29


In [19]:
# clean the df by introducing a new column of cancelled orders and give them a value of 0
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col[0]) & (df_initial['Quantity'] == -col[1]) &
                          (df_initial['Description'] == col[2])].shape[0] == 0:
        break

df_cleaned = df_initial.copy(deep=True)
df_cleaned['QuantityCanceled'] = 0

df_cleaned[:5]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,QuantityCanceled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0


In [20]:
# clean the df by removing doubtful entries

entry_to_remove = []
doubtful_entry = []

for index, col in df_initial.iterrows():
    if (col['Quantity'] > 0) or col['Description'] == 'Discount':
        continue
    df_test = df_initial[(df_initial['CustomerID'] == col['CustomerID'])
                         & (df_initial['StockCode'] == col['StockCode'])
                         & (df_initial['InvoiceDate'] < col['InvoiceDate'])
                         & (df_initial['Quantity'] > 0)].copy()
            # _________________________________
            # Cancelation WITHOUT counterpart
    if df_test.shape[0] == 0:
        doubtful_entry.append(index)
            # ________________________________
            # Cancelation WITH a counterpart
    elif df_test.shape[0] == 1:
        index_order = df_test.index[0]
        df_cleaned.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
            # ______________________________________________________________
            # Various counterparts exist in orders: we delete the last one
    elif df_test.shape[0] > 1:
        df_test.sort_index(axis=0, ascending=False, inplace=True)
        for ind, val in df_test.iterrows():
            if val['Quantity'] < -col['Quantity']:
                continue
            df_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break

df_cleaned.drop(entry_to_remove, axis=0, inplace=True)
df_cleaned.drop(doubtful_entry, axis=0, inplace=True)
remaining_entries = df_cleaned[(df_cleaned['Quantity'] < 0) & (df_cleaned['StockCode'] != 'D')]

remaining_entries[:5] # 1st five remaining


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,QuantityCanceled
77598,C542742,84535B,FAIRY CAKES NOTEBOOK A6 SIZE,-94,2011-01-31 16:26:00,0.65,15358,United Kingdom,0
90444,C544038,22784,LANTERN CREAM GAZEBO,-4,2011-02-15 11:32:00,4.95,14659,United Kingdom,0
111968,C545852,22464,HANGING METAL HEART LANTERN,-5,2011-03-07 13:49:00,1.65,14048,United Kingdom,0
116064,C546191,47566B,TEA TIME PARTY BUNTING,-35,2011-03-10 10:57:00,0.7,16422,United Kingdom,0
132642,C547675,22263,FELT EGG COSY LADYBIRD,-49,2011-03-24 14:07:00,0.66,17754,United Kingdom,0


In [21]:
# introduce the total price by calculation, multiply UnitPrice with Quantity and subtract the QuantityCanceled

# df_cleaned[(df_cleaned['CustomerID'] == 14048) & (df_cleaned['StockCode'] == '22464')]
list_special_codes = df_cleaned[df_cleaned['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'] * (df_cleaned['Quantity'] - df_cleaned['QuantityCanceled'])

df_cleaned.sort_values('CustomerID')[:5] # cleaned sorted values

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,QuantityCanceled,TotalPrice
61619,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.04,12346,United Kingdom,74215,0.0
148288,549222,22375,AIRLINE BAG VINTAGE JET SET BROWN,4,2011-04-07 10:43:00,4.25,12347,Iceland,0,17.0
428971,573511,22698,PINK REGENCY TEACUP AND SAUCER,12,2011-10-31 12:25:00,2.95,12347,Iceland,0,35.4
428970,573511,47559B,TEA TIME OVEN GLOVE,10,2011-10-31 12:25:00,1.25,12347,Iceland,0,12.5
428969,573511,47567B,TEA TIME KITCHEN APRON,6,2011-10-31 12:25:00,5.95,12347,Iceland,0,35.7


In [22]:
# Rename TotalPrice to BasketPrice and choose Prices above 0

# somme des achats / utilisateur & commande
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns={'TotalPrice': 'Basket Price'})
        # date de la commande
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis=1, inplace=True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
        # selection des entrées significatives:
basket_price = basket_price[basket_price['Basket Price'] > 0]

basket_price.sort_values('CustomerID')[:5] # basket price sorted values


Unnamed: 0,CustomerID,InvoiceNo,Basket Price,InvoiceDate
1,12347,537626,711.79,2010-12-07 14:57:00.000000000
2,12347,542237,475.39,2011-01-26 14:29:59.999999744
3,12347,549222,636.25,2011-04-07 10:43:00.000000000
4,12347,556201,382.52,2011-06-09 13:01:00.000000000
5,12347,562032,584.91,2011-08-02 08:48:00.000000000


In [23]:
# Define price ranges to work with from the BasketPrice, initialize empty count_price list and append the prices to the list

# Décompte des achats
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
for i, price in enumerate(price_range):
    if i == 0:
        continue
    val = basket_price[(basket_price['Basket Price'] < price) & (basket_price['Basket Price'] > price_range[i - 1])]['Basket Price'].count()
    count_price.append(val)

### 2.3 NLP

In [24]:
# make function to check for nouns
def is_noun(pos):
    '''Noun validation'''
    return pos[:2] == 'NN'

        # is_noun = lambda pos: pos[:2] == 'NN'

In [25]:
# make a keywords inventory function
def keywords_inventory(dataframe, colonne='Description'):
    '''Stemming'''
    stemmer = nltk.stem.SnowballStemmer("english")
    keywords_roots = dict()  # collect the words / root
    keywords_select = dict()  # association: root <-> keyword
    category_keys = []
    count_keywords = dict()
            # icount = 0
    for s in dataframe[colonne]:
        if pd.isnull(s):
            continue
        lines = s.lower()
        tokenized = nltk.word_tokenize(lines)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]

        for t in nouns:
            t = t.lower()
            racine = stemmer.stem(t)
            if racine in keywords_roots:
                keywords_roots[racine].add(t)
                count_keywords[racine] += 1
            else:
                keywords_roots[racine] = {t}
                count_keywords[racine] = 1

    for s in keywords_roots.keys():
        if len(keywords_roots[s]) > 1:
            min_length = 1000
            for k in keywords_roots[s]:
                if len(k) < min_length:
                    clef = k
                    min_length = len(k)
            category_keys.append(clef)
            keywords_select[s] = clef
        else:
            category_keys.append(list(keywords_roots[s])[0])
            keywords_select[s] = list(keywords_roots[s])[0]
            
    return category_keys, keywords_roots, keywords_select, count_keywords


In [26]:
# make a df isolating only the product description
df_produits = pd.DataFrame(df_initial['Description'].unique()).rename(columns={0: 'Description'})

keywords, keywords_roots, keywords_select, count_keywords = keywords_inventory(df_produits)

df_produits[:2] # 1st 2 products

Unnamed: 0,Description
0,WHITE HANGING HEART T-LIGHT HOLDER
1,WHITE METAL LANTERN


In [27]:
# intialize empty list_products list and append keywords from Description to it

list_products = []

for k, v in count_keywords.items():
    list_products.append([keywords_select[k], v])
list_products.sort(key=lambda x: x[1], reverse=True)

liste = sorted(list_products, key=lambda x: x[1], reverse=True)

list_products = []
for k, v in count_keywords.items():
    word = keywords_select[k]
    if word in ['pink', 'blue', 'tag', 'green', 'orange']:
        continue
    if len(word) < 3 or v < 13:
        continue
    if ('+' in word) or ('/' in word):
        continue
    list_products.append([word, v])
        # ______________________________________________________
list_products.sort(key=lambda x: x[1], reverse=True)

liste_produits = df_cleaned['Description'].unique()

liste_produits[:2] # 1st 2

array(['WHITE HANGING HEART T-LIGHT HOLDER', 'WHITE METAL LANTERN'],
      dtype=object)

In [28]:
list_products[:3] # list of products

[['heart', 267], ['vintage', 211], ['set', 206]]

In [29]:
# make 

X = pd.DataFrame()
for key, occurence in list_products:
    X.loc[:, key] = list(map(lambda x: int(key.upper() in x), liste_produits))

threshold = [0, 1, 2, 3, 5, 10]
label_col = []
for i in range(len(threshold)):
    if i == len(threshold) - 1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i], threshold[i + 1])
    label_col.append(col)
    X.loc[:, col] = 0

for i, prod in enumerate(liste_produits):
    prix = df_cleaned[df_cleaned['Description'] == prod]['UnitPrice'].mean()
    j = 0
    while prix > threshold[j]:
        j = j + 1
        if j == len(threshold):
            break
    X.loc[i, label_col[j - 1]] = 1

for i in range(len(threshold)):
    if i == len(threshold) - 1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i], threshold[i + 1])


### 3. Preprocessing

In [30]:
# transform X to a matrix and use KMeans to make 5 clusters

matrix = X.to_numpy()  # as_matrix()
for n_clusters in range(3, 10):
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)

n_clusters = 5
silhouette_avg = -1
while silhouette_avg < 0.145:
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)

k_value_c = pd.Series(clusters).value_counts()    
#pd.Series(clusters).value_counts()

k_value_c ## look at the series

INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.K

3    1159
1     964
4     673
0     606
2     476
dtype: int64

In [31]:
# define individual silouhette scores
sample_silhouette_values = silhouette_samples(matrix, clusters)

liste = pd.DataFrame(liste_produits)
liste_words = [word for (word, occurence) in list_products]

occurence = [dict() for _ in range(n_clusters)]

for i in range(n_clusters):
    liste_cluster = liste.loc[clusters == i]
    for word in liste_words:
        if word in ['art', 'set', 'heart', 'pink', 'blue', 'tag']:
            continue
        occurence[i][word] = sum(liste_cluster.loc[:, 0].str.contains(word.upper()))

In [32]:
# use PCA to reduce dimensions
pca = PCA()
pca.fit(matrix)
pca_samples = pca.transform(matrix)

pca_samples

INFO:root:sklearn.decomposition.PCA.fit: running accelerated version on CPU
INFO:root:sklearn.decomposition.PCA.transform: running accelerated version on CPU


array([[ 4.04146662e-02, -4.97230972e-01,  8.86108601e-02, ...,
        -3.53393613e-04,  2.11123394e-03, -3.81639165e-17],
       [-1.82800408e-02, -4.22521729e-01,  5.60009834e-01, ...,
        -3.28279183e-04,  1.73540170e-03, -1.90819582e-16],
       [ 1.56363862e-02, -3.15742200e-01,  1.24744354e+00, ...,
         2.89423998e-03, -2.12892191e-03, -7.45931095e-16],
       ...,
       [-7.00204037e-01,  4.33656636e-01, -1.21753185e-01, ...,
        -1.23958283e-03, -7.00072063e-03,  1.56125113e-16],
       [ 4.64950132e-02, -5.08854136e-01,  1.11871933e-01, ...,
        -4.64582303e-03, -4.59710854e-05, -5.37764278e-16],
       [-3.63126969e-02, -6.33805390e-01, -6.04740504e-01, ...,
        -9.91835682e-05,  9.88603744e-04,  4.51028104e-17]])

In [33]:
pca = PCA(n_components=50)
matrix_9D = pca.fit_transform(matrix)
mat = pd.DataFrame(matrix_9D)
mat['cluster'] = pd.Series(clusters)

mat['cluster'][:10]##1st 10

INFO:root:sklearn.decomposition.PCA.fit: running accelerated version on CPU
INFO:root:sklearn.decomposition.PCA.fit_transform: running accelerated version on CPU


0    4
1    0
2    0
3    0
4    0
5    2
6    0
7    4
8    3
9    3
Name: cluster, dtype: int32

In [34]:
# from the 5 clusters create 5 categories, from the main category 
corresp = dict()
for key, val in zip(liste_produits, clusters):
    corresp[key] = val
        # __________________________________________________________________
df_cleaned['categ_product'] = df_cleaned.loc[:, 'Description'].map(corresp)

for i in range(5):
    col = 'categ_{}'.format(i)
    df_temp = df_cleaned[df_cleaned['categ_product'] == i]
    price_temp = df_temp['UnitPrice'] * (df_temp['Quantity'] - df_temp['QuantityCanceled'])
    price_temp = price_temp.apply(lambda x: x if x > 0 else 0)
    df_cleaned.loc[:, col] = price_temp
    df_cleaned[col].fillna(0, inplace=True)

In [35]:
df_cleaned[['InvoiceNo', 'Description', 'categ_product', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']][:5] ### look at these columns

Unnamed: 0,InvoiceNo,Description,categ_product,categ_0,categ_1,categ_2,categ_3,categ_4
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,4,0.0,0.0,0.0,0.0,15.3
1,536365,WHITE METAL LANTERN,0,20.34,0.0,0.0,0.0,0.0
2,536365,CREAM CUPID HEARTS COAT HANGER,0,22.0,0.0,0.0,0.0,0.0
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,0,20.34,0.0,0.0,0.0,0.0
4,536365,RED WOOLLY HOTTIE WHITE HEART.,0,20.34,0.0,0.0,0.0,0.0


In [36]:
#

# somme des achats / utilisateur & commande
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns={'TotalPrice': 'Basket Price'})
        # ___________________________________________________________
        # pourcentage du prix de la commande / categorie de produit
for i in range(5):
    col = 'categ_{}'.format(i)
    temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)[col].sum()
    basket_price.loc[:, col] = temp[col]

In [37]:
# merge the categories with the customerID df

# date de la commande
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis=1, inplace=True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
        # ______________________________________
        # selection des entrées significatives:
basket_price = basket_price[basket_price['Basket Price'] > 0]

basket_price.sort_values('CustomerID', ascending=True)[:5] ## basket price sorted

Unnamed: 0,CustomerID,InvoiceNo,Basket Price,categ_0,categ_1,categ_2,categ_3,categ_4,InvoiceDate
1,12347,537626,711.79,293.35,23.4,124.44,187.2,83.4,2010-12-07 14:57:00.000000000
2,12347,542237,475.39,169.2,84.34,0.0,168.75,53.1,2011-01-26 14:29:59.999999744
3,12347,549222,636.25,115.0,81.0,0.0,369.15,71.1,2011-04-07 10:43:00.000000000
4,12347,556201,382.52,168.76,41.4,19.9,74.4,78.06,2011-06-09 13:01:00.000000000
5,12347,562032,584.91,158.16,61.3,97.8,147.95,119.7,2011-08-02 08:48:00.000000000


In [38]:
# make a new df, consisting of customer, transations(aggregated) and the categories. To see the customer spent habit

set_entrainement = basket_price[basket_price['InvoiceDate'] < pd.to_datetime(datetime.date(2011, 10, 1))]
set_test = basket_price[basket_price['InvoiceDate'] >= pd.to_datetime(datetime.date(2011, 10, 1))]
basket_price = set_entrainement.copy(deep=True)

        # nb de visites et stats sur le montant du panier / utilisateurs
transactions_per_user = basket_price.groupby(by=['CustomerID'])['Basket Price'].agg(['count', 'min', 'max', 'mean', 'sum'])
for i in range(5):
    col = 'categ_{}'.format(i)
    transactions_per_user.loc[:, col] = basket_price.groupby(by=['CustomerID'])[col].sum() / transactions_per_user['sum'] * 100

transactions_per_user.reset_index(drop=False, inplace=True)
basket_price.groupby(by=['CustomerID'])['categ_0'].sum()

transactions_per_user.sort_values('CustomerID', ascending=True)[:5] ### sorted user transactions

Unnamed: 0,CustomerID,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4
0,12347,5,382.52,711.79,558.172,2790.86,32.40829,10.442659,8.676179,33.948317,14.524555
1,12348,4,227.44,892.8,449.31,1797.24,0.0,38.016069,0.0,61.983931,0.0
2,12350,1,334.4,334.4,334.4,334.4,0.0,11.692584,0.0,60.406699,27.900718
3,12352,6,144.35,840.3,345.663333,2073.98,15.711338,0.491808,14.301006,66.125517,3.370331
4,12353,1,89.0,89.0,89.0,89.0,0.0,0.0,22.359551,57.752809,19.88764


In [39]:
# introduce the date, to the above df 
last_date = basket_price['InvoiceDate'].max().date()

first_registration = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].min())
last_purchase = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].max())

test = first_registration.applymap(lambda x: (last_date - x.date()).days)
test2 = last_purchase.applymap(lambda x: (last_date - x.date()).days)

transactions_per_user.loc[:, 'LastPurchase'] = test2.reset_index(drop=False)['InvoiceDate']
transactions_per_user.loc[:, 'FirstPurchase'] = test.reset_index(drop=False)['InvoiceDate']

transactions_per_user[:5] ### user transactions

Unnamed: 0,CustomerID,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4,LastPurchase,FirstPurchase
0,12347,5,382.52,711.79,558.172,2790.86,32.40829,10.442659,8.676179,33.948317,14.524555,59,297
1,12348,4,227.44,892.8,449.31,1797.24,0.0,38.016069,0.0,61.983931,0.0,5,288
2,12350,1,334.4,334.4,334.4,334.4,0.0,11.692584,0.0,60.406699,27.900718,240,240
3,12352,6,144.35,840.3,345.663333,2073.98,15.711338,0.491808,14.301006,66.125517,3.370331,2,226
4,12353,1,89.0,89.0,89.0,89.0,0.0,0.0,22.359551,57.752809,19.88764,134,134


In [40]:
# see the number of transations per user 
n1 = transactions_per_user[transactions_per_user['count'] == 1].shape[0]
n2 = transactions_per_user.shape[0]

print(n1, n2) # see transactions

1445 3608


In [41]:
# make a list with the countsand categories and transform it to a matrix
list_cols = ['count', 'min', 'max', 'mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
        # ________________________________________________________
selected_customers = transactions_per_user.copy(deep=True)
matrix = selected_customers[list_cols].to_numpy()

matrix[:2] ## look 

array([[  5.        , 382.52      , 711.79      , 558.172     ,
         32.40828992,  10.44265925,   8.67617867,  33.94831701,
         14.52455516],
       [  4.        , 227.44      , 892.8       , 449.31      ,
          0.        ,  38.01606908,   0.        ,  61.98393092,
          0.        ]])

- The values in the matrix have varying scales.

In [42]:
# Normalize

scaler = StandardScaler()
scaler.fit(matrix)
scaled_matrix = scaler.transform(matrix)

In [43]:
# Reduce dimensions/size

pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)

INFO:root:sklearn.decomposition.PCA.fit: running accelerated version on CPU
INFO:root:sklearn.decomposition.PCA.transform: running accelerated version on CPU


In [44]:
# define clusters

n_clusters = 11
kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100)
kmeans.fit(scaled_matrix)
clusters_clients = kmeans.predict(scaled_matrix)
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)

pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns=['nb. of clients']).T ## the df


INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU


Unnamed: 0,1,0,7,10,2,4,6,9,5,3,8
nb. of clients,1551,502,334,295,291,265,191,153,10,9,7


- the number of clients in each cluster.

In [45]:
# use 6 k
pca = PCA(n_components=6)
matrix_3D = pca.fit_transform(scaled_matrix)
mat = pd.DataFrame(matrix_3D)
mat['cluster'] = pd.Series(clusters_clients)

mat['cluster'][:10] ### 1st 10 groups

INFO:root:sklearn.decomposition.PCA.fit: running accelerated version on CPU
INFO:root:sklearn.decomposition.PCA.fit_transform: running accelerated version on CPU


0    1
1    0
2    0
3    0
4    0
5    6
6    0
7    6
8    7
9    6
Name: cluster, dtype: int32

In [46]:
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)
        # define individual silouhette scores
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)

selected_customers.loc[:, 'cluster'] = clusters_clients

In [47]:
# merged df customers by cluster

merged_df = pd.DataFrame()
for i in range(n_clusters):
    test = pd.DataFrame(selected_customers[selected_customers['cluster'] == i].mean())
    test = test.T.set_index('cluster', drop=True)
    test['size'] = selected_customers[selected_customers['cluster'] == i].shape[0]
    merged_df = pd.concat([merged_df, test])
        # _____________________________________________________
merged_df.drop('CustomerID', axis=1, inplace=True)

merged_df = merged_df.sort_values('sum')

merged_df 

Unnamed: 0_level_0,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4,LastPurchase,FirstPurchase,size
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4.0,2.260377,193.966755,316.250415,247.336663,593.788566,6.284647,54.848316,5.4676,22.081663,11.317774,96.018868,165.969811,265
7.0,2.482036,191.252725,307.080689,243.361162,620.928204,12.991371,5.21424,53.155612,17.160023,11.495649,121.497006,190.401198,334
0.0,2.432271,198.114363,326.923946,255.382523,661.292114,6.594482,8.875318,7.387286,66.451745,10.694135,110.741036,179.149402,502
2.0,2.14433,202.483505,339.846014,264.699716,665.022405,51.272885,6.695432,10.81085,17.008286,14.265775,109.975945,170.539519,291
10.0,2.589831,211.130949,384.197458,293.812579,825.519864,7.122882,7.007173,7.362787,17.702003,60.805155,101.39661,193.284746,295
1.0,3.241779,223.412658,458.316222,331.950411,1081.924025,17.118909,13.616554,14.75253,32.757031,21.758797,81.972276,196.96325,1551
6.0,1.696335,1061.902932,1395.93189,1214.217326,2156.051576,17.198217,12.289439,13.678848,35.663985,21.169865,93.329843,141.39267,191
3.0,1.222222,3901.461111,4064.098889,3982.78,5032.047778,14.531229,26.370713,19.521209,27.724847,11.852002,62.888889,95.333333,9
9.0,18.27451,86.385098,1605.565556,574.686413,9949.940131,16.274523,12.143126,15.549971,30.4977,25.55516,17.189542,278.960784,153
8.0,92.0,10.985714,1858.25,374.601553,34845.105714,13.402971,13.117583,17.721038,33.256402,22.527857,1.428571,302.285714,7


In [48]:

liste_index = []
for i in range(5):
    COLUMN = f'categ_{i}'
    liste_index.append(merged_df[merged_df[COLUMN] > 45].index.values[0])
        # ___________________________________
liste_index_reordered = liste_index
liste_index_reordered += [s for s in merged_df.index if s not in liste_index]
        # __________________________________________________________
merged_df = merged_df.reindex(index=liste_index_reordered)
merged_df = merged_df.reset_index(drop=False)

if dataaugfactor != 0:
    selected_customers = pd.concat([selected_customers] * dataaugfactor, ignore_index=True)
else:
    selected_customers = pd.concat([selected_customers], ignore_index=True)

In [49]:
 # Save to a csv file
p, file = os.path.split(rawtraindata)
split_tup = os.path.splitext(file)
file_name = split_tup[0]
file_extension = split_tup[1]

if dataaugfactor > 0:
    SUFFIX = f'_aug_{dataaugfactor}'
else:
    SUFFIX = '_aug'
newdatafile = os.path.join(p, file_name + SUFFIX + file_extension)
print(f'Saving final filtered data to a csv file {newdatafile}...')
selected_customers.to_csv(newdatafile, index=False)
#else:
#    selected_customers = pd.read_csv(finaltraindata, encoding="ISO-8859-1",
#                                         dtype={'CustomerID': str, 'InvoiceID': str})

Saving final filtered data to a csv file data\data_aug.csv...


### 4. Model Building

### 4.1 Choose features and split data 

In [50]:
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
X = selected_customers[columns]
Y = selected_customers['cluster']

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, train_size=0.8)

INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU


In [51]:
print(X)

            mean    categ_0    categ_1    categ_2    categ_3    categ_4
0     558.172000  32.408290  10.442659   8.676179  33.948317  14.524555
1     449.310000   0.000000  38.016069   0.000000  61.983931   0.000000
2     334.400000   0.000000  11.692584   0.000000  60.406699  27.900718
3     345.663333  15.711338   0.491808  14.301006  66.125517   3.370331
4      89.000000   0.000000   0.000000  22.359551  57.752809  19.887640
...          ...        ...        ...        ...        ...        ...
3603  180.600000  24.833887   0.000000  34.025471   0.000000  41.140642
3604   80.820000   0.000000  18.708241   0.000000  62.360802  18.930958
3605   98.760000  13.669502  19.349939   0.000000  49.058323  17.922236
3606  108.683000   8.290165  17.204163   2.245061  37.352668  34.907943
3607  765.280000   1.960067   5.315701   4.442818  70.680013  17.601401

[3608 rows x 6 columns]


In [54]:
non_numeric_columns = X.select_dtypes(exclude=['int', 'float']).columns
print(non_numeric_columns)

Index([], dtype='object')


In [55]:
X = X.drop(columns=non_numeric_columns)

In [57]:
X = X.astype('float32')

### 4.2 Training and Inferencing

> Normal Sklearn package

`Data Preparation`

In [98]:
!python ./src/purchase-prediction-module.py -rtd data/data.csv -daf 20  

Reading raw data from csv file data/data.csv...


[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.fit: running accelerated version on CPU
INFO:root:sklearn.cluster.KMeans.predict: running accelerated version on CPU

Dataframe dimensions: (541909, 8)
Dataframe dimensions: (406829, 8)
Entrées dupliquées: 5225
Saving final filtered data to a csv file data\data_aug_20.csv...
====> Program exeuction time 2273.5686645507812 secs


`Training`

In [99]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -t 0 -s 1 -alg knn

Running KNeighborsClassifier ...

[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



====>KNeighborsClassifier Averrage Training Time with best tuned hyper parameters 0.8802092084884644 secs
====> Program exeuction time 883.6747047901154 secs


`Hyperparameter tuning`

In [100]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -t 1 -s 1 -alg knn

`Inference`

In [101]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -s 1 -inf knn_model

[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Running KNeighborsClassifier ...
Fitting 5 folds for each of 49 candidates, totalling 245 fits
====> KNeighborsClassifier Training Time with hyperparameters 712.2488193511963 secs
Saving the model ...
KNeighborsClassifier model 'knn_model.joblib' is saved in: /model 
====> Program exeuction time 716.4553408622742 secs
[CV 2/5; 1/49] START n_neighbors=1..............................................
[CV 2/5; 1/49] END ...............n_neighbors=1;, score=1.000 total time=   7.2s
[CV 3/5; 2/49] START n_neighbors=2..............................................
[CV 3/5; 2/49] END ...............n_neighbors=2;, score=1.000 total time=   7.5s
[CV 2/5; 3/49] START n_neighbors=3..............................................
[CV 2/5; 3/49] END ...............n_neighbors=3;, score=1.000 total time=   6.4s
[CV 1/5; 4/49] START n_neighbors=4..............................................
[CV 1/5; 4/49] END ...............n_neighbors=4;, score=1.000 total time=   6.6s
[CV 5/5; 4/49] START n_neighbors

[CV 4/5; 21/49] END .............n_neighbors=21;, score=0.961 total time=  16.6s
[CV 3/5; 22/49] START n_neighbors=22............................................
[CV 3/5; 22/49] END .............n_neighbors=22;, score=0.950 total time=  12.7s
[CV 2/5; 23/49] START n_neighbors=23............................................
[CV 2/5; 23/49] END .............n_neighbors=23;, score=0.934 total time=  11.3s
[CV 1/5; 24/49] START n_neighbors=24............................................
[CV 1/5; 24/49] END .............n_neighbors=24;, score=0.923 total time=  10.3s
[CV 5/5; 24/49] START n_neighbors=24............................................
[CV 5/5; 24/49] END .............n_neighbors=24;, score=0.922 total time=  11.5s
[CV 4/5; 25/49] START n_neighbors=25............................................
[CV 4/5; 25/49] END .............n_neighbors=25;, score=0.911 total time=  10.7s
[CV 3/5; 26/49] START n_neighbors=26............................................
[CV 3/5; 26/49] END ........

Model loading ...

[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



kNN model loaded successfully
====> KNeighborsClassifier Model Average Inference Time is 26.700090253353117 secs
====> Accuracy for kNN is: 100.0 % 
====> F1 score for kNN is: 1.0
====> Program exeuction time 2682.1091253757477 secs


> Intel package

`Patching`

In [103]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


`Training`

In [104]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -t 0 -s 0 -alg knn

Running KNeighborsClassifier ...
====>KNeighborsClassifier Averrage Training Time with best tuned hyper parameters 0.127608345746994 secs
====> Program exeuction time 136.29237365722656 secs


[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.K

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU


`Hyperameter Tuning`

In [105]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -t 1 -s 0 -alg knn

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

`Inferencing`

In [106]:
!python ./src/purchase-prediction-module.py -ftd data/data_aug_20.csv -s 0 -inf knn_model

Running KNeighborsClassifier ...
Fitting 5 folds for each of 49 candidates, totalling 245 fits
====> KNeighborsClassifier Training Time with hyperparameters 130.1439392566681 secs
Saving the model ...
KNeighborsClassifier model 'knn_model.joblib' is saved in: /model 
====> Program exeuction time 133.1111261844635 secs
[CV 2/5; 1/49] START n_neighbors=1..............................................
[CV 2/5; 1/49] END ...............n_neighbors=1;, score=1.000 total time=   3.1s
[CV 1/5; 2/49] START n_neighbors=2..............................................
[CV 1/5; 2/49] END ...............n_neighbors=2;, score=1.000 total time=   0.6s
[CV 4/5; 2/49] START n_neighbors=2..............................................
[CV 4/5; 2/49] END ...............n_neighbors=2;, score=1.000 total time=   0.5s
[CV 3/5; 3/49] START n_neighbors=3..............................................
[CV 3/5; 3/49] END ...............n_neighbors=3;, score=1.000 total time=   0.4s
[CV 2/5; 4/49] START n_neighbors

[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU



[CV 1/5; 31/49] START n_neighbors=31............................................
[CV 1/5; 31/49] END .............n_neighbors=31;, score=0.893 total time=   1.2s
[CV 4/5; 31/49] START n_neighbors=31............................................
[CV 4/5; 31/49] END .............n_neighbors=31;, score=0.887 total time=   1.6s
[CV 3/5; 32/49] START n_neighbors=32............................................
[CV 3/5; 32/49] END .............n_neighbors=32;, score=0.896 total time=   1.7s
[CV 4/5; 33/49] START n_neighbors=33............................................
[CV 4/5; 33/49] END .............n_neighbors=33;, score=0.889 total time=   1.9s
[CV 2/5; 34/49] START n_neighbors=34............................................
[CV 2/5; 34/49] END .............n_neighbors=34;, score=0.895 total time=   2.1s
[CV 3/5; 35/49] START n_neighbors=35............................................
[CV 3/5; 35/49] END .............n_neighbors=35;, score=0.900 total time=   1.7s
[CV 2/5; 36/49] START n_nei

[nltk_data] Downloading package punkt to C:\Users\Iman
[nltk_data]     Ngwepe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.model_selection.train_test_split: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn


kNN model loaded successfully
====> KNeighborsClassifier Model Average Inference Time is 0.2603182101249695 secs
====> Accuracy for kNN is: 100.0 % 
====> F1 score for kNN is: 1.0
====> Program exeuction time 32.06372261047363 secs



INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsClassifier.predict: running acc

In [107]:
# unpatching
#sklearnex.unpatch_sklearn()

In [None]:
# Comparison with 20 fold data
Training time 
Sklearn: 0.8802092084884644 secs
Sklearnex: 0.127608345746994 secs
    
Inference time:
Sklearn: 26.700090253353117 secs
Sklearnex: 0.2603182101249695 secs