<h3 align='center'> FAI Group F2 Purchase Prediction </h3>

## 1. Data Collection 

### 1.1 Import necessary packages 

In [2]:
import os
import sys
import argparse
import time
import datetime
import warnings
import logging
import numpy as np
import pandas as pd
import nltk
import joblib
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

### 1.2 Read the data 

In [3]:
data=pd.read_csv('data\data.csv',encoding='unicode-escape')
print(data.shape)
data.sample(5)

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
495459,578305,22784,LANTERN CREAM GAZEBO,18,11/23/2011 15:44,4.25,14088.0,United Kingdom
205785,554837,22784,LANTERN CREAM GAZEBO,2,5/26/2011 16:30,4.95,14584.0,United Kingdom
330472,565919,22117,METAL SIGN HER DINNER IS SERVED,2,9/7/2011 16:48,5.79,,United Kingdom
86389,543546,22268,EASTER DECORATION SITTING BUNNY,1,2/9/2011 16:52,1.63,,United Kingdom
214074,555564,82581,TOILET METAL SIGN,3,6/5/2011 15:01,0.55,15005.0,United Kingdom


In [4]:
# More information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
data['InvoiceNo'].unique(), data['StockCode'].unique()

(array(['536365', '536366', '536367', ..., '581585', '581586', '581587'],
       dtype=object),
 array(['85123A', '71053', '84406B', ..., '90214U', '47591b', '23843'],
       dtype=object))

- The Online Retail file has 541909 entries and 8 columns. There 4 object datatypes,`InvoiceNo`, `StockCode`, `Description` and `Country`, 1 interger `Quantity` and 2 floats `UnitPrice` and `CustomerID`. There are missing values in the `CustomerID` column.

### 2. Data Wrangling and Cleaning

In [6]:
# Declaring the path where the Models will be saved and loaded from
rfc_model = 'model/rfc_model.joblib'
knn_model = 'model/knn_model.joblib'
dtc_model = 'model/dtc_model.joblib'

In [7]:
# Hyperparamter tuning, saving & loading of Machine Learning models
def hyperparameter_tuning(algorithm, param_grid, kFold):
    "Hyper parameter tuning for the given algorithms"
    gcv = GridSearchCV(algorithm, param_grid=parameters, cv=kFold, verbose=10, n_jobs=-1)
    model = gcv.fit(X_train,Y_train)
    estimator = gcv.best_estimator_
    return estimator, model

def save_model(estimator, modelname):
    print("Saving the model ...")
    try:
        dump(estimator, modelname)
    except Exception as e:
        # Exception handler, alert the user
        raise IOError("Error saving model data to disk: {}".format(str(e))) from e

def load_model(modelname):
    try:
        print("Model loading ...")
        loaded_model = load(modelname)
    except Exception as e:
        raise IOError("Error loading model data from disk: {}".format(str(e))) from e
    return loaded_model

def iter_minibatches(chunksize):
    # Provide chunks one by one
    chunkstartmarker = 0
    #while chunkstartmarker < numtrainingpoints:
    while chunkstartmarker < len(X_train):
        #chunkrows = range(chunkstartmarker,chunkstartmarker+chunksize)
        chunkrows = chunkstartmarker+chunksize
        X_chunk, Y_chunk = X_train[chunkstartmarker:chunkrows], Y_train[chunkstartmarker:chunkrows]
        yield X_chunk, Y_chunk
        chunkstartmarker += chunksize

In [8]:
## helper class definition ???
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-rtd',
                        '--raw_train_data',
                        type=str,
                        required=False,
                        default='data/data.csv',
                        help='raw data csv file, if this parameter is specified then it will only perform the data preparation part')
    parser.add_argument('-daf',
                        '--data_aug_factor',
                        type=int,
                        required=False,
                        default='0',
                        help='data augmentation/multiplication factor, requires --raw-train-data parameter')
    parser.add_argument('-ftd',
                        '--final_train_data',
                        type=str,
                        required=False,
                        default='0',
                        help='final filtered data csv file, if this parameter is specified then it will skip the data preparation part')
    parser.add_argument('-t',
                        '--tuning',
                        type=str,
                        required=False,
                        default='0',
                        help='hyper parameter tuning (0/1). Along with Hyperparamter tuning, the model is saved ')
    parser.add_argument('-s',
                        '--stock',
                        type=str,
                        required=False,
                        default='0',
                        help='Use Stock Python (0/1)')
    parser.add_argument('-alg',
                        '--algorithm',
                        type=str,
                        required=False,
                        default='knn',
                        help='scikit learn classifier algorithm to be used (knn,dtc,rfc) \
                        - knn=KNearestNeighborClassifier, dtc=DecisionTreeClassifier, rfc=RandomForestClassifier')
    parser.add_argument('-b',
                        '--batch_size',
                        type=int,
                        required=False,
                        default=None,
                        help='hyper parameter tuning (0/1). Along with Hyperparamter tuning, the model is saved ')
    parser.add_argument('-inf',
                        '--inference',
                        type=str,
                        required=False,
                        default='0',
                        help='Perform Inference on the saved models.Specify the model file with path i.e knn_model or rfc_model or dtc_model')

In [9]:
# variables definitions ???
FLAGS = parser.parse_args()
    # pkg = FLAGS.package
finaltraindata = FLAGS.final_train_data
dataaugfactor = FLAGS.data_aug_factor
rawtraindata = FLAGS.raw_train_data
algorithm = FLAGS.algorithm
inference = FLAGS.inference
tuning = True if FLAGS.tuning == '1' else False
stock = True if FLAGS.stock == '1' else False
batch_size = FLAGS.batch_size

prgstime = time.time()

In [10]:
# import sklearn and intel packages

if stock is False:
    from sklearnex import patch_sklearn  # pylint: disable=import-error
    patch_sklearn()
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score
    from sklearn import model_selection
    from sklearn import metrics
    from sklearn.model_selection import GridSearchCV
    from sklearn import neighbors
    from sklearn import linear_model

    from sklearn import tree
    from sklearn import ensemble
    from sklearn.decomposition import PCA
    from joblib import load, dump

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [12]:
# read the csv file

df_initial = pd.read_csv('data/data.csv', encoding="ISO-8859-1",
                                     dtype={'CustomerID': str, 'InvoiceID': str})
    
print('Dataframe dimensions:', df_initial.shape)    
    

Dataframe dimensions: (541909, 8)


In [13]:
# check for null values

df_initial['InvoiceDate'] = pd.to_datetime(df_initial['InvoiceDate'])
        # ____________________________________________________________
        # gives some infos on columns types and numer of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}))
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T.
                                   rename(index={0: 'null values (%)'}))
df_initial.dropna(axis=0, subset=['CustomerID'], inplace=True)
print('Dataframe dimensions:', df_initial.shape)

Dataframe dimensions: (406829, 8)


In [14]:
# gives some information on columns types and number of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}))
tab_info = tab_info.append(pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T.
                                   rename(index={0: 'null values (%)'}))
print('Duplicate entries: {}'.format(df_initial.duplicated().sum()))
df_initial.drop_duplicates(inplace=True)

Duplicate entries: 5225


In [15]:
temp = df_initial[['CustomerID', 'InvoiceNo', 'Country']].groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
temp = temp.reset_index(drop=False)
countries = temp['Country'].value_counts()
countries

United Kingdom          19857
Germany                   603
France                    458
EIRE                      319
Belgium                   119
Spain                     105
Netherlands               101
Switzerland                71
Portugal                   70
Australia                  69
Italy                      55
Finland                    48
Sweden                     46
Norway                     40
Channel Islands            33
Japan                      28
Poland                     24
Denmark                    21
Cyprus                     20
Austria                    19
Singapore                  10
Malta                      10
Unspecified                 8
USA                         7
Iceland                     7
Israel                      6
Canada                      6
Greece                      6
Czech Republic              5
European Community          5
Lithuania                   4
United Arab Emirates        3
Saudi Arabia                2
Bahrain   

- In the data, there are 36 countries and 1 unspecified entry. Most of customers are from the UK.

### 2.1 Feature Engineering

In [16]:
# make temporary df showing CustomerID, InvoiceNo, and InvoiceDate by count
pd.DataFrame([{'products': len(df_initial['StockCode'].value_counts()),
                       'transactions': len(df_initial['InvoiceNo'].value_counts()),
                       'customers': len(df_initial['CustomerID'].value_counts()),
                       }], columns=['products', 'transactions', 'customers'], index=['quantity']
                     )
temp = df_initial.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()

temp[:5] # first 5 entries of temp df

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate
0,12346,541431,1
1,12346,C541433,1
2,12347,537626,31
3,12347,542237,29
4,12347,549222,24


In [17]:
# from the invoice date get the number of cancelled orders
nb_products_per_basket = temp.rename(columns={'InvoiceDate': 'Number of products'})
nb_products_per_basket[:10].sort_values('CustomerID')

nb_products_per_basket['order_canceled'] = nb_products_per_basket['InvoiceNo'].apply(lambda x: int('C' in x))

n1 = nb_products_per_basket['order_canceled'].sum()
n2 = nb_products_per_basket.shape[0]

print(n1, n2) # number of orders cancelled vs ordered in the basket

3654 22190


In [18]:
# make df_check to check the quantity that was less than 0
df_check = df_initial[df_initial['Quantity'] < 0][['CustomerID', 'Quantity',
                                                           'StockCode', 'Description', 'UnitPrice']]
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col[0]) & (df_initial['Quantity'] == -col[1]) &
                          (df_initial['Description'] == col[2])].shape[0] == 0:
        break

df_check = df_initial[(df_initial['Quantity'] < 0) & (df_initial['Description'] != 'Discount')][
                                         ['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]

df_check[:5] # 1st five

Unnamed: 0,CustomerID,Quantity,StockCode,Description,UnitPrice
154,15311,-1,35004C,SET OF 3 COLOURED FLYING DUCKS,4.65
235,17548,-12,22556,PLASTERS IN TIN CIRCUS PARADE,1.65
236,17548,-24,21984,PACK OF 12 PINK PAISLEY TISSUES,0.29
237,17548,-24,21983,PACK OF 12 BLUE PAISLEY TISSUES,0.29
238,17548,-24,21980,PACK OF 12 RED RETROSPOT TISSUES,0.29


In [19]:
# clean the df by introducing a new column of cancelled orders and give them a value of 0
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col[0]) & (df_initial['Quantity'] == -col[1]) &
                          (df_initial['Description'] == col[2])].shape[0] == 0:
        break

df_cleaned = df_initial.copy(deep=True)
df_cleaned['QuantityCanceled'] = 0

df_cleaned[:5]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,QuantityCanceled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0


In [None]:
# clean the df by removing doubtful entries

entry_to_remove = []
doubtful_entry = []

for index, col in df_initial.iterrows():
    if (col['Quantity'] > 0) or col['Description'] == 'Discount':
        continue
    df_test = df_initial[(df_initial['CustomerID'] == col['CustomerID'])
                         & (df_initial['StockCode'] == col['StockCode'])
                         & (df_initial['InvoiceDate'] < col['InvoiceDate'])
                         & (df_initial['Quantity'] > 0)].copy()
            # _________________________________
            # Cancelation WITHOUT counterpart
    if df_test.shape[0] == 0:
        doubtful_entry.append(index)
            # ________________________________
            # Cancelation WITH a counterpart
    elif df_test.shape[0] == 1:
        index_order = df_test.index[0]
        df_cleaned.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
            # ______________________________________________________________
            # Various counterparts exist in orders: we delete the last one
    elif df_test.shape[0] > 1:
        df_test.sort_index(axis=0, ascending=False, inplace=True)
        for ind, val in df_test.iterrows():
            if val['Quantity'] < -col['Quantity']:
                continue
            df_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break

df_cleaned.drop(entry_to_remove, axis=0, inplace=True)
df_cleaned.drop(doubtful_entry, axis=0, inplace=True)
remaining_entries = df_cleaned[(df_cleaned['Quantity'] < 0) & (df_cleaned['StockCode'] != 'D')]

remaining_entries[:5] # 1st five remaining


In [None]:
# introduce the total price by calculation, multiply UnitPrice with Quantity and subtract the QuantityCanceled

# df_cleaned[(df_cleaned['CustomerID'] == 14048) & (df_cleaned['StockCode'] == '22464')]
list_special_codes = df_cleaned[df_cleaned['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'] * (df_cleaned['Quantity'] - df_cleaned['QuantityCanceled'])

df_cleaned.sort_values('CustomerID')[:5] # cleaned sorted values

In [None]:
# Rename TotalPrice to BasketPrice and choose Prices above 0

# somme des achats / utilisateur & commande
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns={'TotalPrice': 'Basket Price'})
        # date de la commande
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis=1, inplace=True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
        # selection des entrées significatives:
basket_price = basket_price[basket_price['Basket Price'] > 0]

basket_price.sort_values('CustomerID')[:5] # basket price sorted values


In [None]:
# Define price ranges to work with from the BasketPrice, initialize empty count_price list and append the prices to the list

# Décompte des achats
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
for i, price in enumerate(price_range):
    if i == 0:
        continue
    val = basket_price[(basket_price['Basket Price'] < price) & (basket_price['Basket Price'] > price_range[i - 1])]['Basket Price'].count()
    count_price.append(val)

### 2.3 NLP

In [None]:
# make function to check for nouns
def is_noun(pos):
    '''Noun validation'''
    return pos[:2] == 'NN'

        # is_noun = lambda pos: pos[:2] == 'NN'

In [None]:
# make a keywords inventory function
def keywords_inventory(dataframe, colonne='Description'):
    '''Stemming'''
    stemmer = nltk.stem.SnowballStemmer("english")
    keywords_roots = dict()  # collect the words / root
    keywords_select = dict()  # association: root <-> keyword
    category_keys = []
    count_keywords = dict()
            # icount = 0
    for s in dataframe[colonne]:
        if pd.isnull(s):
            continue
        lines = s.lower()
        tokenized = nltk.word_tokenize(lines)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]

        for t in nouns:
            t = t.lower()
            racine = stemmer.stem(t)
            if racine in keywords_roots:
                keywords_roots[racine].add(t)
                count_keywords[racine] += 1
            else:
                keywords_roots[racine] = {t}
                count_keywords[racine] = 1

    for s in keywords_roots.keys():
        if len(keywords_roots[s]) > 1:
            min_length = 1000
            for k in keywords_roots[s]:
                if len(k) < min_length:
                    clef = k
                    min_length = len(k)
            category_keys.append(clef)
            keywords_select[s] = clef
        else:
            category_keys.append(list(keywords_roots[s])[0])
            keywords_select[s] = list(keywords_roots[s])[0]
            
    return category_keys, keywords_roots, keywords_select, count_keywords


In [None]:
# make a df isolating only the product description
df_produits = pd.DataFrame(df_initial['Description'].unique()).rename(columns={0: 'Description'})

keywords, keywords_roots, keywords_select, count_keywords = keywords_inventory(df_produits)

df_produits[:2] # 1st 2 products

In [None]:
# intialize empty list_products list and append keywords from Description to it

list_products = []

for k, v in count_keywords.items():
    list_products.append([keywords_select[k], v])
list_products.sort(key=lambda x: x[1], reverse=True)

liste = sorted(list_products, key=lambda x: x[1], reverse=True)

list_products = []
for k, v in count_keywords.items():
    word = keywords_select[k]
    if word in ['pink', 'blue', 'tag', 'green', 'orange']:
        continue
    if len(word) < 3 or v < 13:
        continue
    if ('+' in word) or ('/' in word):
        continue
    list_products.append([word, v])
        # ______________________________________________________
list_products.sort(key=lambda x: x[1], reverse=True)

liste_produits = df_cleaned['Description'].unique()

liste_produits[:2] # 1st 2

In [None]:
list_products[:3] # list of products

In [None]:
# make 

X = pd.DataFrame()
for key, occurence in list_products:
    X.loc[:, key] = list(map(lambda x: int(key.upper() in x), liste_produits))

threshold = [0, 1, 2, 3, 5, 10]
label_col = []
for i in range(len(threshold)):
    if i == len(threshold) - 1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i], threshold[i + 1])
    label_col.append(col)
    X.loc[:, col] = 0

for i, prod in enumerate(liste_produits):
    prix = df_cleaned[df_cleaned['Description'] == prod]['UnitPrice'].mean()
    j = 0
    while prix > threshold[j]:
        j = j + 1
        if j == len(threshold):
            break
    X.loc[i, label_col[j - 1]] = 1

for i in range(len(threshold)):
    if i == len(threshold) - 1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i], threshold[i + 1])


### 3. Preprocessing

In [None]:
# transform X to a matrix and use KMeans to make 5 clusters

matrix = X.to_numpy()  # as_matrix()
for n_clusters in range(3, 10):
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)

n_clusters = 5
silhouette_avg = -1
while silhouette_avg < 0.145:
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)

k_value_c = pd.Series(clusters).value_counts()    
#pd.Series(clusters).value_counts()

k_value_c ## look at the series

In [None]:
# define individual silouhette scores
sample_silhouette_values = silhouette_samples(matrix, clusters)

liste = pd.DataFrame(liste_produits)
liste_words = [word for (word, occurence) in list_products]

occurence = [dict() for _ in range(n_clusters)]

for i in range(n_clusters):
    liste_cluster = liste.loc[clusters == i]
    for word in liste_words:
        if word in ['art', 'set', 'heart', 'pink', 'blue', 'tag']:
            continue
        occurence[i][word] = sum(liste_cluster.loc[:, 0].str.contains(word.upper()))

In [None]:
# use PCA to reduce dimensions
pca = PCA()
pca.fit(matrix)
pca_samples = pca.transform(matrix)

pca_samples

In [None]:
pca = PCA(n_components=50)
matrix_9D = pca.fit_transform(matrix)
mat = pd.DataFrame(matrix_9D)
mat['cluster'] = pd.Series(clusters)

mat['cluster'][:10]##1st 10

In [None]:
# from the 5 clusters create 5 categories, from the main category 
corresp = dict()
for key, val in zip(liste_produits, clusters):
    corresp[key] = val
        # __________________________________________________________________
df_cleaned['categ_product'] = df_cleaned.loc[:, 'Description'].map(corresp)

for i in range(5):
    col = 'categ_{}'.format(i)
    df_temp = df_cleaned[df_cleaned['categ_product'] == i]
    price_temp = df_temp['UnitPrice'] * (df_temp['Quantity'] - df_temp['QuantityCanceled'])
    price_temp = price_temp.apply(lambda x: x if x > 0 else 0)
    df_cleaned.loc[:, col] = price_temp
    df_cleaned[col].fillna(0, inplace=True)

In [None]:
df_cleaned[['InvoiceNo', 'Description', 'categ_product', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']][:5] ### look at these columns

In [None]:
#

# somme des achats / utilisateur & commande
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns={'TotalPrice': 'Basket Price'})
        # ___________________________________________________________
        # pourcentage du prix de la commande / categorie de produit
for i in range(5):
    col = 'categ_{}'.format(i)
    temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)[col].sum()
    basket_price.loc[:, col] = temp[col]

In [None]:
# merge the categories with the customerID df

# date de la commande
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis=1, inplace=True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
        # ______________________________________
        # selection des entrées significatives:
basket_price = basket_price[basket_price['Basket Price'] > 0]

basket_price.sort_values('CustomerID', ascending=True)[:5] ## basket price sorted

In [None]:
# make a new df, consisting of customer, transations(aggregated) and the categories. To see the customer spent habit

set_entrainement = basket_price[basket_price['InvoiceDate'] < pd.to_datetime(datetime.date(2011, 10, 1))]
set_test = basket_price[basket_price['InvoiceDate'] >= pd.to_datetime(datetime.date(2011, 10, 1))]
basket_price = set_entrainement.copy(deep=True)

        # nb de visites et stats sur le montant du panier / utilisateurs
transactions_per_user = basket_price.groupby(by=['CustomerID'])['Basket Price'].agg(['count', 'min', 'max', 'mean', 'sum'])
for i in range(5):
    col = 'categ_{}'.format(i)
    transactions_per_user.loc[:, col] = basket_price.groupby(by=['CustomerID'])[col].sum() / transactions_per_user['sum'] * 100

transactions_per_user.reset_index(drop=False, inplace=True)
basket_price.groupby(by=['CustomerID'])['categ_0'].sum()

transactions_per_user.sort_values('CustomerID', ascending=True)[:5] ### sorted user transactions

In [None]:
# introduce the date, to the above df 
last_date = basket_price['InvoiceDate'].max().date()

first_registration = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].min())
last_purchase = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].max())

test = first_registration.applymap(lambda x: (last_date - x.date()).days)
test2 = last_purchase.applymap(lambda x: (last_date - x.date()).days)

transactions_per_user.loc[:, 'LastPurchase'] = test2.reset_index(drop=False)['InvoiceDate']
transactions_per_user.loc[:, 'FirstPurchase'] = test.reset_index(drop=False)['InvoiceDate']

transactions_per_user[:5] ### user transactions

In [None]:
# see the number of transations per user 
n1 = transactions_per_user[transactions_per_user['count'] == 1].shape[0]
n2 = transactions_per_user.shape[0]

print(n1, n2) # see transactions

In [None]:
# make a list with the countsand categories and transform it to a matrix
list_cols = ['count', 'min', 'max', 'mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
        # ________________________________________________________
selected_customers = transactions_per_user.copy(deep=True)
matrix = selected_customers[list_cols].to_numpy()

matrix[:2] ## look 

- The values in the matrix have varying scales.

In [None]:
# Normalize

scaler = StandardScaler()
scaler.fit(matrix)
scaled_matrix = scaler.transform(matrix)

In [None]:
# Reduce dimensions/size

pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)

In [None]:
# define clusters

n_clusters = 11
kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100)
kmeans.fit(scaled_matrix)
clusters_clients = kmeans.predict(scaled_matrix)
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)

pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns=['nb. of clients']).T ## the df


- the number of clients in each cluster.

In [None]:
# use 6 k
pca = PCA(n_components=6)
matrix_3D = pca.fit_transform(scaled_matrix)
mat = pd.DataFrame(matrix_3D)
mat['cluster'] = pd.Series(clusters_clients)

mat['cluster'][:10] ### 1st 10 groups

In [None]:
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)
        # define individual silouhette scores
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)

selected_customers.loc[:, 'cluster'] = clusters_clients

In [None]:
# merged df customers by cluster

merged_df = pd.DataFrame()
for i in range(n_clusters):
    test = pd.DataFrame(selected_customers[selected_customers['cluster'] == i].mean())
    test = test.T.set_index('cluster', drop=True)
    test['size'] = selected_customers[selected_customers['cluster'] == i].shape[0]
    merged_df = pd.concat([merged_df, test])
        # _____________________________________________________
merged_df.drop('CustomerID', axis=1, inplace=True)

merged_df = merged_df.sort_values('sum')

merged_df 

In [None]:

liste_index = []
for i in range(5):
    COLUMN = f'categ_{i}'
    liste_index.append(merged_df[merged_df[COLUMN] > 45].index.values[0])
        # ___________________________________
liste_index_reordered = liste_index
liste_index_reordered += [s for s in merged_df.index if s not in liste_index]
        # __________________________________________________________
merged_df = merged_df.reindex(index=liste_index_reordered)
merged_df = merged_df.reset_index(drop=False)

if dataaugfactor != 0:
    selected_customers = pd.concat([selected_customers] * dataaugfactor, ignore_index=True)
else:
    selected_customers = pd.concat([selected_customers], ignore_index=True)

In [None]:
 # Save to a csv file
p, file = os.path.split(rawtraindata)
split_tup = os.path.splitext(file)
file_name = split_tup[0]
file_extension = split_tup[1]

if dataaugfactor > 0:
    SUFFIX = f'_aug_{dataaugfactor}'
else:
    SUFFIX = '_aug'
newdatafile = os.path.join(p, file_name + SUFFIX + file_extension)
print(f'Saving final filtered data to a csv file {newdatafile}...')
selected_customers.to_csv(newdatafile, index=False)
#else:
#    selected_customers = pd.read_csv(finaltraindata, encoding="ISO-8859-1",
#                                         dtype={'CustomerID': str, 'InvoiceID': str})

### 4. Model Building

### 4.1 Choose features and split data 

In [None]:
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
X = selected_customers[columns]
Y = selected_customers['cluster']

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, train_size=0.8)

In [None]:
print(X)

### 4.2. Iterate through different models

In [51]:
if algorithm == 'knn':
                # '''Algorithm = KNeighborsClassifier'''
        print('Running KNeighborsClassifier ...')
                #knn = ClassFit(clf=neighbors.KNeighborsClassifier)
        knn = neighbors.KNeighborsClassifier(n_jobs=-1)
    
        if tuning is True:
            stime = time.time()
            parameters={'n_neighbors' : np.arange(1, 50, 1)}
            estimator, model  = hyperparameter_tuning(algorithm=knn, param_grid=parameters , kFold=5)
            print(f'====> KNeighborsClassifier Training Time with hyperparameters {time.time()-stime} secs')
            save_model(estimator, modelname=knn_model) # model is saved & loaded using joblib
            print("KNeighborsClassifier model 'knn_model.joblib' is saved in: /model ")

        else:
            if batch_size is None:
                tuned_params = {'n_neighbors': 1}
                tuned_model = knn
                tuned_model.set_params(**tuned_params)
                total_time = 0
                for i in range(1000):
                    stime = time.time()
                    tuned_model.fit(X_train, Y_train)
                    total_time += time.time()-stime
                
                print(f'====>KNeighborsClassifier Average Training Time with best tuned hyper parameters {total_time/1000} secs')


Running KNeighborsClassifier ...


INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

====>KNeighborsClassifier Average Training Time with best tuned hyper parameters 0.035468908548355105 secs


In [55]:
if tuning is not True and batch_size is not None:
    batchiterator = iter_minibatches(chunksize=batch_size)
                    # Tuned hyper parameter training for KNN
    tuned_params = {'n_neighbors': 1}
    tuned_model = neighbors.KNeighborsClassifier(n_jobs = -1)
    tuned_model.set_params(**tuned_params)
    total_time = 0
    total_batches = 0
    for X_batch, Y_batch in batchiterator:
        stime = time.time()
        tuned_model.fit(X_batch, Y_batch)
        total_time += time.time() - stime
        total_batches += 1

    print(f'====> KNeighborsClassifier Training Time with batch size {batch_size} is {total_time} secs')
    print(f'====>Average Training Time for {total_batches} batches is {total_time/total_batches} secs')

In [57]:
if algorithm == 'dtc': 
    print('Running DecisionTreeClassifier ...')
    dtc = tree.DecisionTreeClassifier()
    
    if tuning is True:
        stime = time.time()
        parameters={'criterion': ['entropy', 'gini'], 'max_features': ['sqrt', 'log2']}
        estimator, model = hyperparameter_tuning(algorithm=dtc, param_grid=parameters , kFold=5)
        print(f'====> DecisionTreeClassifier Training Time with hyperparameters {time.time()-stime} secs')
        save_model(estimator, modelname=dtc_model) # model is saved & loaded using joblib
        print("DecisionTreeClassifier model 'dtc_model.joblib'is saved in: /model ")
    else:
        stime = time.time()
        dtc.fit(X_train, Y_train)
        print(f'====> DecisionTreeClassifier Training Time is {time.time()-stime} secs')

In [58]:
if algorithm == 'rfc':
                # '''Algorithm = RandomForestClassifier'''
    print('Running RandomForestClassifier ...')
    rfc = ensemble.RandomForestClassifier(n_jobs=-1)
    
    if tuning is True:
                    #parameters = {'criterion': ['entropy', 'gini'], 'n_estimators': [20, 40, 60, 80, 100],
        parameters = {'criterion': ['gini'],
                                  'n_estimators': [20, 40, 60, 80, 100],
                                  'max_features': ['sqrt', 'log2']}
        stime = time.time()
        estimator, model = hyperparameter_tuning(algorithm=rfc, param_grid=parameters, kFold=5)
        print(f'====> RandomForestClassifier Training Time with hyperparameters {time.time()-stime} secs')
        save_model(estimator, modelname=rfc_model)  # model is saved & loaded using joblib
        print("====> RandomForestClassifier model 'rfc_model.joblib'is saved in: /model ")

    else:
        if batch_size is None:
                        # Tuned hyper parameter training for RFC
            tuned_params = {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}
            tuned_model_rf = rfc
            tuned_model_rf.set_params(**tuned_params)
            stime = time.time()
            tuned_model_rf.fit(X_train, Y_train)
            print(f'====> RandomForestClassifier Training Time with best tuned hyper parameters {time.time()-stime} secs')
                        #print(f'====> RandomForestClassifier Training Time {time.time()-stime} secs')

In [59]:
if tuning is not True and batch_size is not None:

    batchiterator = iter_minibatches(chunksize=batch_size)
                    # Tuned hyper parameter training for RFC
    tuned_params = {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}
    tuned_model_rf = ensemble.RandomForestClassifier(n_jobs = -1)
    tuned_model_rf.set_params(**tuned_params)
    total_time = 0
    total_batches = 0
    for X_batch, Y_batch in batchiterator:
        print ("Length X_batch", len(X_batch))
        stime = time.time()
        tuned_model_rf.fit(X_batch, Y_batch)
        total_time += time.time() - stime
        print ("total time is ", total_time)
        total_batches += 1

    print(f'====>RandomForestClassifier Training Time with batch size {batch_size} is {total_time} secs')
    print(f'====>Average Training Time for {total_batches} batches is {total_time/total_batches} secs')

In [63]:
if inference == '0':
    if inference == 'knn_model':
        X_test = X
        Y_test = Y
        loaded_model = load_model(knn_model)
        print("kNN model loaded successfully")
        total_time = 0
        for i in range(100):
            stime = time.time()
            predictions = loaded_model.predict(X_test)
            total_time += time.time() - stime
        print(f'====> KNeighborsClassifier Model Average Inference Time is {total_time/100} secs')
        print(f"====> Accuracy for kNN is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for kNN is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
                
    elif inference == 'dtc_model':
        X_test = X
        Y_test = Y
        loaded_model = load_model(dtc_model)
        print("Decision Tree Classifier model loaded successfully")
        stime = time.time()
        predictions = loaded_model.predict(X_test)
        print(f'====> Decision Tree Classifier Model Inference Time is {time.time()-stime} secs')
        print(f"====> Accuracy for DTC is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for DTC is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
            
    elif inference == 'rfc_model':
        X_test = X
        Y_test = Y
        loaded_model= load_model(rfc_model)
        print("Random Forest Classifier model loaded successfully")
        stime = time.time()
        predictions = loaded_model.predict(X_test)
        print(f'====> Random Forest Classifier Model Inference Time is {time.time()-stime} secs')
        print(f"====> Accuracy for RFC is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for RFC is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
    else:
        print("====> Please check whether the correct model file name is passed or not!")
               
print(f'====> Program exeuction time {time.time()-prgstime} secs')

====> Please check whether the correct model file name is passed or not!
====> Program exeuction time 3198.0039706230164 secs


In [64]:
if inference == '0':
            # Regular Training / hyperparamter tuning & model saving
    if algorithm == 'knn':
                # '''Algorithm = KNeighborsClassifier'''
        print('Running KNeighborsClassifier ...')
                #knn = ClassFit(clf=neighbors.KNeighborsClassifier)
        knn = neighbors.KNeighborsClassifier(n_jobs=-1)
    
        if tuning is True:
            stime = time.time()
            parameters={'n_neighbors' : np.arange(1, 50, 1)}
            estimator, model  = hyperparameter_tuning(algorithm=knn, param_grid=parameters , kFold=5)
            print(f'====> KNeighborsClassifier Training Time with hyperparameters {time.time()-stime} secs')
            save_model(estimator, modelname=knn_model) # model is saved & loaded using joblib
            print("KNeighborsClassifier model 'knn_model.joblib' is saved in: /model ")

        else:
            if batch_size is None:
                tuned_params = {'n_neighbors': 1}
                tuned_model = knn
                tuned_model.set_params(**tuned_params)
                total_time = 0
                for i in range(1000):
                    stime = time.time()
                    tuned_model.fit(X_train, Y_train)
                    total_time += time.time()-stime
                
                print(f'====>KNeighborsClassifier Average Training Time with best tuned hyper parameters {total_time/1000} secs')

        if tuning is not True and batch_size is not None:

            batchiterator = iter_minibatches(chunksize=batch_size)
                    # Tuned hyper parameter training for KNN
            tuned_params = {'n_neighbors': 1}
            tuned_model = neighbors.KNeighborsClassifier(n_jobs = -1)
            tuned_model.set_params(**tuned_params)
            total_time = 0
            total_batches = 0
            for X_batch, Y_batch in batchiterator:
                stime = time.time()
                tuned_model.fit(X_batch, Y_batch)
                total_time += time.time() - stime
                total_batches += 1

            print(f'====> KNeighborsClassifier Training Time with batch size {batch_size} is {total_time} secs')
            print(f'====>Average Training Time for {total_batches} batches is {total_time/total_batches} secs')
    
    elif algorithm == 'dtc':
                # '''Algorithm = DecisionTreeClassifier'''
        print('Running DecisionTreeClassifier ...')
        dtc = tree.DecisionTreeClassifier()
    
        if tuning is True:
            stime = time.time()
            parameters={'criterion': ['entropy', 'gini'], 'max_features': ['sqrt', 'log2']}
            estimator, model = hyperparameter_tuning(algorithm=dtc, param_grid=parameters , kFold=5)
            print(f'====> DecisionTreeClassifier Training Time with hyperparameters {time.time()-stime} secs')
            save_model(estimator, modelname=dtc_model) # model is saved & loaded using joblib
            print("DecisionTreeClassifier model 'dtc_model.joblib'is saved in: /model ")
        else:
            stime = time.time()
            dtc.fit(X_train, Y_train)
            print(f'====> DecisionTreeClassifier Training Time is {time.time()-stime} secs')
    
    elif algorithm == 'rfc':
                # '''Algorithm = RandomForestClassifier'''
        print('Running RandomForestClassifier ...')
        rfc = ensemble.RandomForestClassifier(n_jobs=-1)
    
        if tuning is True:
                    #parameters = {'criterion': ['entropy', 'gini'], 'n_estimators': [20, 40, 60, 80, 100],
            parameters = {'criterion': ['gini'],
                                  'n_estimators': [20, 40, 60, 80, 100],
                                  'max_features': ['sqrt', 'log2']}
            stime = time.time()
            estimator, model = hyperparameter_tuning(algorithm=rfc, param_grid=parameters, kFold=5)
            print(f'====> RandomForestClassifier Training Time with hyperparameters {time.time()-stime} secs')
            save_model(estimator, modelname=rfc_model)  # model is saved & loaded using joblib
            print("====> RandomForestClassifier model 'rfc_model.joblib'is saved in: /model ")

        else:
            if batch_size is None:
                        # Tuned hyper parameter training for RFC
                tuned_params = {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}
                tuned_model_rf = rfc
                tuned_model_rf.set_params(**tuned_params)
                stime = time.time()
                tuned_model_rf.fit(X_train, Y_train)
                print(f'====> RandomForestClassifier Training Time with best tuned hyper parameters {time.time()-stime} secs')
                        #print(f'====> RandomForestClassifier Training Time {time.time()-stime} secs')

        if tuning is not True and batch_size is not None:

            batchiterator = iter_minibatches(chunksize=batch_size)
                    # Tuned hyper parameter training for RFC
            tuned_params = {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}
            tuned_model_rf = ensemble.RandomForestClassifier(n_jobs = -1)
            tuned_model_rf.set_params(**tuned_params)
            total_time = 0
            total_batches = 0
            for X_batch, Y_batch in batchiterator:
                print ("Length X_batch", len(X_batch))
                stime = time.time()
                tuned_model_rf.fit(X_batch, Y_batch)
                total_time += time.time() - stime
                print ("total time is ", total_time)
                total_batches += 1

            print(f'====>RandomForestClassifier Training Time with batch size {batch_size} is {total_time} secs')
            print(f'====>Average Training Time for {total_batches} batches is {total_time/total_batches} secs')

    elif inference == 'knn_model':
        X_test = X
        Y_test = Y
        loaded_model = load_model(knn_model)
        print("kNN model loaded successfully")
        total_time = 0
        for i in range(100):
            stime = time.time()
            predictions = loaded_model.predict(X_test)
            total_time += time.time() - stime
        print(f'====> KNeighborsClassifier Model Average Inference Time is {total_time/100} secs')
        print(f"====> Accuracy for kNN is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for kNN is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
                
    elif inference == 'dtc_model':
        X_test = X
        Y_test = Y
        loaded_model = load_model(dtc_model)
        print("Decision Tree Classifier model loaded successfully")
        stime = time.time()
        predictions = loaded_model.predict(X_test)
        print(f'====> Decision Tree Classifier Model Inference Time is {time.time()-stime} secs')
        print(f"====> Accuracy for DTC is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for DTC is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
            
    elif inference == 'rfc_model':
        X_test = X
        Y_test = Y
        loaded_model= load_model(rfc_model)
        print("Random Forest Classifier model loaded successfully")
        stime = time.time()
        predictions = loaded_model.predict(X_test)
        print(f'====> Random Forest Classifier Model Inference Time is {time.time()-stime} secs')
        print(f"====> Accuracy for RFC is: {100 * metrics.accuracy_score(Y_test, predictions)} % ")
        print(f"====> F1 score for RFC is: { metrics.f1_score(Y_test,predictions,average = 'micro')}")
    else:
        print("====> Please check whether the correct model file name is passed or not!")
               
print(f'====> Program exeuction time {time.time()-prgstime} secs')


INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU


Running KNeighborsClassifier ...


INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU
INFO:root:sklearn.neighbors.KNeighborsMixin.kneighbors: running accelerated version on CPU

====>KNeighborsClassifier Average Training Time with best tuned hyper parameters 0.031798996448516845 secs
====> Program exeuction time 3239.256328344345 secs


In [None]:
import matplotlib.pyplot as plt
import numpy as np
