# Data Import & Connection to BigQuery

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from google.cloud import bigquery
from itertools import product
from functions import *

# set display options to show all columns
pd.set_option('display.max_columns', None)
# Set the float format to display numbers without scientific notation
pd.options.display.float_format = '{:.2f}'.format
# Set the client for future queries to BigQuery
client = bigquery.Client(project = "continente-lced-feup")



In [2]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=Dr2IITnrICIQrJOOzN9Z7Hmj4bmhJR&access_type=offline&code_challenge=WAX8C2fRYBfGpW4cfLnvzIqFWjlOuAmgl704PtbszGc&code_challenge_method=S256


Credentials saved to file: [C:\Users\luish\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Cannot find a quota project to add to ADC. You might receive a "quota exceeded" or "API not enabled" error. Run $ gcloud auth application-default set-quota-project to add a 

# Data Loading

In [43]:
query = client.query("""
   SELECT *
   FROM 
       tables_raw.dim_customer 
       JOIN tables_raw.fact_transaction USING(CUSTOMER_ACCOUNT_NR_MASK)
       JOIN tables_raw.dim_product USING(SKU)
    WHERE 
        SUBCAT_CD_EXT IN (140304, 50401, 30301, 20201, 10301, 80103, 60102, 60401, 30401, 10101, 100102, 100204, 140204, 
        20302, 30201, 50203, 90201, 170101, 80404, 50402, 70202, 100201, 80409, 10303, 20306, 80411, 10102, 20305, 60105, 80110, 
        140301, 30202, 90202, 100101, 80105, 80104, 50202, 50303, 70204, 60306, 80403, 10302, 10201, 80405, 170304, 170303, 80406, 
        140201, 60302, 30403, 30304, 20204, 170106, 140205, 10204, 60404, 50301, 50302, 20205, 60406, 20301, 80407, 20203, 70201, 100205,
        60106, 170302, 50201, 60301, 10205, 30203, 80401, 100202, 30302, 170111, 10202, 70203, 60303, 170109, 60403, 30402, 140302, 30208, 60307, 
        80107, 50403, 60103, 20307, 60305, 60101, 170307, 80414, 80415, 60405, 20303, 80402, 30204, 30206, 170310, 60304, 140206, 10203, 30205, 60107, 
        70206, 170108, 90203, 90204, 30207, 140303, 30303, 80408, 140202, 50304, 80101, 170313, 100203, 60402, 170305, 50305, 50404, 20202, 170110, 
        170105, 170112, 170301, 10206, 10208, 20304, 80102, 70205, 10207, 10305, 170309, 170114, 80111, 90206, 30306, 30305, 140203, 80413) 
        AND SEG_AGE_DSC = ']25;35]'
        AND QTY >= 0
    ORDER BY CUSTOMER_ACCOUNT_NR_MASK ASC, TIME_KEY DESC
   """)

df = query.result().to_dataframe() # Wait for the job to complete.

# Data Preparation (more to be done...)

In [44]:
df = df.drop(columns=['LOC_BRAND_CD','PROD_DSCNT_ISSUED_AMT','NET_SLS_AMT','TRANS_DSCNT_RAT_AMT','DIRECT_DSCNT_AMT',
                 'seg_lifestyle_dsc','SEG_AGE','SEG_AGE_DSC','seg_lifestage_dsc',
                 'UNIT_BASE_DSC_EXT','SUBCAT_DSC_EXT','BIZ_UNIT_DSC_EXT','DEPARTMENT_DSC_EXT',
                'PRODUCT_SHORT_DSC','BRAND_DSC','BRAND_TYPE_CD','CONVERSION_FACTOR','CAPACITY_UNIT','PRODUCT_DSC','SKU',
                'LOCATION_CD','GROSS_SLS_AMT','CP4','CAT_DSC_EXT','PRODUCT_KEY'])

In [45]:
df.isnull().sum()

CUSTOMER_ACCOUNT_NR_MASK         0
GENDER                      207890
FAMILY_MEMBERS              778011
seg_lifestyle_cd                 0
seg_lifestage_cd                 0
TIME_KEY                         0
TRANSACTION_ID_MASK              0
POS_TP_CD                        0
QTY                              0
UNIT_BASE_CD_EXT                 0
SUBCAT_CD_EXT                    0
CAT_CD_EXT                       0
BIZ_UNIT_CD_EXT                  0
DEPARTMENT_CD_EXT                0
PRICE_RANGE                      0
dtype: int64

# Feature Engineering & ML Dataset Development

In [46]:
# convert the 'TIME_KEY' column to datetime format
df['TIME_KEY'] = pd.to_datetime(df['TIME_KEY'], format='%Y%m%d')

# create new columns for the day, week, day of the week, month, quarter, and year
#df['DAY'] = df['TIME_KEY'].dt.day
#df['WEEK'] = df['TIME_KEY'].dt.week
#df['DOW'] = df['TIME_KEY'].dt.dayofweek
df['MONTH'] = df['TIME_KEY'].dt.month
df['QUARTER'] = df['TIME_KEY'].dt.quarter
df['SEMESTER'] = df['MONTH'].apply(semester)
df['YEAR'] = df['TIME_KEY'].dt.year

In [47]:
# Filter the dataframe by the customer's frequency score
df = filter_customers(df)
# Sort the dataframe (later suffled again)
df = df.sort_values(['TIME_KEY','TRANSACTION_ID_MASK','YEAR','MONTH'])

In [48]:
# get all unique values of customer_id, category_id, month and year
customer_ids = df['CUSTOMER_ACCOUNT_NR_MASK'].unique()
category_ids = df['SUBCAT_CD_EXT'].unique()
months = df['MONTH'].unique()
years = df['YEAR'].unique()

# create a new dataframe with all possible combinations of customer_id and category_id
ml_dataset = pd.DataFrame(list(product(customer_ids, category_ids, months, years)), 
                                    columns=['CUSTOMER_ACCOUNT_NR_MASK','SUBCAT_CD_EXT','MONTH','YEAR'])

# add the quarter and semester columns based on the month value
quarter_map = {1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3, 10: 4, 11: 4, 12: 4}
semester_map = {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2}

ml_dataset['QUARTER'] = ml_dataset['MONTH'].map(quarter_map)
ml_dataset['SEMESTER'] = ml_dataset['MONTH'].map(semester_map)

In [9]:
# create a random sample of 1M rows to test the DM pipeline
#ml_dataset = ml_dataset.sample(n=1000000)
ml_dataset = ml_dataset.sort_values(['CUSTOMER_ACCOUNT_NR_MASK','SUBCAT_CD_EXT','YEAR','MONTH'])

In [10]:
# TOTAL NUMBER OF ORDERS

df['CUST_NUM_TRANSACTIONS_MONTH'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'MONTH'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_TRANSACTIONS_MONTH'] = df['CUST_NUM_TRANSACTIONS_MONTH'].astype(int)

df['CUST_NUM_TRANSACTIONS_QUARTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'QUARTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_TRANSACTIONS_QUARTER'] = df['CUST_NUM_TRANSACTIONS_QUARTER'].astype(int)

df['CUST_NUM_TRANSACTIONS_SEMESTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'SEMESTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_TRANSACTIONS_SEMESTER'] = df['CUST_NUM_TRANSACTIONS_SEMESTER'].astype(int)

df['CUST_NUM_TRANSACTIONS_YEAR'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_TRANSACTIONS_YEAR'] = df['CUST_NUM_TRANSACTIONS_YEAR'].astype(int)

ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_NUM_TRANSACTIONS', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_NUM_TRANSACTIONS', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_NUM_TRANSACTIONS', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_NUM_TRANSACTIONS', 'CUST', False)

In [11]:
# UNIQUE NUMBER OF SUBCATEGORIES BOUGHT

df['CUST_NUM_UNIQUE_SUBCAT_MONTH'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'MONTH'])['SUBCAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_SUBCAT_MONTH'] = df['CUST_NUM_UNIQUE_SUBCAT_MONTH'].astype(int)

df['CUST_NUM_UNIQUE_SUBCAT_QUARTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'QUARTER'])['SUBCAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_SUBCAT_QUARTER'] = df['CUST_NUM_UNIQUE_SUBCAT_QUARTER'].astype(int)

df['CUST_NUM_UNIQUE_SUBCAT_SEMESTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'SEMESTER'])['SUBCAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_SUBCAT_SEMESTER'] = df['CUST_NUM_UNIQUE_SUBCAT_SEMESTER'].astype(int)

df['CUST_NUM_UNIQUE_SUBCAT_YEAR'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'])['SUBCAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_SUBCAT_YEAR'] = df['CUST_NUM_UNIQUE_SUBCAT_YEAR'].astype(int)


ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_NUM_UNIQUE_SUBCAT', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_NUM_UNIQUE_SUBCAT', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_NUM_UNIQUE_SUBCAT', 'CUST', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_NUM_UNIQUE_SUBCAT', 'CUST', False)

In [12]:
# AVERAGE DAYS SINCE LAST CUSTOMER'S TRANSACTION

df = calculate_rolling_avg(df, 'MONTH', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST')
df = calculate_rolling_avg(df, 'QUARTER', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST')
df = calculate_rolling_avg(df, 'SEMESTER', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST')
df = calculate_rolling_avg(df, 'YEAR', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST')

ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST', True)

In [13]:
''' ############ REVIEW LATER ############
# create an empty column for QTY in the new dataframe
ml_dataset['QTY'] = 0

# group the original dataframe by CUSTOMER_ACCOUNT_NR_MASK, SUBCAT_CD_EXT, and MONTH and calculate the sum of QTY
grouped_df = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'MONTH', 'YEAR'])['QTY'].sum().reset_index()

ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_NUM_UNIQUE_SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_NUM_UNIQUE_SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_NUM_UNIQUE_SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_NUM_UNIQUE_SUBCAT', False)
'''

" ############ REVIEW LATER ############\n# create an empty column for QTY in the new dataframe\nml_dataset['QTY'] = 0\n\n# group the original dataframe by CUSTOMER_ACCOUNT_NR_MASK, SUBCAT_CD_EXT, and MONTH and calculate the sum of QTY\ngrouped_df = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'MONTH', 'YEAR'])['QTY'].sum().reset_index()\n\nml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_NUM_UNIQUE_SUBCAT', False)\nml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_NUM_UNIQUE_SUBCAT', False)\nml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_NUM_UNIQUE_SUBCAT', False)\nml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_NUM_UNIQUE_SUBCAT', False)\n"

In [14]:
# AVERAGE BASKET SIZE

df['CUST_NUM_SUBCAT_MONTH'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'MONTH'])['SUBCAT_CD_EXT'].transform(count)
df['CUST_NUM_SUBCAT_MONTH'] = df['CUST_NUM_SUBCAT_MONTH']
df['CUST_AVG_BASKET_SIZE_MONTH'] = df['CUST_NUM_SUBCAT_MONTH'] / df['CUST_NUM_TRANSACTIONS_MONTH']

df['CUST_NUM_SUBCAT_QUARTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'QUARTER'])['SUBCAT_CD_EXT'].transform(count)
df['CUST_NUM_SUBCAT_QUARTER'] = df['CUST_NUM_SUBCAT_QUARTER']
df['CUST_AVG_BASKET_SIZE_QUARTER'] = df['CUST_NUM_SUBCAT_QUARTER'] / df['CUST_NUM_TRANSACTIONS_QUARTER']

df['CUST_NUM_SUBCAT_SEMESTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR', 'SEMESTER'])['SUBCAT_CD_EXT'].transform(count)
df['CUST_NUM_SUBCAT_SEMESTER'] = df['CUST_NUM_SUBCAT_SEMESTER']
df['CUST_AVG_BASKET_SIZE_SEMESTER'] = df['CUST_NUM_SUBCAT_SEMESTER'] / df['CUST_NUM_TRANSACTIONS_SEMESTER']

df['CUST_NUM_SUBCAT_YEAR'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'])['SUBCAT_CD_EXT'].transform(count)
df['CUST_NUM_SUBCAT_YEAR'] = df['CUST_NUM_SUBCAT_YEAR']
df['CUST_AVG_BASKET_SIZE_YEAR'] = df['CUST_NUM_SUBCAT_YEAR'] / df['CUST_NUM_TRANSACTIONS_YEAR']


ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUST_AVG_BASKET_SIZE', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUST_AVG_BASKET_SIZE', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUST_AVG_BASKET_SIZE', 'CUST', True)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUST_AVG_BASKET_SIZE', 'CUST', True)

Subcategory features

In [15]:
# TOTAL NUMBER OF ORDERS

df['SUBCAT_NUM_TRANSACTIONS_MONTH'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'MONTH'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['SUBCAT_NUM_TRANSACTIONS_MONTH'] = df['SUBCAT_NUM_TRANSACTIONS_MONTH'].astype(int)

df['SUBCAT_NUM_TRANSACTIONS_QUARTER'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'QUARTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['SUBCAT_NUM_TRANSACTIONS_QUARTER'] = df['SUBCAT_NUM_TRANSACTIONS_QUARTER'].astype(int)

df['SUBCAT_NUM_TRANSACTIONS_SEMESTER'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'SEMESTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['SUBCAT_NUM_TRANSACTIONS_SEMESTER'] = df['SUBCAT_NUM_TRANSACTIONS_SEMESTER'].astype(int)

df['SUBCAT_NUM_TRANSACTIONS_YEAR'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['SUBCAT_NUM_TRANSACTIONS_YEAR'] = df['SUBCAT_NUM_TRANSACTIONS_YEAR'].astype(int)


ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'SUBCAT_NUM_TRANSACTIONS', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'SUBCAT_NUM_TRANSACTIONS', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'SUBCAT_NUM_TRANSACTIONS', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'SUBCAT_NUM_TRANSACTIONS', 'SUBCAT', False)

In [16]:
# UNIQUE NUMBER OF CUSTOMERS WHO BOUGHT FROM A SUBCATEGORY

df['SUBCAT_NUM_UNIQUE_CUST_MONTH'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'MONTH'])['CUSTOMER_ACCOUNT_NR_MASK'].transform(count_unique)
df['SUBCAT_NUM_UNIQUE_CUST_MONTH'] = df['SUBCAT_NUM_UNIQUE_CUST_MONTH'].astype(int)

df['SUBCAT_NUM_UNIQUE_CUST_QUARTER'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'QUARTER'])['CUSTOMER_ACCOUNT_NR_MASK'].transform(count_unique)
df['SUBCAT_NUM_UNIQUE_CUST_QUARTER'] = df['SUBCAT_NUM_UNIQUE_CUST_QUARTER'].astype(int)

df['SUBCAT_NUM_UNIQUE_CUST_SEMESTER'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR', 'SEMESTER'])['CUSTOMER_ACCOUNT_NR_MASK'].transform(count_unique)
df['SUBCAT_NUM_UNIQUE_CUST_SEMESTER'] = df['SUBCAT_NUM_UNIQUE_CUST_SEMESTER'].astype(int)

df['SUBCAT_NUM_UNIQUE_CUST_YEAR'] = df.groupby(['SUBCAT_CD_EXT', 'YEAR'])['CUSTOMER_ACCOUNT_NR_MASK'].transform(count_unique)
df['SUBCAT_NUM_UNIQUE_CUST_YEAR'] = df['SUBCAT_NUM_UNIQUE_CUST_YEAR'].astype(int)


ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'SUBCAT_NUM_UNIQUE_CUST', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'SUBCAT_NUM_UNIQUE_CUST', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'SUBCAT_NUM_UNIQUE_CUST', 'SUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'SUBCAT_NUM_UNIQUE_CUST', 'SUBCAT', False)

Customer-Subcategory features

In [17]:
# TOTAL NUMBER OF ORDERS FOR A SUBCATEGORY BY A SPECIFIC CUSTOMER

df['CUSTSUBCAT_NUM_TRANSACTIONS_MONTH'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR', 'MONTH'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUSTSUBCAT_NUM_TRANSACTIONS_MONTH'] = df['CUSTSUBCAT_NUM_TRANSACTIONS_MONTH'].astype(int)

df['CUSTSUBCAT_NUM_TRANSACTIONS_QUARTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR', 'QUARTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUSTSUBCAT_NUM_TRANSACTIONS_QUARTER'] = df['CUSTSUBCAT_NUM_TRANSACTIONS_QUARTER'].astype(int)

df['CUSTSUBCAT_NUM_TRANSACTIONS_SEMESTER'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR', 'SEMESTER'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUSTSUBCAT_NUM_TRANSACTIONS_SEMESTER'] = df['CUSTSUBCAT_NUM_TRANSACTIONS_SEMESTER'].astype(int)

df['CUSTSUBCAT_NUM_TRANSACTIONS_YEAR'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR'])['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUSTSUBCAT_NUM_TRANSACTIONS_YEAR'] = df['CUSTSUBCAT_NUM_TRANSACTIONS_YEAR'].astype(int)


ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUSTSUBCAT_NUM_TRANSACTIONS', 'CUSTSUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUSTSUBCAT_NUM_TRANSACTIONS', 'CUSTSUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUSTSUBCAT_NUM_TRANSACTIONS', 'CUSTSUBCAT', False)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUSTSUBCAT_NUM_TRANSACTIONS', 'CUSTSUBCAT', False)

In [18]:
# AVERAGE DAYS SINCE LAST CUSTOMER'S TRANSACTION

df = calculate_rolling_avg(df, 'MONTH', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT')
df = calculate_rolling_avg(df, 'QUARTER', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT')
df = calculate_rolling_avg(df, 'SEMESTER', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT')
df = calculate_rolling_avg(df, 'YEAR', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT')

ml_dataset = create_aggregations(df, ml_dataset, 'MONTH', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT', True)
ml_dataset = create_aggregations(df, ml_dataset, 'QUARTER', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT', True)
ml_dataset = create_aggregations(df, ml_dataset, 'SEMESTER', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT', True)
ml_dataset = create_aggregations(df, ml_dataset, 'YEAR', 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT', True)

Other features

In [19]:
query = client.query("""
   SELECT *
   FROM tables_raw.dim_customer
   """)

customer = query.result().to_dataframe() # Wait for the job to complete.

In [20]:
# create a dictionary that maps CUSTOMER_ACCOUNT_NR_MASK values to their corresponding gender values
customer_dict = dict(zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['GENDER']))

# map the gender values to the CUSTOMER_ACCOUNT_NR_MASK column in result_df using the customer_dict mapping
ml_dataset['GENDER'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(customer_dict)

In [21]:
# create a dictionary that maps CUSTOMER_ACCOUNT_NR_MASK values to their corresponding gender values
family_dict = dict(zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['FAMILY_MEMBERS']))

# map the gender values to the CUSTOMER_ACCOUNT_NR_MASK column in result_df using the customer_dict mapping
ml_dataset['FAMILY_MEMBERS'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(family_dict)

In [22]:
# create a dictionary that maps CUSTOMER_ACCOUNT_NR_MASK values to their corresponding gender values
lifestyle_dict = dict(zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['seg_lifestyle_cd']))

# map the gender values to the CUSTOMER_ACCOUNT_NR_MASK column in result_df using the customer_dict mapping
ml_dataset['SEG_LIFESTYLE_CD'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(lifestyle_dict)

In [23]:
# create a dictionary that maps CUSTOMER_ACCOUNT_NR_MASK values to their corresponding gender values
lifestage_dict = dict(zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['seg_lifestage_cd']))

# map the gender values to the CUSTOMER_ACCOUNT_NR_MASK column in result_df using the customer_dict mapping
ml_dataset['SEG_LIFESTAGE_CD'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(lifestage_dict)

In [24]:
query = client.query("""
   SELECT SUBCAT_CD_EXT, CAT_CD_EXT, PRICE_RANGE
   FROM tables_raw.dim_product
   """)

products = query.result().to_dataframe() # Wait for the job to complete.

In [25]:
# create a dictionary that maps CUSTOMER_ACCOUNT_NR_MASK values to their corresponding gender values
lifestage_dict = dict(zip(products['SUBCAT_CD_EXT'], products['CAT_CD_EXT']))

# map the gender values to the CUSTOMER_ACCOUNT_NR_MASK column in result_df using the customer_dict mapping
ml_dataset['CAT_CD_EXT'] = ml_dataset['SUBCAT_CD_EXT'].map(lifestage_dict)

Target features

In [26]:
ml_dataset = ml_dataset.sort_values(['CUSTOMER_ACCOUNT_NR_MASK','SUBCAT_CD_EXT','YEAR','MONTH'])
ml_dataset = compute_target(ml_dataset)

# Load dataset into BigQuery

In [28]:
# Shuffle the dataframe
ml_dataset = ml_dataset.sample(frac=1).reset_index(drop=True)

MemoryError: Unable to allocate 1.43 GiB for an array with shape (9, 21278592) and data type int64

In [31]:
ml_dataset

Unnamed: 0,CUSTOMER_ACCOUNT_NR_MASK,SUBCAT_CD_EXT,MONTH,YEAR,QUARTER,SEMESTER,CUST_NUM_TRANSACTIONS_MONTH,CUST_NUM_TRANSACTIONS_QUARTER,CUST_NUM_TRANSACTIONS_SEMESTER,CUST_NUM_TRANSACTIONS_YEAR,CUST_NUM_UNIQUE_SUBCAT_MONTH,CUST_NUM_UNIQUE_SUBCAT_QUARTER,CUST_NUM_UNIQUE_SUBCAT_SEMESTER,CUST_NUM_UNIQUE_SUBCAT_YEAR,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_QUARTER,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_SEMESTER,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_YEAR,CUST_AVG_BASKET_SIZE_MONTH,CUST_AVG_BASKET_SIZE_QUARTER,CUST_AVG_BASKET_SIZE_SEMESTER,CUST_AVG_BASKET_SIZE_YEAR,SUBCAT_NUM_TRANSACTIONS_MONTH,SUBCAT_NUM_TRANSACTIONS_QUARTER,SUBCAT_NUM_TRANSACTIONS_SEMESTER,SUBCAT_NUM_TRANSACTIONS_YEAR,SUBCAT_NUM_UNIQUE_CUST_MONTH,SUBCAT_NUM_UNIQUE_CUST_QUARTER,SUBCAT_NUM_UNIQUE_CUST_SEMESTER,SUBCAT_NUM_UNIQUE_CUST_YEAR,CUSTSUBCAT_NUM_TRANSACTIONS_MONTH,CUSTSUBCAT_NUM_TRANSACTIONS_QUARTER,CUSTSUBCAT_NUM_TRANSACTIONS_SEMESTER,CUSTSUBCAT_NUM_TRANSACTIONS_YEAR,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_QUARTER,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_SEMESTER,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_YEAR,GENDER,FAMILY_MEMBERS,SEG_LIFESTYLE_CD,SEG_LIFESTAGE_CD,CAT_CD_EXT,TARGET
0,36467,10101,1,2021,1,1,1,1,5,12,1,1,15,33,0.00,0.00,35.50,28.55,1.00,1.00,4.20,6.33,1614,4775,9569,20584,1092,1967,2851,3996,0,0,1,3,0.00,0.00,0.00,106.00,M,5,2,4,101,0
1,36467,10101,2,2021,1,1,0,1,5,12,0,1,15,33,0.00,0.00,35.50,28.55,0.00,1.00,4.20,6.33,1543,4775,9569,20584,1015,1967,2851,3996,0,0,1,3,0.00,0.00,0.00,106.00,M,5,2,4,101,0
2,36467,10101,3,2021,1,1,0,1,5,12,0,1,15,33,0.00,0.00,35.50,28.55,0.00,1.00,4.20,6.33,1618,4775,9569,20584,1103,1967,2851,3996,0,0,1,3,0.00,0.00,0.00,106.00,M,5,2,4,101,0
3,36467,10101,4,2021,2,1,0,4,5,12,0,14,15,33,0.00,16.33,35.50,28.55,0.00,5.00,4.20,6.33,1479,4794,9569,20584,1083,2220,2851,3996,0,1,1,3,0.00,0.00,0.00,106.00,M,5,2,4,101,1
4,36467,10101,5,2021,2,1,2,4,5,12,8,14,15,33,20.00,16.33,35.50,28.55,6.50,5.00,4.20,6.33,1557,4794,9569,20584,1122,2220,2851,3996,1,1,1,3,0.00,0.00,0.00,106.00,M,5,2,4,101,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21278587,133624448,170313,8,2022,3,2,0,0,1,1,0,0,2,2,0.00,0.00,0.00,0.00,0.00,0.00,2.00,2.00,56,134,374,596,49,111,287,421,0,0,0,0,0.00,0.00,0.00,0.00,F,,4,6,1703,0
21278588,133624448,170313,9,2022,3,2,0,0,1,1,0,0,2,2,0.00,0.00,0.00,0.00,0.00,0.00,2.00,2.00,36,134,374,596,33,111,287,421,0,0,0,0,0.00,0.00,0.00,0.00,F,,4,6,1703,0
21278589,133624448,170313,10,2022,4,2,0,1,1,1,0,2,2,2,0.00,0.00,0.00,0.00,0.00,2.00,2.00,2.00,47,240,374,596,41,200,287,421,0,0,0,0,0.00,0.00,0.00,0.00,F,,4,6,1703,0
21278590,133624448,170313,11,2022,4,2,0,1,1,1,0,2,2,2,0.00,0.00,0.00,0.00,0.00,2.00,2.00,2.00,36,240,374,596,34,200,287,421,0,0,0,0,0.00,0.00,0.00,0.00,F,,4,6,1703,0


In [35]:
#### SAVE DATAFRAME TO BIGQUERY ####
client.load_table_from_dataframe(ml_dataset, 'tables_staging.df_models').result()

LoadJob<project=continente-lced-feup, location=europe-southwest1, id=dca1c420-968d-41cf-ad6d-284d50cfa12e>

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from tabulate import tabulate
from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn.linear_model import Perceptron
from timeit import timeit
from sklearn import datasets, tree
import datetime
import os
import warnings
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
import pylab 
sns.set(style="ticks", color_codes=True, font_scale=1.5)
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.colors import ListedColormap
%matplotlib inline
import mpl_toolkits
from mpl_toolkits.mplot3d import Axes3D
#from graphviz import Source
from IPython.display import Image
from scipy.stats import skew, norm, probplot, boxcox, f_oneway
from scipy import interp
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
from sklearn import metrics, tree
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler, PolynomialFeatures, MinMaxScaler
##from imblearn.over_sampling import SMOTE
#from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, cross_val_predict, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.linear_model import LogisticRegression
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.tree import DecisionTreeClassifier
#import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
#from mlxtend.classifier import StackingClassifier
from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
import scipy
from sklearn.model_selection import GridSearchCV
from google.cloud import bigquery
from sklearn.model_selection import TimeSeriesSplit

In [53]:
import warnings
from sklearn.exceptions import ConvergenceWarning
def RF(X_train, y_train, X_test, y_test):


    with warnings.catch_warnings():
        warnings.filterwarnings("ignore",
                                category=UserWarning)
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        RF = RandomForestClassifier()
        
        # LR = Pipeline([
        #         #('sel', select_fetaures(select_cols=list(shadow))),
        #       #  ('scl', StandardScaler()),
        #         #('lda', LDA(store_covariance=True)),
        #         ('gs', gs)
        #  ]) 

        grid_values = {'clf__criterion': ['gini', 'entropy']
                       ,'clf__n_estimators':  [1000]       
                       ,'clf__min_samples_split': [117]
                       ,'clf__max_depth': [10]}

        gs = GridSearchCV(estimator=RF, param_grid=grid_values, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
        
        gs.fit(X_train,y_train)

        best_RF = RandomForestClassifier(**gs.best_params_)
        best_RF.fit(X_train, y_train)



        results = get_results(best_RF, gs ,'Random Forest', X_test, y_test, reasume=True)
        print(results)

In [54]:
#### CrossValidation do Ian
def cv_12months(df, model):
    timestamps = df['fulldate'].sort_values().unique()
    #display(timestamps.size)

    # find the minimum timestamp
    min_timestamp = timestamps[0]

    #store each of the metrics in an array for each one
    accuracy_array = np.array([])
    precision_array = np.array([])
    recall_array = np.array([])
    f1_array = np.array([])

    # iterate over the timestamps and check if the time difference is less than 12 months
    for ts in timestamps[:-3]:
        if ts + 13 > timestamps.max():
            print(f'Accuracy Mean of all iterations : {np.mean(accuracy_array)}')
            print(f'Precision Mean of all iterations : {np.mean(precision_array)}')
            print(f'Recall Mean of all iterations: {np.mean(recall_array)}')
            print(f'F1 mean of all iterations: {np.mean(f1_array)}')
            break
        else:
            print(f'Start Iteration {ts}\n')
            if (ts - min_timestamp).n < 11:
                print(f'Treino feito com os meses: {ts} até {ts+11}')       
                train = df[df['fulldate'].isin([ts, ts+1, ts+2, ts+3, ts+4, ts+5, ts+6, ts+7, ts+8, ts+9, ts+10, ts+11])]
                X_train_cv = train.drop(['TARGET','fulldate'], axis=1)
                

                y_train_cv = train['TARGET'].astype(int)
                
            else:
                print(f'Treino feito com os meses: {ts} até {ts+11}')
                
                train = df[df['fulldate'].isin([ts, ts+1, ts+2, ts+3, ts+4, ts+5, ts+6, ts+7, ts+8, ts+9, ts+10, ts+11])]

                print(train.shape)
            test = df[df['fulldate'] == ts+12]
            X_test_cv = test.drop(['TARGET', 'fulldate'], axis=1)
            y_test_cv = test['TARGET'].astype(int)
            print(f'Teste feito com o mês: {ts+12}')
            print(X_train_cv.info())
            print(y_train_cv.info())
            

            print(f'\nEnd Iteration {ts}\n')
            #print y_train_cv data type
            
            ### The Get_results go here.
            if model == 'RF':
                RF(X_train_cv, y_train_cv, X_test_cv, y_test_cv)
                #append accuracy variable to the accuracy array
                accuracy_array = np.append(accuracy_array, accuracy)
                #append precision variable to the precision array
                precision_array = np.append(precision_array, precision)
                #append recall variable to the recall array
                recall_array = np.append(recall_array, recall)
                #append f1 variable to the f1 array
                f1_array = np.append(f1_array, f1)

In [55]:
def get_results(model, gs ,name, data, true_labels, target_names = ['No buy', 'Buy'], results=None, reasume=False):

    if hasattr(model, 'layers'):
        param = wtp_dnn_model.history.params
        best = np.mean(history.history['val_accuracy'])
        predicted_labels = np.argmax(model.predict(data) , axis=-1)
        im_model = InMemoryModel(model.predict, examples=data, target_names=target_names)

    else:
        param = gs.best_params_
        best = gs.best_score_
        predicted_labels = model.predict(data).ravel()
        if hasattr(model, 'predict_proba'):
            im_model = InMemoryModel(model.predict_proba, examples=data, target_names=target_names)
        elif hasattr(clf, 'decision_function'):
            im_model = InMemoryModel(model.decision_function, examples=data, target_names=target_names)
        else: 
            print('Cannot use InMemoryModel as predict_proba is not available')
           
        
    print('Mean Best Accuracy: {:2.2%}'.format(best))
    print('-'*60)
    print('Best Parameters:')
    print(param)
    print('-'*60)
    
    y_pred = model.predict(data).ravel()
    
    display_model_performance_metrics(true_labels, predicted_labels = predicted_labels, target_names = target_names)
    if len(target_names)==2:
        ras = roc_auc_score(y_true=true_labels, y_score=y_pred)
    else:
        roc_auc_multiclass, ras = roc_auc_score_multiclass(y_true=true_labels, y_score=y_pred, target_names=target_names)
        print('\nROC AUC Score by Classes:\n',roc_auc_multiclass)
        print('-'*60)

    print('\n\n              ROC AUC Score: {:2.2%}'.format(ras))
    prob, score_roc, roc_auc = plot_model_roc_curve(model, data, true_labels, label_encoder=None, class_names=target_names)
    
    #interpreter = Interpretation(data, feature_names=cols)
    #plots = interpreter.feature_importance.plot_feature_importance(im_model, progressbar=False, n_jobs=1, ascending=True)
    
    r1 = pd.DataFrame([(prob, best, np.round(accuracy_score(true_labels, predicted_labels), 4), 
                         ras, roc_auc)], index = [name],
                         columns = ['Prob', 'CV Accuracy', 'Accuracy', 'ROC AUC Score', 'ROC Area'])
    if reasume:
        results = r1
    elif (name in results.index):        
        results.loc[[name], :] = r1
    else: 
        results = results.append(r1)
        
    return results


def roc_auc_score_multiclass(y_true, y_score, target_names, average = "macro"):

  #creating a set of all the unique classes using the actual class list
  unique_class = set(y_true)
  roc_auc_dict = {}
  mean_roc_auc = 0
  for per_class in unique_class:
    #creating a list of all the classes except the current class 
    other_class = [x for x in unique_class if x != per_class]

    #marking the current class as 1 and all other classes as 0
    new_y_true = [0 if x in other_class else 1 for x in y_true]
    new_y_score = [0 if x in other_class else 1 for x in y_score]
    num_new_y_true = sum(new_y_true)

    #using the sklearn metrics method to calculate the roc_auc_score
    roc_auc = roc_auc_score(new_y_true, new_y_score, average = average)
    roc_auc_dict[target_names[per_class]] = np.round(roc_auc, 4)
    mean_roc_auc += num_new_y_true * np.round(roc_auc, 4)
    
  mean_roc_auc = mean_roc_auc/len(y_true)  
  return roc_auc_dict, mean_roc_auc

def get_metrics(true_labels, predicted_labels):
    global accuracy
    global precision
    global recall
    global f1
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    precision = metrics.precision_score(true_labels, predicted_labels, average='weighted')
    recall = metrics.recall_score(true_labels, predicted_labels, average='weighted')
    f1 = metrics.f1_score(true_labels, predicted_labels, average='weighted')
    
    print('Accuracy:  {:2.2%} '.format(accuracy))
    print('Precision: {:2.2%} '.format(precision))
    print('Recall:    {:2.2%} '.format(recall))
    print('F1 Score:  {:2.2%} '.format(f1))
    # #append results to arrays
    # np.append(accuracy_array, metrics.accuracy_score(true_labels, predicted_labels))
    # np.append(precision_array, metrics.precision_score(true_labels, predicted_labels, average='weighted'))
    # np.append(recall_array, metrics.recall_score(true_labels, predicted_labels, average='weighted'))
    # np.append(f1_array, metrics.f1_score(true_labels, predicted_labels, average='weighted'))
    
                        

def train_predict_model(classifier,  train_features, train_labels,  test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    return predictions    


def display_confusion_matrix(true_labels, predicted_labels, target_names):
    
    total_classes = len(target_names)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels)
    cm_frame = pd.DataFrame(data=cm, 
                            columns=pd.MultiIndex(levels=[['Predicted:'], target_names], codes=level_labels), 
                            index=pd.MultiIndex(levels=[['Actual:'], target_names], codes=level_labels)) 
    print(cm_frame) 
    
def display_classification_report(true_labels, predicted_labels, target_names):

    report = metrics.classification_report(y_true=true_labels, y_pred=predicted_labels, target_names=target_names) 
    print(report)
    
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)

def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
    
    ## Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    if hasattr(clf, 'classes_'):
        class_labels = clf.classes_
    elif label_encoder:
        class_labels = label_encoder.classes_
    elif class_names:
        class_labels = class_names
    else:
        raise ValueError('Unable to derive prediction classes, please specify class_names!')
    n_classes = len(class_labels)
   
    if n_classes == 2:
        if hasattr(clf, 'predict_proba'):
            prb = clf.predict_proba(features)
            if prb.shape[1] > 1:
                y_score = prb[:, prb.shape[1]-1] 
            else:
                y_score = clf.predict(features).ravel()
            prob = True
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
            prob = False
        else:
            print("\n")
            #raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
        
        fpr, tpr, _ = roc_curve(true_labels, y_score)      
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, label='ROC curve (area = {0:3.2%})'.format(roc_auc), linewidth=2.5)
        
    elif n_classes > 2:
        if  hasattr(clf, 'clfs_'):
            y_labels = label_binarize(true_labels, classes=list(range(len(class_labels))))
        else:
            y_labels = label_binarize(true_labels, classes=class_labels)
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(features)
            prob = True
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
            prob = False
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
            
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_labels[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        ## Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_labels.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        ## Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        ## Plot ROC curves
        plt.figure(figsize=(12, 6))
        plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:2.2%})'
                       ''.format(roc_auc["micro"]), linewidth=3)

        plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:2.2%})'
                       ''.format(roc_auc["macro"]), linewidth=3)
  
        for i, label in enumerate(class_names):
            plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:2.2%})'
                                           ''.format(label, roc_auc[i]), linewidth=2, linestyle=':')
            
        roc_auc = roc_auc["macro"]   
    else:
        raise ValueError('Number of classes should be atleast 2 or more')
        
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([-0.01, 1.0])
    plt.ylim([0.0, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    
    return prob, y_score, roc_auc



class select_fetaures(object):  # BaseEstimator, TransformerMixin,
     def __init__(self, select_cols):
         self.select_cols_ = select_cols

     def fit(self, X, Y):
         pass

     def transform(self, X):
         return X.loc[:, self.select_cols_]

     def fit_transform(self, X, Y):
         self.fit(X, Y)
         df = self.transform(X)
         return df

     def __getitem__(self, x):
         return self.X[x], self.Y[x]

In [56]:
ml_dataset = ml_dataset.drop(columns=['GENDER','FAMILY_MEMBERS'])

In [None]:
df['fulldate'] = pd.to_datetime(df['MONTH'].astype(str) + '-' + df['YEAR'].astype(str))