<a href="https://colab.research.google.com/github/MonalisaFouzdar/Machine-Learning-/blob/master/Modeling_for_data_Cineplex_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modeling Cineplex Concessions

- Stephen W. Thomas
- October 2020
- For: MMA/GMMA/MMAI 869

This Notebook will build a basic ML classifier model. Please feel free to use and modify as you see fit.

To help with engineering features, this Notebook will create a custom class for fitting (training) to the training data and transforming the training and testing data. It's a bit complicated, and you do not have to do it this way.

In [None]:
import pandas as pd
import numpy as np

# Echo the value of every top-level expression in a cell (not only the last one),
# so a cell containing several bare expressions shows a result for each of them.
from IPython.core.interactiveshell import InteractiveShell    
InteractiveShell.ast_node_interactivity = "all" 

# Load in the data 

In [None]:
# Folder holding the cleaned CSV extracts, and the suffix appended to every file name in it.
input_dir, version_str = "clean/", "_v1"

In [None]:
# Read each cleaned table from `<input_dir><table><version_str>.csv`.
# Every table except `visits` is read as ISO-8859-1; `visits` uses the pandas default encoding.
visits = pd.read_csv(f"{input_dir}visits{version_str}.csv")
locations = pd.read_csv(f"{input_dir}locations{version_str}.csv", encoding="ISO-8859-1")
items = pd.read_csv(f"{input_dir}items{version_str}.csv", encoding="ISO-8859-1")
# Note: the file is named `visit_items`, while the variable is `visits_items`.
visits_items = pd.read_csv(f"{input_dir}visit_items{version_str}.csv", encoding="ISO-8859-1")
films = pd.read_csv(f"{input_dir}films{version_str}.csv", encoding="ISO-8859-1")
experience_type = pd.read_csv(f"{input_dir}experience_type{version_str}.csv", encoding="ISO-8859-1")
sales_channels = pd.read_csv(f"{input_dir}sales_channels{version_str}.csv", encoding="ISO-8859-1")
timeslice = pd.read_csv(f"{input_dir}timeslice{version_str}.csv", encoding="ISO-8859-1")

In [None]:
# Sanity check: (rows, columns) of the films lookup table
films.shape

(25942, 16)

In [None]:
# Preview the first raw visits with no row/column truncation and floats shown to one decimal place.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.float_format', '{:.1f}'.format,
):
    display(visits.head())

Unnamed: 0,Visit_ID,CINEPLEX_D_Membership_ID,Visit_Date,Visit_TimeSlice_ID,Location_ID,Auditorium_Experience_ID,Film_ID,Seating_Experience_ID,Sales_Channel_ID,Performance_Experience_ID,Performance_Type_ID,Spend,Revenue,Points_Earned,Points_Redeemed,Discount,Has_US_Is_LrgPopcorn,Has_US_Is_Butter,Has_US_Is_Alcohol,Has_US_Is_Candy,Has_US_Is_Coffee,Has_US_Is_Kiddietray,Has_US_Is_Voucher,Has_US_Is_MenuFood
0,2,23777,2019-01-01,15,7253,1,28757,1,8,1,1,15.0,15.0,100,0,1.7,0,0,0,0,0,0,0,0
1,3,60723,2019-01-01,15,7123,0,0,0,8,0,0,29.6,29.6,297,0,3.3,0,1,0,0,0,0,1,1
2,4,61080,2019-01-01,15,7290,2,24899,2,8,2,1,28.4,28.4,235,0,3.2,1,1,0,0,0,0,0,0
3,5,103538,2019-01-01,14,1142,1,24977,1,2,1,1,0.0,10.0,0,1000,0.0,0,0,0,0,0,0,0,0
4,6,120196,2019-01-01,23,9181,1,29694,1,2,1,1,17.6,17.6,100,0,1.9,0,0,0,0,0,0,0,0


In [None]:
# Size and number of distinct visits before enriching the table
visits.shape
visits['Visit_ID'].nunique()

# Left-join every lookup table onto visits so that no visit row is dropped.
# Merges without an explicit key join on the columns the two frames have in common.
visits = (
    visits
    .merge(films, how="left", on="Film_ID")
    .merge(experience_type, how="left")
    .merge(sales_channels, how="left")
    .merge(timeslice, how="left", left_on="Visit_TimeSlice_ID", right_on="TimeSlice_ID")
    .merge(locations, how="left")
)

# Same two numbers afterwards: the row count and distinct Visit_ID count should be unchanged
visits.shape
visits['Visit_ID'].nunique()

(36258, 24)

36258

(36258, 60)

36258

In [None]:
# Give visits with a missing market language an explicit 'Unknown' category
visits = visits.assign(Market_Language=visits['Market_Language'].fillna('Unknown'))

In [None]:
# Parse the visit date, then derive calendar features from it:
# day-of-week name, week-of-year (as a zero-padded '%U' string), month name and a weekend flag.
visits = visits.assign(
    Visit_Date=lambda frame: pd.to_datetime(frame['Visit_Date']),
    Visit_Date_DOW=lambda frame: frame['Visit_Date'].dt.day_name(),
    Visit_Date_Week=lambda frame: frame['Visit_Date'].dt.strftime('%U'),
    Visit_Date_Month=lambda frame: frame['Visit_Date'].dt.month_name(),
    # weekday is 0..4 for Monday..Friday, so anything else is flagged as weekend
    Visit_Date_IsWeekend=lambda frame: np.where(frame['Visit_Date'].dt.weekday < 5, 0, 1),
)

In [None]:
# The Modeling section splits rows by position with TimeSeriesSplit, so the rows must already be
# in chronological order. Both checks below should display True; if either is False, enable the
# sort on the next line before continuing.
#visits = visits.sort_values('Visit_Date')
visits['Visit_Date'].is_monotonic_increasing
visits['Visit_Date_Week'].is_monotonic_increasing

True

True

In [None]:
# Preview the first enriched visits with no row/column truncation and floats shown to one decimal place.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.float_format', '{:.1f}'.format,
):
    display(visits.head())

Unnamed: 0,Visit_ID,CINEPLEX_D_Membership_ID,Visit_Date,Visit_TimeSlice_ID,Location_ID,Auditorium_Experience_ID,Film_ID,Seating_Experience_ID,Sales_Channel_ID,Performance_Experience_ID,Performance_Type_ID,Spend,Revenue,Points_Earned,Points_Redeemed,Discount,Has_US_Is_LrgPopcorn,Has_US_Is_Butter,Has_US_Is_Alcohol,Has_US_Is_Candy,Has_US_Is_Coffee,Has_US_Is_Kiddietray,Has_US_Is_Voucher,Has_US_Is_MenuFood,Film_Title,Title_ID,Title_Name,Theatrical_Release_Date,Film_Runtime,Language,Market_Language,Release_Pattern,Performance_Type,Is_IMAX,Is_DBOX_Capable,Release_Status,Media_Class_Description,Is_4Dx,Genre_Hierarchy_1,Auditorium_Experience_Description,Is_Premium,Sales_Channel,TimeSlice_ID,TimeSlice_Description,TimeSlice_Start_Time,TimeSlice_End_Time,TimeSlice_Hour_12,TimeSlice_Hour_24,Location_Name,Location_Short_Name,Location_Interactive_Name,Address1,City,Province_CD,Postal_Code,Country_CD,Is_VIP,Is_Outtakes,Is_Licensed_Bar,Is_Licensed_Lounge,Visit_Date_DOW,Visit_Date_Week,Visit_Date_Month,Visit_Date_IsWeekend
0,2,23777,2019-01-01,15,7253,1,28757,1,8,1,1,15.0,15.0,100,0,1.7,0,0,0,0,0,0,0,0,Vice,23956.0,Vice,12/25/2018,133.0,English,English,Wide,Film Presentation,0,0,A,Adult,0.0,Drama,Regular,0,Point of Sale,15,1 - Early Matinee,00:00.0,30:00.0,1:00 PM,13,CPX Eglinton T.C.,Warden&Eglin,Cineplex Odeon Eglinton Town Centre Cinemas,22 Lebovic Avenue,Toronto,ON,M1L 4V9,CA,0,1,1,0,Tuesday,0,January,0
1,3,60723,2019-01-01,15,7123,0,0,0,8,0,0,29.6,29.6,297,0,3.3,0,1,0,0,0,0,1,1,<None>,0.0,<None>,1/1/1900,0.0,<None>,<None>,<None>,<None>,0,0,B,,0.0,Unknown,<Unknown>,0,Point of Sale,15,1 - Early Matinee,00:00.0,30:00.0,1:00 PM,13,CPX Winston Churchill VIP,Winston Churchill,Cineplex Cinemas Winston Churchill & VIP,2081 Winston Park Dr.,Oakville,ON,L6H 6P5,CA,1,1,1,0,Tuesday,0,January,0
2,4,61080,2019-01-01,15,7290,2,24899,2,8,2,1,28.4,28.4,235,0,3.2,1,1,0,0,0,0,0,0,Aquaman,22133.0,Aquaman,12/21/2018,143.0,English,English,Wide,Film Presentation,0,1,A,Adult,0.0,Adaptation,AVX,1,Point of Sale,15,1 - Early Matinee,00:00.0,30:00.0,1:00 PM,13,CPX Hamilton Mountain,CPX Hamilton,Cineplex Cinemas Hamilton Mountain,795 Paramount Drive,Stoney Creek,ON,L8J 0B4,CA,0,1,1,0,Tuesday,0,January,0
3,5,103538,2019-01-01,14,1142,1,24977,1,2,1,1,0.0,10.0,0,1000,0.0,0,0,0,0,0,0,0,0,Mary Poppins Returns,22187.0,Mary Poppins Returns,12/19/2018,131.0,English,English,Wide,Film Presentation,0,1,A,Family,0.0,Comedy,Regular,0,Kiosk,14,1 - Early Matinee,30:00.0,00:00.0,12:00 PM,12,CPX Pitt Meadows,Meadowtown,Cineplex Odeon Meadowtown Cinemas,"#410, 19800 Lougheed Highway",Pitt Meadows,BC,V3Y 2W1,CA,0,1,0,0,Tuesday,0,January,0
4,6,120196,2019-01-01,23,9181,1,29694,1,2,1,1,17.6,17.6,100,0,1.9,0,0,0,0,0,0,0,0,DeuxiÂme acte,23976.0,Second Act,12/21/2018,104.0,French,French,Wide,Film Presentation,0,0,A,Adult,0.0,Comedy,Regular,0,Kiosk,23,2 - Late Matinee,00:00.0,30:00.0,5:00 PM,17,CPX Beauport,Beauport,CinÂma Cineplex Odeon Beauport,825 rue ClÂmenceau,Beauport,QC,G1C 2K6,CA,0,0,0,0,Tuesday,0,January,0


In [None]:
# Preview the last enriched visits with no row/column truncation and floats shown to one decimal place.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.float_format', '{:.1f}'.format,
):
    display(visits.tail())

Unnamed: 0,Visit_ID,CINEPLEX_D_Membership_ID,Visit_Date,Visit_TimeSlice_ID,Location_ID,Auditorium_Experience_ID,Film_ID,Seating_Experience_ID,Sales_Channel_ID,Performance_Experience_ID,Performance_Type_ID,Spend,Revenue,Points_Earned,Points_Redeemed,Discount,Has_US_Is_LrgPopcorn,Has_US_Is_Butter,Has_US_Is_Alcohol,Has_US_Is_Candy,Has_US_Is_Coffee,Has_US_Is_Kiddietray,Has_US_Is_Voucher,Has_US_Is_MenuFood,Film_Title,Title_ID,Title_Name,Theatrical_Release_Date,Film_Runtime,Language,Market_Language,Release_Pattern,Performance_Type,Is_IMAX,Is_DBOX_Capable,Release_Status,Media_Class_Description,Is_4Dx,Genre_Hierarchy_1,Auditorium_Experience_Description,Is_Premium,Sales_Channel,TimeSlice_ID,TimeSlice_Description,TimeSlice_Start_Time,TimeSlice_End_Time,TimeSlice_Hour_12,TimeSlice_Hour_24,Location_Name,Location_Short_Name,Location_Interactive_Name,Address1,City,Province_CD,Postal_Code,Country_CD,Is_VIP,Is_Outtakes,Is_Licensed_Bar,Is_Licensed_Lounge,Visit_Date_DOW,Visit_Date_Week,Visit_Date_Month,Visit_Date_IsWeekend
36253,140987,9738340,2019-12-31,19,7269,1,27417,4,8,2,1,16.9,16.9,241,0,1.9,1,0,0,0,0,0,0,0,Star Wars: The Rise Of Skywalker,23106.0,Star Wars: The Rise Of Skywalker,12/20/2019,142.0,English,English,Wide,Film Presentation,0,1,A,Adult,1.0,Action,Regular,0,Point of Sale,19,2 - Late Matinee,00:00.0,30:00.0,3:00 PM,15,GLX Cambridge,G Cambridge,Galaxy Cinemas Cambridge,355 Hespeler Road,Cambridge,ON,N1R 6B3,CA,0,0,1,0,Tuesday,52,December,0
36254,140990,9765941,2019-12-31,32,1409,1,31976,1,12,1,1,0.0,21.5,0,2500,0.0,0,0,0,0,0,0,0,0,Good Newwz (Hindi w/e.s.t.),34575.0,Good Newwz (Hindi w/e.s.t.),12/27/2019,132.0,Hindi,English,Ltd,Film Presentation,0,0,B,Adult,0.0,Drama,Regular,0,Cineplex Mobile App,32,4 - Late Evening,30:00.0,00:00.0,9:00 PM,21,SC Riverport,SC Riverport,SilverCity Riverport Cinemas,14211 Entertainment Way,Richmond,BC,V6W 1K4,CA,0,1,0,0,Tuesday,52,December,0
36255,141016,10089968,2019-12-31,16,3403,1,24181,4,8,1,1,0.0,15.6,125,0,0.0,0,0,0,0,0,0,0,0,Frozen II,34772.0,Frozen II,11/22/2019,104.0,English,English,Wide,Film Presentation,0,1,A,Family,1.0,Drama,Regular,0,Point of Sale,16,1 - Early Matinee,30:00.0,00:00.0,1:00 PM,13,SBNK Edmonton,Scotia Edmonton,Scotiabank Theatre Edmonton,"#3030, 8882-170 Street",Edmonton,AB,T5T 4M2,CA,0,1,0,0,Tuesday,52,December,0
36256,141021,10147219,2019-12-31,21,7415,1,31521,1,8,1,1,22.4,22.4,171,0,2.5,0,0,0,0,0,0,0,1,Bombshell,34187.0,Bombshell,12/20/2019,109.0,English,English,Wide,Film Presentation,0,0,A,Adult,0.0,Drama,Regular,0,Point of Sale,21,2 - Late Matinee,00:00.0,30:00.0,4:00 PM,16,CPX Ancaster,CPX Ancaster,Cineplex Cinemas Ancaster,771 Golf Links Road,Ancaster,ON,L9G 3K9,CA,0,1,1,0,Tuesday,52,December,0
36257,141030,10260700,2019-12-31,31,9109,1,31897,1,8,1,1,0.0,32.2,0,3750,0.0,0,0,0,0,0,0,0,0,Dabangg 3 (Hindi w/e.s.t.),34490.0,Dabangg 3 (Hindi w/e.s.t.),12/20/2019,160.0,Hindi,English,Ltd,Film Presentation,0,0,B,Adult,0.0,Comedy,Regular,0,Point of Sale,31,4 - Late Evening,00:00.0,30:00.0,9:00 PM,21,CPX Montreal Forum,Montreal Forum,CinÂma Cineplex Forum,2313 St. Catherine St. West Suite 101,MontrÂal,QC,H3H 1N2,CA,0,0,0,0,Tuesday,52,December,0


In [None]:
# List every column of the enriched visits table with its non-null count and dtype
visits.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36258 entries, 0 to 36257
Data columns (total 64 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Visit_ID                           36258 non-null  int64         
 1   CINEPLEX_D_Membership_ID           36258 non-null  int64         
 2   Visit_Date                         36258 non-null  datetime64[ns]
 3   Visit_TimeSlice_ID                 36258 non-null  int64         
 4   Location_ID                        36258 non-null  int64         
 5   Auditorium_Experience_ID           36258 non-null  int64         
 6   Film_ID                            36258 non-null  int64         
 7   Seating_Experience_ID              36258 non-null  int64         
 8   Sales_Channel_ID                   36258 non-null  int64         
 9   Performance_Experience_ID          36258 non-null  int64         
 10  Performance_Type_ID               

# Create Training Data

In [None]:
# Build the modeling frame `df` from a copy of `visits`, keeping only columns that are
# plausible predictors. The columns listed below are dropped because they are either:
# - lookup IDs, which are generally not helpful for ML models
# - the raw date itself
# - facts about the outcome of this very visit (spend, revenue, points, discount)
# - free-text names, addresses and postal codes, which are unlikely to be predictive
remove_cols = [
    # lookup identifiers
    'Title_ID', 'Location_ID', 'Auditorium_Experience_ID', 'Sales_Channel_ID',
    # raw date
    'Visit_Date',
    # what was spent / earned on this visit
    'Spend', 'Revenue', 'Points_Earned', 'Points_Redeemed', 'Discount',
    # time-slice descriptors
    'TimeSlice_Description', 'TimeSlice_Start_Time', 'TimeSlice_End_Time', 'TimeSlice_Hour_12', 'TimeSlice_Hour_24',
    # film descriptors
    'Film_ID', 'Film_Title', 'Title_Name', 'Theatrical_Release_Date', 'Release_Status', 'Release_Pattern',
    # location names
    'Location_Name', 'Location_Short_Name', 'Location_Interactive_Name',
    # address details
    'Address1', 'City', 'Postal_Code',
]

# Every "Has_US..." column is a prediction target, so none of them may remain as a feature.
target_cols = [col for col in visits.columns if "Has_US" in col]

df = visits.copy().drop(columns=remove_cols).drop(columns=target_cols)

In [None]:
# List the columns that remain in the modeling frame, with non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36258 entries, 0 to 36257
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Visit_ID                           36258 non-null  int64  
 1   CINEPLEX_D_Membership_ID           36258 non-null  int64  
 2   Visit_TimeSlice_ID                 36258 non-null  int64  
 3   Seating_Experience_ID              36258 non-null  int64  
 4   Performance_Experience_ID          36258 non-null  int64  
 5   Performance_Type_ID                36258 non-null  int64  
 6   Film_Runtime                       36258 non-null  float64
 7   Language                           36258 non-null  object 
 8   Market_Language                    36258 non-null  object 
 9   Performance_Type                   36258 non-null  object 
 10  Is_IMAX                            36258 non-null  object 
 11  Is_DBOX_Capable                    36258 non-null  obj

In [None]:
# Preview the first rows of the modeling frame with no row/column truncation and one-decimal floats.
# display() is called explicitly, as in the other preview cells: a bare `df.head()` nested inside the
# `with` block only renders because InteractiveShell.ast_node_interactivity is set to "all", and
# would silently show nothing under IPython's default setting.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.float_format', lambda x: '%.1f' % x):
    display(df.head())

Unnamed: 0,Visit_ID,CINEPLEX_D_Membership_ID,Visit_TimeSlice_ID,Seating_Experience_ID,Performance_Experience_ID,Performance_Type_ID,Film_Runtime,Language,Market_Language,Performance_Type,Is_IMAX,Is_DBOX_Capable,Media_Class_Description,Is_4Dx,Genre_Hierarchy_1,Auditorium_Experience_Description,Is_Premium,Sales_Channel,TimeSlice_ID,Province_CD,Country_CD,Is_VIP,Is_Outtakes,Is_Licensed_Bar,Is_Licensed_Lounge,Visit_Date_DOW,Visit_Date_Week,Visit_Date_Month,Visit_Date_IsWeekend
0,2,23777,15,1,1,1,133.0,English,English,Film Presentation,0,0,Adult,0.0,Drama,Regular,0,Point of Sale,15,ON,CA,0,1,1,0,Tuesday,0,January,0
1,3,60723,15,0,0,0,0.0,<None>,<None>,<None>,0,0,,0.0,Unknown,<Unknown>,0,Point of Sale,15,ON,CA,1,1,1,0,Tuesday,0,January,0
2,4,61080,15,2,2,1,143.0,English,English,Film Presentation,0,1,Adult,0.0,Adaptation,AVX,1,Point of Sale,15,ON,CA,0,1,1,0,Tuesday,0,January,0
3,5,103538,14,1,1,1,131.0,English,English,Film Presentation,0,1,Family,0.0,Comedy,Regular,0,Kiosk,14,BC,CA,0,1,0,0,Tuesday,0,January,0
4,6,120196,23,1,1,1,104.0,French,French,Film Presentation,0,0,Adult,0.0,Comedy,Regular,0,Kiosk,23,QC,CA,0,0,0,0,Tuesday,0,January,0


# Custom Preprocessing Transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

class MyTransformer():
    """Feature builder: one-hot encodes categorical columns and adds member-level aggregates.

    ``fit`` does two things using the training rows only:

    1. Fits a ``OneHotEncoder`` on every object-dtype column of ``X``.
    2. Computes per-member aggregation statistics from the visits whose ``Visit_ID`` appears
       in ``X``, for example:
       - the number of total visits
       - the number of different films / genres the member has seen
       - how often the member has purchased large popcorn, butter, candy, ...

    ``transform`` only *looks up* the statistics computed during ``fit``; it never recomputes
    them from the rows being transformed, so test-period visits cannot leak into the features.
    A member that was not seen during ``fit`` receives the mean of each aggregate feature
    over the training members.

    NOTE(review): for the training rows themselves, a member's aggregates include that same
    row's own target flag, so in-sample fit quality will look optimistic.

    Parameters
    ----------
    visits_df : pandas.DataFrame
        The full visits table on which the member aggregates are computed. It must contain
        ``Visit_ID``, ``CINEPLEX_D_Membership_ID`` and every column aggregated in ``fit``.
        The table is never modified, so the same instance can be re-fit on a different
        (e.g. larger) training window.
    """

    def __init__(self, visits_df):
        self.visits_df = visits_df
        self.visit_member_map = visits_df[['Visit_ID', 'CINEPLEX_D_Membership_ID']]

    @staticmethod
    def _make_encoder():
        """Build a dense OneHotEncoder that ignores categories unseen during fit."""
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` (and later removed `sparse`);
        # try the new keyword first and fall back for older releases.
        try:
            return OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
        except TypeError:
            return OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)

    def fit(self, X, y=None):
        """Fit the encoder and compute the member-level aggregates for the rows in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training rows. Must contain ``Visit_ID``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        MyTransformer
            ``self``, with ``cat_attrs``, ``OHE``, ``cat_feature_names``, ``members_visits``,
            ``memb_feature_names`` and ``memb_feature_means`` set.
        """
        # First, fit the OneHotEncoder and save it for later.
        # (`np.object` was only an alias of the builtin `object` and no longer exists in NumPy.)
        self.cat_attrs = X.select_dtypes(include=[object]).columns
        self.OHE = self._make_encoder()
        self.OHE.fit(X[self.cat_attrs])
        if hasattr(self.OHE, 'get_feature_names_out'):
            encoded_names = self.OHE.get_feature_names_out(list(self.cat_attrs))
        else:
            # scikit-learn < 1.0
            encoded_names = self.OHE.get_feature_names(list(self.cat_attrs))
        self.cat_feature_names = list(encoded_names)

        # Now, compute the member-level aggregation statistics.

        # Limit the visits to just the ones in the training data being fit to; otherwise there
        # would be data leakage. The subset is kept in a local variable: overwriting
        # self.visits_df would permanently shrink it, and a later fit() on a larger training
        # window would then silently see only the visits of the first window.
        train_visits = self.visits_df[self.visits_df['Visit_ID'].isin(X['Visit_ID'])]

        # groupby wants a single value per group, so return only the first mode in case of a
        # tie, and NaN when the group has no non-null value at all. The function is named
        # `mode` so the aggregated columns come out as "<column>_mode".
        def mode(values):
            modes = values.mode()
            return modes.iloc[0] if len(modes) else np.nan

        # Use a simple group-by to compute some aggregate statistics. We can do anything we
        # want here, this is just an example!
        members_visits = train_visits.groupby('CINEPLEX_D_Membership_ID').agg(
        {
            'Visit_Date': ['count'],

            'Film_ID': [pd.Series.nunique, mode],
            'Film_Runtime': ['mean', 'max'],
            'Is_VIP': ['mean', 'max'],
            'Market_Language': [mode],
            'Genre_Hierarchy_1': [pd.Series.nunique, mode],

            'Spend': ['min', 'mean', 'max', 'sum'],
            'Discount': ['mean'],

            'Has_US_Is_LrgPopcorn': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Butter': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Alcohol': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Candy': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Kiddietray': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Voucher': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_MenuFood': ['count', 'sum', 'mean', 'max'],
            'Has_US_Is_Coffee': ['count', 'sum', 'mean', 'max'],
        }).reset_index()

        # The group-by produces two-level column labels, which we don't want. Flatten them to
        # "<column>_<statistic>"; the key column comes out with a trailing underscore.
        members_visits.columns = ["_".join(label) for label in members_visits.columns]
        members_visits = members_visits.rename(columns={'CINEPLEX_D_Membership_ID_': "CINEPLEX_D_Membership_ID"})

        # We want all-numeric data, so dummy-encode the remaining text columns (the modes).
        # Float dummies keep the columns numeric even after NaNs are introduced by a merge.
        members_visits = pd.get_dummies(members_visits, dtype=float)

        # Save the results for later
        self.members_visits = members_visits

        # Save the names of the features for later
        self.memb_feature_names = list(self.members_visits.columns)

        # Training-set mean of every aggregate feature, used in transform() for unseen members
        self.memb_feature_means = (
            members_visits.drop(columns=['CINEPLEX_D_Membership_ID']).mean(numeric_only=True)
        )

        return self

    def transform(self, X, y=None):
        """Return the model features for the rows in ``X`` using the state saved by ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Rows to transform. Must contain ``Visit_ID`` and the categorical columns seen in
            ``fit``. ``X`` itself is left untouched.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            One row per row of ``X`` (in the same order, with a fresh 0..n-1 index): the
            non-categorical columns of ``X`` except ``Visit_ID``, the one-hot columns, and the
            member-level aggregate columns.
        """
        # Get the OHE levels
        X_cat = pd.DataFrame(self.OHE.transform(X[self.cat_attrs]), columns=self.cat_feature_names)

        # Map each visit to its member, then bring in the aggregates computed in fit()
        # (left join: members unseen in fit() get NaN here).
        X_member_agg = X[['Visit_ID']].merge(self.visit_member_map, how="inner", on="Visit_ID")
        X_member_agg = X_member_agg.merge(self.members_visits, how="left", on="CINEPLEX_D_Membership_ID")

        # Members without aggregates get the training-set mean of each feature. Using the
        # fitted means (rather than the mean of the batch being transformed) keeps the output
        # independent of the other rows in the batch and avoids all-NaN features when no
        # member of the batch was seen during fit().
        X_member_agg = X_member_agg.fillna(self.memb_feature_means)

        X_member_agg = X_member_agg.drop(['Visit_ID'], axis=1)

        # Align the three pieces on a fresh positional index. reset_index returns a copy, so
        # the caller's X keeps its original index.
        X_transformed = pd.concat(
            [
                X.reset_index(drop=True),
                X_cat.reset_index(drop=True),
                X_member_agg.reset_index(drop=True),
            ],
            axis=1,
            sort=False,
        )

        # The raw categorical columns are replaced by their one-hot columns, and the visit
        # identifier is not a feature.
        X_transformed = X_transformed.drop(self.cat_attrs, axis=1)
        X_transformed = X_transformed.drop('Visit_ID', axis=1)

        return X_transformed

# Modeling

In [None]:
# Which "Has_US..." column of `visits` to predict.
# Change this to whatever you need/want!
target_col = 'Has_US_Is_Candy'

# Features come from the prepared modeling frame; the labels come straight from `visits`.
y = visits.loc[:, target_col]
X = df.copy()

In [None]:
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit


# Feel free to try other algorithms!
clf = RandomForestClassifier(max_depth=10, class_weight="balanced_subsample", criterion="gini", random_state=0)

# This TimeSeriesSplit object will split the data n_splits times, making sure that
# the training data always occurs before the testing data (rows must be in chronological order).
# For more info, see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
tscv = TimeSeriesSplit(n_splits=7)

for i, (train_index, test_index) in enumerate(tscv.split(X), start=1):

    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("\n===========================================================")
    print("Split {}".format(i))
    print(train_index)
    print(test_index)
    print("X_train shape: {}".format(X_train.shape))
    print("X_train weeks min/max: {}, {}".format(X_train['Visit_Date_Week'].min(), X_train['Visit_Date_Week'].max()))
    print("X_train IDs min/max: {}, {}".format(X_train['Visit_ID'].min(), X_train['Visit_ID'].max()))
    print("X_test shape: {}".format(X_test.shape))
    print("X_test  weeks min/max: {}, {}".format(X_test['Visit_Date_Week'].min(), X_test['Visit_Date_Week'].max()))
    # Fixed label: these are the IDs of the *test* rows
    print("X_test IDs min/max: {}, {}".format(X_test['Visit_ID'].min(), X_test['Visit_ID'].max()))

    # Build a fresh transformer for every fold so that no fitted state (in particular the
    # visits kept for the member aggregates) carries over from a previous, smaller fold.
    myt = MyTransformer(visits)

    # Using the training data, build our features and train the model.
    myt = myt.fit(X_train)
    features_train = myt.transform(X_train)
    clf = clf.fit(features_train, y_train)

    # Get the features for testing data, and predict.
    features_test = myt.transform(X_test)
    # Dump of the test features for inspection; the file is overwritten on every fold.
    features_test.to_csv('features_test.csv',)
    y_pred = clf.predict(features_test)
    # Log loss is defined on predicted probabilities, not on hard 0/1 labels
    y_proba = clf.predict_proba(features_test)

    # How did we do?
    print("\nF1 Score  = {:.2f}".format(f1_score(y_test, y_pred, average="macro")))
    print("Accuracy   = {:.2f}".format(accuracy_score(y_test, y_pred)))
    print("Kappa      = {:.2f}".format(cohen_kappa_score(y_test, y_pred)))
    print("Log Loss   = {:.2f}".format(log_loss(y_test, y_proba, labels=clf.classes_)))
    print("\nConfusion Matrix:")
    unique_label = np.unique([y_test, y_pred])
    cmtx = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=unique_label),
        index=['true:{:}'.format(x) for x in unique_label],
        columns=['pred:{:}'.format(x) for x in unique_label])
    print(cmtx)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


Split 1
[   0    1    2 ... 4531 4532 4533]
[4534 4535 4536 ... 9063 9064 9065]
X_train shape: (4534, 29)
X_train weeks min/max: 00, 07
X_train IDs min/max: 2, 9903
X_test shape: (4532, 29)
X_test  weeks min/max: 07, 14
X_train IDs min/max: 9907, 20439

F1 Score  = 0.61
Accuracy   = 0.73
Kappa      = 0.24
Log Loss   = 9.30

Confusion Matrix:
        pred:0  pred:1
true:0    2900     373
true:1     847     412

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.89      0.83      3273
           1       0.52      0.33      0.40      1259

    accuracy                           0.73      4532
   macro avg       0.65      0.61      0.61      4532
weighted avg       0.70      0.73      0.71      4532


Split 2
[   0    1    2 ... 9063 9064 9065]
[ 9066  9067  9068 ... 13595 13596 13597]
X_train shape: (9066, 29)
X_train weeks min/max: 00, 14
X_train IDs min/max: 2, 20439
X_test shape: (4532, 29)
X_test  weeks min/max: 14, 19
X_train

In [None]:
# Rank the features by the importance the most recently fitted classifier assigned to them,
# and show the 20 most important ones.
feature_names = features_train.columns

imp = (
    pd.DataFrame({'Feature': feature_names, 'Importance': clf.feature_importances_})
    .sort_values('Importance', ascending=False)
)
imp.head(20)

Unnamed: 0,Feature,Importance
195,Has_US_Is_Candy_mean,0.10873
194,Has_US_Is_Candy_sum,0.089408
91,Sales_Channel_Point of Sale,0.086363
196,Has_US_Is_Candy_max,0.074581
87,Sales_Channel_Kiosk,0.056132
180,Discount_mean,0.026386
89,Sales_Channel_Online (Cineplex.com),0.02632
178,Spend_max,0.025318
177,Spend_mean,0.024496
176,Spend_min,0.02001
