# Step 1: Gather Data

### *Import Libraries*

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline  

import warnings
warnings.filterwarnings('ignore')

### *Get data*

In [2]:
# Locations of the three CSV inputs, relative to the notebook's working directory
ecommerce_file_path, test_file_path, submission_file_path = (
    'train.csv',
    'test.csv',
    'sample_submission.csv',
)

# Load each file into its own DataFrame, in the same order as the paths above
train_data, test_data, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (ecommerce_file_path, test_file_path, submission_file_path)
)

In [3]:
train_data.shape, test_data.shape, submission_data.shape

((10500, 5), (4500, 4), (4500, 2))

Train data has:
- 10,500 rows (observations)
- 5 columns (features)

Test data has:
- 4,500 rows (observations)
- 4 columns (features)

Submission data has:
- 4,500 rows 
- 2 columns 

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 5 columns):
session_id     10500 non-null object
startTime      10500 non-null object
endTime        10500 non-null object
ProductList    10500 non-null object
gender         10500 non-null object
dtypes: object(5)
memory usage: 410.3+ KB


In [5]:
train_data.describe()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
count,10500,10500,10500,10500,10500
unique,10500,8815,8803,9402,2
top,u13243,19/12/14 8:57,14/11/14 16:15,A00002/B00002/C00007/D00266/,female
freq,1,5,5,25,8192


In [6]:
train_data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


## Step 2: Exploratory Data Analysis

### TRAINING DATA - PreProcessing

### Categorical Data

#### Splitting Product List
Product list contains list of products viewed by the user in the given session and it also contains the category, sub category, sub-sub category and the product all encoded and separated with a slash symbol. Each consecutive product is separated with a semicolon.
- product
- category
- sub_category
- sub_sub_category


In [7]:
train_data['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

In [8]:
# Products viewed in a session = number of ';' separators in ProductList, plus one
train_data['ProductListCount'] = train_data['ProductList'].map(
    lambda product_list: product_list.count(";") + 1
)
train_data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female,1
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,3
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,2


In [9]:
# Reshape from one row per session to one row per viewed product.
# Split each session's ProductList on ';' into a list of product paths.
prod = train_data['ProductList'].str.split(';')
# Repeat every session row once per entry in its list; the index labels are
# repeated as well (they are not reset), so rows of one session share a label.
train_data = train_data.reindex(train_data.index.repeat(prod.apply(len)))
# Flatten the per-session lists into a single array; its order matches the
# repeated rows above, so each row receives exactly one product path.
train_data['product_data'] = np.hstack(prod)
train_data.head()


Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount,product_data
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28435/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D02554/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28436/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28437/
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7,A00001/B00009/C00031/D29404/


In [10]:
# Each product_data entry is slash-separated, in the order
# category / sub_category / sub_sub_category / product, followed by a trailing '/'.
# The trailing slash yields a fifth piece, captured in 'other' to check whether
# anything else follows the product code.
train_data[['category','sub_category','sub_sub_category','product','other']] = train_data.product_data.str.split('[/]', expand=True)
train_data.head()


Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount,product_data,category,sub_category,sub_sub_category,product,other
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28435/,A00002,B00003,C00006,D28435,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D02554/,A00002,B00003,C00006,D02554,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28436/,A00002,B00003,C00006,D28436,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28437/,A00002,B00003,C00006,D28437,
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7,A00001/B00009/C00031/D29404/,A00001,B00009,C00031,D29404,


In [11]:
# Drop the original ProductList plus the helper columns ProductListCount,
# product_data and other, which are no longer needed once the four parts are split out
train_data.drop(['ProductList', 'ProductListCount', 'product_data','other'], axis=1, inplace=True)

In [12]:
#Display data
train_data.head()

Unnamed: 0,session_id,startTime,endTime,gender,category,sub_category,sub_sub_category,product
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437
1,u10253,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404


pandas provides the useful function `value_counts()` to count unique items – it returns a Series with the counts of unique values.

In [13]:
train_data['category'].value_counts()

A00002    14931
A00003     3822
A00001     3300
A00005      413
A00004      261
A00006      182
A00011      130
A00010       75
A00007       52
A00009       47
A00008       38
Name: category, dtype: int64

As per data set we have 11 categories of products

The pandas `DataFrame.nunique()` function returns a Series with the number of distinct observations over the requested axis. If we set the value of axis to 0, it finds the total number of unique observations over the index axis. If we set the value of axis to 1, it finds the total number of unique observations over the column axis. It also provides the option to exclude NaN values from the count of unique values.

In [14]:
#find the unique values across the column axis.
train_data.nunique()

session_id          10500
startTime            8815
endTime              8803
gender                  2
category               11
sub_category           85
sub_sub_category      360
product             16503
dtype: int64

In [15]:
train_data.shape

(23251, 8)

In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23251 entries, 0 to 10499
Data columns (total 8 columns):
session_id          23251 non-null object
startTime           23251 non-null object
endTime             23251 non-null object
gender              23251 non-null object
category            23251 non-null object
sub_category        23251 non-null object
sub_sub_category    23251 non-null object
product             23251 non-null object
dtypes: object(8)
memory usage: 1.6+ MB


### Features Engineering
Transform using dummy variables so sklearn can understand them.

To change the datatype of a date column to DateTime, for example:

`train_bank["birthdate"] = pd.to_datetime(train_bank["birthdate"])`

To convert the whole date column to time in seconds without having to extract day, year, month, for example:

`train_bank["birth"] = [datetime.timestamp(i) for i in train_bank.birthdate]`

#### DateTime Features

In [17]:
# Convert startTime and endTime to datetime format.
# The raw strings are day-first with a two-digit year (e.g. '15/12/14 18:11').
# Without an explicit format, ambiguous values such as '01/12/14 15:58' can be
# read month-first (12 January instead of 1 December), so the format is pinned;
# any row that does not match raises instead of being silently mis-parsed.
train_data["startTime"] = pd.to_datetime(train_data["startTime"], format='%d/%m/%y %H:%M')
train_data["endTime"] = pd.to_datetime(train_data["endTime"], format='%d/%m/%y %H:%M')

#merged_train["closeddate"]= [datetime.timestamp(i) for i in merged_train.closeddate]



date_column= ['startTime', 'endTime']

def extract_date(train_data,cols):
    """Add calendar-part feature columns for each datetime column, in place.

    For every name ``x`` in ``cols`` the columns ``x_year``, ``x_month``,
    ``x_week`` (ISO week number), ``x_day``, ``x_hour``, ``x_minute`` and
    ``x_dayofweek`` (Monday = 0) are written onto ``train_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to modify; each column named in ``cols`` must already have a
        datetime dtype so the ``.dt`` accessor is available.
    cols : iterable of str
        Names of the datetime columns to expand.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    for x in cols:
        dates = train_data[x].dt
        train_data[x +'_year'] = dates.year
        train_data[x +'_month'] = dates.month
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is the replacement. Fall back to .week only on
        # pandas versions that predate isocalendar().
        if hasattr(dates, 'isocalendar'):
            week = dates.isocalendar().week
            # isocalendar() yields a nullable UInt32 column; cast to a plain
            # numpy dtype (float only when missing timestamps force NaN).
            train_data[x +'_week'] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            train_data[x +'_week'] = dates.week
        train_data[x +'_day'] = dates.day
        train_data[x +'_hour'] = dates.hour
        train_data[x +'_minute'] = dates.minute
        train_data[x +'_dayofweek'] = dates.dayofweek
          
extract_date(train_data, date_column)

train_data.columns

Index(['session_id', 'startTime', 'endTime', 'gender', 'category',
       'sub_category', 'sub_sub_category', 'product', 'startTime_year',
       'startTime_month', 'startTime_week', 'startTime_day', 'startTime_hour',
       'startTime_minute', 'startTime_dayofweek', 'endTime_year',
       'endTime_month', 'endTime_week', 'endTime_day', 'endTime_hour',
       'endTime_minute', 'endTime_dayofweek'],
      dtype='object')

In [18]:
train_data.head()

Unnamed: 0,session_id,startTime,endTime,gender,category,sub_category,sub_sub_category,product,startTime_year,startTime_month,...,startTime_hour,startTime_minute,startTime_dayofweek,endTime_year,endTime_month,endTime_week,endTime_day,endTime_hour,endTime_minute,endTime_dayofweek
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,female,A00002,B00003,C00006,D28435,2014,12,...,18,11,0,2014,12,51,15,18,12,0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,female,A00002,B00003,C00006,D02554,2014,12,...,18,11,0,2014,12,51,15,18,12,0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,female,A00002,B00003,C00006,D28436,2014,12,...,18,11,0,2014,12,51,15,18,12,0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,female,A00002,B00003,C00006,D28437,2014,12,...,18,11,0,2014,12,51,15,18,12,0
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,male,A00001,B00009,C00031,D29404,2014,12,...,14,35,1,2014,12,51,16,14,41,1


In [19]:
train_data.startTime.head()

0   2014-12-15 18:11:00
0   2014-12-15 18:11:00
0   2014-12-15 18:11:00
0   2014-12-15 18:11:00
1   2014-12-16 14:35:00
Name: startTime, dtype: datetime64[ns]

##### Convert the difference between two datetime objects to seconds 
Converting the difference between two datetime.datetime objects to seconds returns the total amount of seconds between the two.

The dt.total_seconds() function is used to return total duration of each element expressed in seconds.
- Subtract the older datetime.datetime from the more recent one (here, `endTime - startTime`) to create a datetime.timedelta that contains the time difference between the two. Use datetime.timedelta.total_seconds() to get the total number of seconds.

https://www.kite.com/python/answers/how-to-convert-the-difference-between-two-datetime-objects-to-seconds-in-python

In [20]:
# Session length in seconds: endTime minus startTime, converted from timedelta to float seconds
train_data['duration'] = (train_data['endTime']-train_data['startTime']).dt.total_seconds()

In [21]:
# Drop the raw startTime/endTime columns now that the year, month, week, day, hour,
# minute and day-of-week features (and the duration) have been extracted from them
train_data.drop(columns=date_column,axis=1,inplace=True)

In [22]:
train_data.head()

Unnamed: 0,session_id,gender,category,sub_category,sub_sub_category,product,startTime_year,startTime_month,startTime_week,startTime_day,...,startTime_minute,startTime_dayofweek,endTime_year,endTime_month,endTime_week,endTime_day,endTime_hour,endTime_minute,endTime_dayofweek,duration
0,u16159,female,A00002,B00003,C00006,D28435,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,female,A00002,B00003,C00006,D02554,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,female,A00002,B00003,C00006,D28436,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,female,A00002,B00003,C00006,D28437,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
1,u10253,male,A00001,B00009,C00031,D29404,2014,12,51,16,...,35,1,2014,12,51,16,14,41,1,360.0


#### Missing Values

In [23]:
train_data.isnull().any()

session_id             False
gender                 False
category               False
sub_category           False
sub_sub_category       False
product                False
startTime_year         False
startTime_month        False
startTime_week         False
startTime_day          False
startTime_hour         False
startTime_minute       False
startTime_dayofweek    False
endTime_year           False
endTime_month          False
endTime_week           False
endTime_day            False
endTime_hour           False
endTime_minute         False
endTime_dayofweek      False
duration               False
dtype: bool

There are no null values in the train_data dataframe

In [24]:
#check duplicates
train_data.duplicated().sum()

0

### Visualizing Data

# Scatter of session duration (x) against gender (y).
# train_data holds one row per viewed product at this point, so every session
# contributes one point per product it contains.
fig, ax=plt.subplots(figsize=(16,8))
ax.scatter(train_data['duration'], train_data['gender'])
ax.set_xlabel('duration')
ax.set_ylabel('gender')
plt.show()

# Histogram of the gender column (counts are product-view rows, not sessions)
plt.figure(figsize = (10,10))

plt.title('Gender of participants')

plt.hist(train_data['gender'], bins = 2, edgecolor = 'k')
# Edge color is added to make the bars more visible
plt.show()

# Gender by Duration
plt.figure(figsize=(10,5)) # Resized graph
plt.title('Gender by Duration of product viewing')

# Rotate the x tick labels
plt.xticks(rotation=75)
sns.barplot(x='gender',y='duration',data=train_data, ci=None, estimator=np.std); 
# No explicit color is passed, so seaborn's default palette is used.
# estimator=np.std: the bar heights are the standard deviation of duration per gender, not the mean.

# Pairwise relationships between the columns, colored by gender
sns.pairplot(train_data,hue='gender');

### Encoding Categorical Variables

In [25]:
train_data.dtypes

session_id              object
gender                  object
category                object
sub_category            object
sub_sub_category        object
product                 object
startTime_year           int64
startTime_month          int64
startTime_week           int64
startTime_day            int64
startTime_hour           int64
startTime_minute         int64
startTime_dayofweek      int64
endTime_year             int64
endTime_month            int64
endTime_week             int64
endTime_day              int64
endTime_hour             int64
endTime_minute           int64
endTime_dayofweek        int64
duration               float64
dtype: object

In [26]:
# Encode the target as integers with .map: female -> 0, male -> 1.
# Series.map returns NaN for values missing from the dict, so re-running this
# cell on the already-encoded column would replace every value with NaN.
train_data['gender'] = train_data.gender.map({'female':0, 'male':1})

In [28]:
train_data.head()

Unnamed: 0,session_id,gender,category,sub_category,sub_sub_category,product,startTime_year,startTime_month,startTime_week,startTime_day,...,startTime_minute,startTime_dayofweek,endTime_year,endTime_month,endTime_week,endTime_day,endTime_hour,endTime_minute,endTime_dayofweek,duration
0,u16159,0,A00002,B00003,C00006,D28435,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,0,A00002,B00003,C00006,D02554,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,0,A00002,B00003,C00006,D28436,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,u16159,0,A00002,B00003,C00006,D28437,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
1,u10253,1,A00001,B00009,C00031,D29404,2014,12,51,16,...,35,1,2014,12,51,16,14,41,1,360.0


# List the columns still stored as object dtype and preview their contents
is_object = (train_data.dtypes == "object").values
cat_cols = train_data.columns[is_object]
print(cat_cols)
train_data[cat_cols].head()

In [29]:
from sklearn.model_selection import train_test_split

# Target (gender) and predictors (every other column)
y = train_data.gender
X = train_data.drop(columns='gender')

# 80/20 split into training and validation subsets, seeded for reproducibility
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

column_dtypes = X_train_full.dtypes

# All object-dtype (categorical) columns.
# NOTE: despite the name, no cardinality threshold is applied here, so
# high-cardinality columns are selected as well.
low_cardinality_cols = [cname for cname, kind in column_dtypes.items() if kind == "object"]

# Numerical columns: int64 and float64 only
numerical_cols = [cname for cname, kind in column_dtypes.items() if kind in ('int64', 'float64')]

# Keep the selected columns only, categorical first, as independent copies
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

#### Label Encoding

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Score one preprocessing approach with a fixed random-forest regressor.

    Fits a 100-tree RandomForestRegressor (random_state=0) on the training
    split and returns the mean absolute error of its predictions on the
    validation split.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [31]:
# Boolean mask over X_train's columns: True where the dtype is object
s = X_train.dtypes.eq('object')
# Names of the categorical (object-dtype) columns, in column order
object_cols = s.index[s.values].tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['session_id', 'category', 'sub_category', 'sub_sub_category', 'product']


# Every object-dtype column of the training split
object_cols = list(filter(lambda col: X_train[col].dtype == "object", X_train.columns))

# A column is safe to label encode only when the training and validation
# splits contain exactly the same set of values
good_label_cols = list(
    filter(lambda col: set(X_train[col]) == set(X_valid[col]), object_cols)
)

# Everything else would hit unseen labels at transform time and is dropped
bad_label_cols = list(set(object_cols)-set(good_label_cols))

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

from sklearn.preprocessing import LabelEncoder

# Start from copies of the splits without the columns that cannot be encoded
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# One shared encoder: for each column, learn the mapping on the training split
# and apply that same mapping to both splits
label_encoder = LabelEncoder()
for col in set(good_label_cols):
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

In [32]:
from sklearn import preprocessing

# Label-encode every categorical column of the full frame.
# A fresh encoder is fitted per column and kept in label_encoders, so the same
# string -> integer mapping can be re-applied later (e.g. to test_data).
# Refitting a single shared encoder would discard all but the last mapping.
label_encoders = {}
for i in object_cols:
    label_encoder = preprocessing.LabelEncoder()
    train_data[i] = label_encoder.fit_transform(train_data[i])
    label_encoders[i] = label_encoder

In [33]:
train_data

Unnamed: 0,session_id,gender,category,sub_category,sub_sub_category,product,startTime_year,startTime_month,startTime_week,startTime_day,...,startTime_minute,startTime_dayofweek,endTime_year,endTime_month,endTime_week,endTime_day,endTime_hour,endTime_minute,endTime_dayofweek,duration
0,4324,0,1,2,5,13325,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,4324,0,1,2,5,1478,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,4324,0,1,2,5,13326,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
0,4324,0,1,2,5,13327,2014,12,51,15,...,11,0,2014,12,51,15,18,12,0,60.0
1,178,1,0,8,30,13753,2014,12,51,16,...,35,1,2014,12,51,16,14,41,1,360.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10498,4978,0,2,11,71,4879,2014,11,47,22,...,27,5,2014,11,47,22,11,27,5,0.0
10499,9212,0,1,0,9,1349,2014,12,51,19,...,11,4,2014,12,51,19,3,19,4,480.0
10499,9212,0,1,1,1,14996,2014,12,51,19,...,11,4,2014,12,51,19,3,19,4,480.0
10499,9212,0,1,0,58,10266,2014,12,51,19,...,11,4,2014,12,51,19,3,19,4,480.0


In [34]:
train_data.dtypes

session_id               int32
gender                   int64
category                 int32
sub_category             int32
sub_sub_category         int32
product                  int32
startTime_year           int64
startTime_month          int64
startTime_week           int64
startTime_day            int64
startTime_hour           int64
startTime_minute         int64
startTime_dayofweek      int64
endTime_year             int64
endTime_month            int64
endTime_week             int64
endTime_day              int64
endTime_hour             int64
endTime_minute           int64
endTime_dayofweek        int64
duration               float64
dtype: object

from sklearn.model_selection import train_test_split

# Target vector (gender) and feature matrix (all remaining columns)
y = train_data['gender']
X = train_data.drop(columns='gender')

# 80/20 train/validation split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Baseline: unconstrained regression tree fitted on the training split
# (X_train / y_train / X_valid / y_valid come from the preceding train_test_split).
# Specify Model
train_model = DecisionTreeRegressor(random_state=1)
# Fit Model
train_model.fit(X_train, y_train)

# Make validation predictions and calculate mean absolute error
val_predictions = train_model.predict(X_valid)
validation_mae = mean_absolute_error(y_valid, val_predictions)
# gender is encoded as 0/1, so the MAE lies in [0, 1]; print three decimals
# instead of rounding to a whole number
print("Validation MAE when not specifying max_leaf_nodes: {:,.3f}".format(validation_mae))


# Same model capped at 100 leaves.
# NOTE(review): no search over max_leaf_nodes is visible here, so "best value" is unverified.
melb_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
melb_model.fit(X_train, y_train)
val_predictions = melb_model.predict(X_valid)
val_mae = mean_absolute_error(y_valid, val_predictions)
print("Validation MAE for best value of max_leaf_nodes: {:,.3f}".format(val_mae))

## Data Modeling

In [None]:
# 70/30 split used by the classifiers below; this replaces the earlier
# X_train / y_train. The seed is fixed so the split, and therefore every
# reported metric, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
#library for reports
from sklearn.metrics import classification_report, confusion_matrix

# splitter='random' draws random split thresholds, so the seed is fixed to
# make the fitted tree and its report reproducible
DecisionTree_ML = DecisionTreeClassifier(criterion='entropy', splitter='random', random_state=0)
DecisionTree_ML.fit(X_train, y_train)
predictions = DecisionTree_ML.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Bootstrap sampling and feature sub-sampling are random; fix the seed so the
# forest and its report are reproducible
RandomForest = RandomForestClassifier(n_estimators=10, random_state=0)
RandomForest.fit(X_train, y_train)

# Predicted gender labels for the held-out rows
pred = RandomForest.predict(X_test)

print(classification_report(y_test, pred))

In [None]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Boost 100 decision trees (DecisionTreeClassifier is imported in the
# decision-tree cell above). The seed is fixed so the ensemble and its report
# are reproducible.
clf_Ada = AdaBoostClassifier(
    DecisionTreeClassifier(), n_estimators=100, random_state=0)

clf_Ada.fit(X_train, y_train)
# Predicted gender labels for the held-out rows
pred_Ada = clf_Ada.predict(X_test)

print(classification_report(y_test, pred_Ada))

In [None]:
#library for reports
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# 2-nearest-neighbour classifier: uniform votes, kd-tree search, Minkowski p=2
Model_KNN = KNeighborsClassifier(
    n_neighbors=2,
    weights='uniform',
    algorithm='kd_tree',
    leaf_size=15,
    p=2,
).fit(X_train, y_train)

# Evaluate on the held-out rows
predictions = Model_KNN.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
#library for reports
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with class priors estimated from the data
Model_GNB = GaussianNB(priors=None, var_smoothing=1e-06).fit(X_train, y_train)

# Evaluate on the held-out rows
predictions = Model_GNB.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Split each start/end timestamp into separate calendar-date and time-of-day columns.
# NOTE(review): `data`, `start_datetime` and `end_datetime` are not defined anywhere in
# the visible notebook, so this cell looks like leftover scratch code and would raise
# NameError on a fresh run. Confirm whether it should be removed or pointed at train_data.
data['start_date'] = [d.date() for d in data['start_datetime']]
data['start_time'] = [d.time() for d in data['start_datetime']]
data['end_date'] = [d.date() for d in data['end_datetime']]
data['end_time'] = [d.time() for d in data['end_datetime']]