# Prediction of customers' travel pattern

- https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

- https://towardsdatascience.com/predicting-hotel-bookings-with-user-search-parameters-8c570ab24805

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For modeling building and tunning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# for deep learning if I will have time

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
# for evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

In [6]:
df = pd.read_csv('model_data.csv')
df.shape

(45805, 8)

In [7]:
df.columns

Index(['event_type', 'origin', 'destination', 'distance', 'num_family',
       'ts_datetime', 'len_jour', 'ts_hour'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20
2,book,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23
3,book,BER,BCN,1498.817537,1,2017-04-27 15:17:50,3.0,15
4,book,DEL,BKK,2921.339028,4,2017-04-27 22:51:57,6.0,22


### a. creating 3-feature dataset

In [9]:
df_three_feat=df[["event_type","distance","num_family","len_jour"]]

In [10]:
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,search,5834.154716,7,6.0
1,book,6525.926149,4,21.0
2,book,469.781624,2,3.0


In [11]:
df_three_feat['event_type'] = df_three_feat.event_type.map({'search':0, 'book':1})
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,0,5834.154716,7,6.0
1,1,6525.926149,4,21.0
2,1,469.781624,2,3.0


### b. Creating all feature dataset

In [12]:
df_all = pd.read_csv('all_features.csv')
df_all.shape

(45805, 518)

In [13]:
df_all.head(3)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour,origin_ADB,origin_ADL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_all_feat=df_all.drop(['origin','destination','ts_datetime'], axis=1)

In [15]:
df_all_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,469.781624,2,3.0,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we have both datasets aligned. So, let's concentrate on our problem of imbalanced classes

In [16]:
df_three_feat.event_type.value_counts()

0    43997
1     1808
Name: event_type, dtype: int64

In [17]:
df_all_feat.event_type.value_counts(normalize=True)

0    0.960528
1    0.039472
Name: event_type, dtype: float64

This is the problem. We have only 3.9% of our data is of booking class.

# 3)- Solution1: Under-sampling method

In [18]:
booking_indices = df_three_feat[df_three_feat.event_type == 1].index
random_indices = np.random.choice(booking_indices, len(df_three_feat.loc[df_three_feat.event_type == 1]), replace=False)
booking_sample = df_three_feat.loc[random_indices]

In [19]:
booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
2328,1,422.981035,5,2.0
5507,1,2937.575383,1,21.0
4120,1,9115.414212,2,9.0
7366,1,670.157951,0,-66.0
2223,1,737.318313,2,4.0
...,...,...,...,...
7496,1,438.286394,1,2.0
5566,1,782.352978,1,5.0
9581,1,1009.622959,1,3.0
8259,1,840.752213,2,4.0


In [20]:
not_booking = df_three_feat[df_three_feat.event_type == 0].index
random_indices = np.random.choice(not_booking, sum(df_three_feat['event_type']), replace=False)
not_booking_sample = df_three_feat.loc[random_indices]

In [21]:
not_booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
41334,0,1895.687723,1,0.0
11681,0,6381.757139,5,9.0
10704,0,4837.258168,2,9.0
11397,0,1345.774668,3,56.0
34306,0,6611.310005,1,3.0
...,...,...,...,...
30270,0,12034.138178,1,10.0
35910,0,1031.243222,1,5.0
4252,0,365.531414,1,0.0
4534,0,1475.489132,1,0.0


In [22]:
imb_three_feat = pd.concat([not_booking_sample, booking_sample], axis=0)

In [23]:
print("Percentage of search clicks: ", len(imb_three_feat[imb_three_feat.event_type == 0])/len(imb_three_feat))
print("Percentage of booking clicks: ", len(imb_three_feat[imb_three_feat.event_type == 1])/len(imb_three_feat))
print("Total number of records in resampled data: ", len(imb_three_feat))

Percentage of search clicks:  0.5
Percentage of booking clicks:  0.5
Total number of records in resampled data:  3616


In [24]:
#save dataset
imb_three_feat.to_csv('undersample_my_feat.csv',index=False)

## Alternative way of doing Imbalanced class: USING imblearn

In [25]:
df_imbalance=df[["event_type","distance","num_family","len_jour"]]
df_imbalance['event_type'] = df_imbalance.event_type.map({'search':0, 'book':1})

In [26]:
df_imbalance.head(2)

Unnamed: 0,event_type,distance,num_family,len_jour
0,0,5834.154716,7,6.0
1,1,6525.926149,4,21.0


In [27]:
booking = df_imbalance[df_imbalance['event_type']==1]

search = df_imbalance[df_imbalance['event_type']==0]

In [28]:
print(booking.shape,search.shape)

(1808, 4) (43997, 4)


In [29]:
X=df_imbalance[['distance','num_family','len_jour']]
y=df_imbalance['event_type']

In [30]:
print(X.shape, y.shape)

(45805, 3) (45805,)


In [31]:
from imblearn.under_sampling import NearMiss
# Implementing Undersampling for Handling Imbalanced 
nm = NearMiss()
X_under,y_under=nm.fit_sample(X,y)
X_under.shape,y_under.shape

((3616, 3), (3616,))

Same as above case

# 4)- Model Building

### 4.1)-Separate features

In [32]:
imb_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
41334,0,1895.687723,1,0.0
11681,0,6381.757139,5,9.0
10704,0,4837.258168,2,9.0


In [33]:
imb_three_feat.event_type.value_counts()

1    1808
0    1808
Name: event_type, dtype: int64

In [34]:
target=imb_three_feat["event_type"]

In [35]:
features=imb_three_feat[["distance","num_family","len_jour"]]

In [36]:
print(target.shape)
print(features.shape)

(3616,)
(3616, 3)


### 4.2)-Normalize data

In [37]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(features)

### 4.3)-train_test_split

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [39]:
print(X_train.shape)
print(X_test.shape)

(2531, 3)
(1085, 3)


In [40]:
print(y_train.shape)
print(y_test.shape)

(2531,)
(1085,)


In [41]:
X_train

array([[-0.76451847,  0.12707222, -0.44687264],
       [-0.42816808, -0.73826329, -0.30558016],
       [ 1.81657913, -0.73826329,  1.46057585],
       ...,
       [-0.26060819,  0.99240773, -0.58816512],
       [-0.70723905, -0.73826329, -0.58816512],
       [-0.72945769, -0.73826329, -0.3762264 ]])

In [42]:
# Logistic Classifeir
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [43]:
predictions_LR[:5]

array([1, 1, 0, 1, 1])

In [44]:
print(accuracy_score(y_test,predictions_LR))

0.5714285714285714


In [45]:
print(recall_score(y_test,predictions_LR))

0.8078358208955224


In [46]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.64      0.34      0.45       549
           1       0.54      0.81      0.65       536

    accuracy                           0.57      1085
   macro avg       0.59      0.57      0.55      1085
weighted avg       0.60      0.57      0.55      1085



And here we are, we have lower accuracy(54.37%) than our previous model(96.17%).

But, we have got some valuable results for our booking class now.

# 5)-Model with ALL_Features

In [47]:
df_all_feat.head(2)

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
booking_indices = df_all_feat[df_all_feat.event_type == 1].index
random_indices = np.random.choice(booking_indices, len(df_all_feat.loc[df_all_feat.event_type == 1]), replace=False)
booking_sample = df_all_feat.loc[random_indices]

In [49]:
booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
7596,1,1334.259311,3,7.0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6990,1,2069.081413,3,7.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
504,1,5819.395952,1,28.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
282,1,1666.194851,2,4.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4238,1,9295.936700,1,7.0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,1,2239.218982,2,21.0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7158,1,614.303469,1,0.0,16,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6271,1,2827.370510,2,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6531,1,2094.134792,2,5.0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
search = df_all_feat[df_all_feat.event_type == 0].index
random_indices = np.random.choice(search, sum(df_all_feat['event_type']), replace=False)
search_sample = df_all_feat.loc[random_indices]

In [51]:
search_sample

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
20941,0,471.272919,1,4.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39511,0,9437.922930,4,23.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31676,0,1864.303022,2,5.0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16394,0,255.217059,2,2.0,21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24937,0,1106.816306,1,0.0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10250,0,1332.420107,3,0.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36861,0,4881.354498,2,0.0,22,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43800,0,9520.187306,3,11.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9300,0,1966.977016,2,12.0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
df_imb_all_feat = pd.concat([search_sample, booking_sample], axis=0)

In [53]:
df_imb_all_feat.shape

(3616, 515)

In [54]:
print("Percentage of search clicks: ", len(df_imb_all_feat[df_imb_all_feat.event_type == 0])/len(df_imb_all_feat))
print("Percentage of booking clicks: ", len(df_imb_all_feat[df_imb_all_feat.event_type == 1])/len(df_imb_all_feat))
print("Total number of records in resampled data: ", len(df_imb_all_feat))

Percentage of search clicks:  0.5
Percentage of booking clicks:  0.5
Total number of records in resampled data:  3616


In [55]:
#save dataset
df_imb_all_feat.to_csv('undersample_all_feat.csv',index=False)

### 5.a)- Model Building for all feature dataset

In [56]:
target=df_imb_all_feat["event_type"]
features=df_imb_all_feat.drop(['event_type'], axis=1)

In [57]:
print(target.shape)
print(features.shape)

(3616,)
(3616, 514)


In [58]:
X = StandardScaler().fit_transform(features)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [60]:
print(X_train.shape)
print(X_test.shape)

(2531, 514)
(1085, 514)


In [61]:
print(y_train.shape)
print(y_test.shape)

(2531,)
(1085,)


In [62]:
# Logistic Classifeir
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

### 5b. Evaluate Model


https://machinelearningmastery.com/tour-of-evaluation-metrics-for-imbalanced-classification/

In [63]:
print(accuracy_score(y_test,predictions_LR))

0.5686635944700461


In [64]:
print(recall_score(y_test,predictions_LR))

0.582089552238806


In [65]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.58      0.56      0.57       549
           1       0.56      0.58      0.57       536

    accuracy                           0.57      1085
   macro avg       0.57      0.57      0.57      1085
weighted avg       0.57      0.57      0.57      1085



**Results of two models**
- Model with Three selected Features:
    
Accuracy: 54% <br>
Pprecision: For 0 -> 60% , For 1 -> 53% <br>
recall: For 0 -> 0.30 , For 1 ->  79% <br>
f1-score: For 0 -> 40 % , For 1 -> 63% <br>


- For model with all features, we can see results above

**Which is better model out of two using under sample method**

Without going nerd, I ll consider F1-score as matrics to judge performance. I ll discuss in detail what evaluation matrics mean for us in next Notebook. For now, <br>
- F1-score = (2 * Precision * Recall) / (Precision + Recall) <br>

It is combination of both Precision and Recall. And we have 3- feature model that performs more consistent with F1-score. So, I ll use 3-features model if we select "under-sampling" as our solution.

# 6)- Solution 2: Over-Sampling

In [66]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [67]:
df_imbalance=df[["event_type","distance","num_family","len_jour"]]
df_imbalance['event_type'] = df_imbalance.event_type.map({'search':0, 'book':1})

In [68]:
booking = df_imbalance[df_imbalance['event_type']==1]

search = df_imbalance[df_imbalance['event_type']==0]

In [69]:
print(booking.shape,search.shape)

(1808, 4) (43997, 4)


In [70]:
X=df_imbalance[['distance','num_family','len_jour']]
y=df_imbalance['event_type']

In [71]:
print(X.shape, y.shape)

(45805, 3) (45805,)


In [72]:
# Implementing Oversampling for Handling Imbalanced 
smk = SMOTETomek(random_state=42)
X_over,y_over=smk.fit_sample(X,y)

In [73]:
X_over.shape,y_over.shape

((86368, 3), (86368,))

In [74]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
print('Over-sampled dataset shape {}'.format(Counter(y_over)))

Original dataset shape Counter({0: 43997, 1: 1808})
Over-sampled dataset shape Counter({0: 43184, 1: 43184})


### 6a. Model with my_features

In [75]:
X = StandardScaler().fit_transform(X_over)

In [76]:
print(X.shape)
print(y_over.shape)

(86368, 3)
(86368,)


In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y_over, test_size=0.2, random_state=0)

In [78]:
print(X_train.shape)
print(X_test.shape)

(69094, 3)
(17274, 3)


In [79]:
print(y_train.shape)
print(y_test.shape)

(69094,)
(17274,)


In [80]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [81]:
print(accuracy_score(y_test,predictions_LR))

0.5775153409748756


In [82]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.59      0.47      0.52      8545
           1       0.57      0.69      0.62      8729

    accuracy                           0.58     17274
   macro avg       0.58      0.58      0.57     17274
weighted avg       0.58      0.58      0.57     17274



### 6b.Model with all_features

In [83]:
df_all_feat.head(2)

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
y_all=df_all_feat["event_type"]
X_all=df_all_feat.drop(['event_type'], axis=1)

In [85]:
print(X_all.shape)
print(y_all.shape)

(45805, 514)
(45805,)


In [86]:
smk = SMOTETomek(random_state=42)
X_over_all,y_over_all=smk.fit_sample(X_all,y_all)

In [87]:
X_over_all.shape,y_over_all.shape

((86726, 514), (86726,))

In [88]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(y_all)))
print('Over-sampled dataset shape {}'.format(Counter(y_over_all)))

Original dataset shape Counter({0: 43997, 1: 1808})
Over-sampled dataset shape Counter({0: 43363, 1: 43363})


In [89]:
X_over_ALL = StandardScaler().fit_transform(X_over_all)

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X_over_ALL, y_over_all, test_size=0.2, random_state=0)

In [91]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(69380, 514)
(17346, 514)
(69380,)
(17346,)


In [92]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [93]:
print(accuracy_score(y_test,predictions_LR))

0.8449786694338752


In [94]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86      8591
           1       0.95      0.73      0.83      8755

    accuracy                           0.84     17346
   macro avg       0.86      0.85      0.84     17346
weighted avg       0.87      0.84      0.84     17346



# 7)- Solution 3: Random Over-Sampling

In [95]:
df_imbalance=df[["event_type","distance","num_family","len_jour"]]
df_imbalance['event_type'] = df_imbalance.event_type.map({'search':0, 'book':1})

In [96]:
booking = df_imbalance[df_imbalance['event_type']==1]

search = df_imbalance[df_imbalance['event_type']==0]

In [97]:
print(booking.shape,search.shape)

(1808, 4) (43997, 4)


In [98]:
X=df_imbalance[['distance','num_family','len_jour']]
y=df_imbalance['event_type']

In [99]:
from imblearn.over_sampling import RandomOverSampler
oversample =  RandomOverSampler(sampling_strategy=0.75)

In [100]:
print(X.shape, y.shape)

(45805, 3) (45805,)


In [101]:
X_randos, y_randos = oversample.fit_sample(X, y)

In [102]:
X_randos.shape,y_randos.shape

((76994, 3), (76994,))

In [103]:
print('Original dataset shape {}'.format(Counter(y)))
print('Over-sampled dataset shape {}'.format(Counter(y_randos)))

Original dataset shape Counter({0: 43997, 1: 1808})
Over-sampled dataset shape Counter({0: 43997, 1: 32997})


### 7a. Model with my_features

In [104]:
X = StandardScaler().fit_transform(X_randos)

In [105]:
X.shape

(76994, 3)

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y_randos, test_size=0.2, random_state=0)

In [107]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(61595, 3)
(15399, 3)
(61595,)
(15399,)


In [108]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [109]:
print(accuracy_score(y_test,predictions_LR))

0.5744528865510747


In [110]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.57      0.99      0.73      8840
           1       0.52      0.01      0.02      6559

    accuracy                           0.57     15399
   macro avg       0.55      0.50      0.37     15399
weighted avg       0.55      0.57      0.43     15399



### 7b.Model with all_features

In [111]:
y_all=df_all_feat["event_type"]
X_all=df_all_feat.drop(['event_type'], axis=1)

In [112]:
oversample =  RandomOverSampler(sampling_strategy=0.75)

In [113]:
X_randos_all, y_randos_all = oversample.fit_sample(X_all, y_all)

In [114]:
X_randos_all.shape,y_randos_all.shape

((76994, 514), (76994,))

In [115]:
print('Original dataset shape {}'.format(Counter(y_all)))
print('Over-sampled dataset shape {}'.format(Counter(y_randos_all)))

Original dataset shape Counter({0: 43997, 1: 1808})
Over-sampled dataset shape Counter({0: 43997, 1: 32997})


In [116]:
X_scaled_all = StandardScaler().fit_transform(X_randos_all)

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_all, y_randos_all, test_size=0.2, random_state=0)

In [118]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(61595, 514)
(15399, 514)
(61595,)
(15399,)


In [119]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [120]:
print(accuracy_score(y_test,predictions_LR))

0.6347165400350672


In [121]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.66      0.76      0.70      8840
           1       0.59      0.47      0.52      6559

    accuracy                           0.63     15399
   macro avg       0.62      0.61      0.61     15399
weighted avg       0.63      0.63      0.63     15399



**END OF NOTEBOOK4**