In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, f1_score, plot_roc_curve, precision_recall_curve
from xgboost import XGBClassifier
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns',None)

In [2]:
def label(date,hour):
    return (f'{date}_{hour}')

label('2010-10-13', '10')

'2010-10-13_10'

In [3]:
train = pd.read_csv('data_Train.csv')
train['s2'] = train['Ids'].apply(lambda x: x.split('_')[0])
train['date'] = train['Ids'].apply(lambda x: x.split('_')[1])
train['date'] = train['date'].apply(lambda x: pd.to_datetime(x))
train['hari'] = train['date'].apply(lambda x: x.day_name())
train['hour'] = train['Ids'].apply(lambda x: x.split('_')[-1])
train['day_hour_x'] = [label(row[0], row[1]) for row in zip(train['hari'], train['hour'])]
train['id_day'] = [label(row[0], row[1]) for row in zip(train['s2'], train['hari'])]
train['id_day_hour'] = [label(row[0], row[1]) for row in zip(train['id_day'], train['hour'])]

In [4]:
train.head()

Unnamed: 0,Ids,Labels,s2,date,hari,hour,day_hour_x,id_day,id_day_hour
0,2e69e9384_2020-10-06_13,True,2e69e9384,2020-10-06,Tuesday,13,Tuesday_13,2e69e9384_Tuesday,2e69e9384_Tuesday_13
1,2e6992c7c_2020-10-02_17,True,2e6992c7c,2020-10-02,Friday,17,Friday_17,2e6992c7c_Friday,2e6992c7c_Friday_17
2,2e69ef474_2020-09-13_19,True,2e69ef474,2020-09-13,Sunday,19,Sunday_19,2e69ef474_Sunday,2e69ef474_Sunday_19
3,2e69c5fd4_2020-10-10_15,True,2e69c5fd4,2020-10-10,Saturday,15,Saturday_15,2e69c5fd4_Saturday,2e69c5fd4_Saturday_15
4,2e6992134_2020-09-12_11,True,2e6992134,2020-09-12,Saturday,11,Saturday_11,2e6992134_Saturday,2e6992134_Saturday_11


In [5]:
train['Labels'].replace({
    True:1,
    False:0
}, inplace=True)
train['Labels']=train['Labels'].astype('int64')

In [6]:
train.drop(['s2', 'date', 'hour','Ids','id_day','hari','id_day','id_day_hour'], axis=1, inplace=True)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71336 entries, 0 to 71335
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Labels      71336 non-null  int64 
 1   day_hour_x  71336 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [8]:
train['Labels'].value_counts(normalize=True)

1    0.661419
0    0.338581
Name: Labels, dtype: float64

In [9]:
train

Unnamed: 0,Labels,day_hour_x
0,1,Tuesday_13
1,1,Friday_17
2,1,Sunday_19
3,1,Saturday_15
4,1,Saturday_11
...,...,...
71331,0,Monday_10
71332,1,Tuesday_12
71333,1,Saturday_14
71334,0,Wednesday_9


In [10]:
df = pd.read_csv('irregularities.csv')
df = df.drop(columns="id"). drop_duplicates()

In [11]:
df.drop_duplicates(subset='detection_date_millis',keep='last',inplace=True)

In [12]:
df['detection_date_millis'] = pd.to_datetime(df['detection_date_millis'], unit='ms')
df['update_date_millis'] = pd.to_datetime(df['update_date_millis'], unit='ms')
df['hari'] = df['detection_date_millis'].dt.day_name()
df['hour'] = df['detection_date_millis'].dt.hour
df['date'] = df['detection_date_millis'].dt.date
df['day_hour']=[label(row[0], row[1]) for row in zip(df['hari'], df['hour'])]
df['gabungan']=[label(row[0], row[1]) for row in zip(df['s2token_center'], df['date'])]
df['supergabungan']=[label(row[0], row[1]) for row in zip(df['gabungan'], df['hour'])]
df.head()

Unnamed: 0,detection_date_millis,update_date_millis,street,city,is_highway,line,s2id_center,s2token_center,speed,regular_speed,delay_seconds,seconds,length,trend,type,severity,jam_level,drivers_count,alerts_count,n_thumbs_up,hari,hour,date,day_hour,gabungan,supergabungan
95,2020-10-30 03:49:58.572,2020-10-30 03:50:04.624,Dr Setiabudi,Bandung,t,"{""line"": [{""x"": 107.596684, ""y"": -6.851308}, {...",3344176481801601024,2e68e6cac,4.96,30.24,869,1009,1392,1,Large,5,4,16,0,0,Friday,3,2020-10-30,Friday_3,2e68e6cac_2020-10-30,2e68e6cac_2020-10-30_3
340,2020-09-10 10:26:05.313,2020-09-10 10:26:18.245,Hankam Raya,Bekasi,f,"{""line"": [{""x"": 106.920151, ""y"": -6.294828}, {...",3344365451772690432,2e6992a8c,6.98,15.06,654,860,1670,1,Medium,5,3,10,0,0,Thursday,10,2020-09-10,Thursday_10,2e6992a8c_2020-09-10,2e6992a8c_2020-09-10_10
371,2020-10-27 05:02:14.741,2020-10-27 05:02:20.154,Jenderal Ibrahim Adjie,Bandung,t,"{""line"": [{""x"": 107.642349, ""y"": -6.939236}, {...",3344178272802963456,2e68e86bc,2.98,9.37,656,812,674,1,Small,5,4,5,0,0,Tuesday,5,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27,2e68e86bc_2020-10-27_5
549,2020-11-13 11:53:55.661,2020-11-13 11:54:00.371,Narogong Raya,Bekasi,t,"{""line"": [{""x"": 106.977703, ""y"": -6.346882}, {...",3344365033013379072,2e6992474,14.59,25.48,913,1441,5844,1,Medium,5,3,18,0,0,Friday,11,2020-11-13,Friday_11,2e6992474_2020-11-13,2e6992474_2020-11-13_11
550,2020-10-08 06:50:51.765,2020-10-08 06:52:12.441,Soekarno-Hatta (Jalur Lambat),Bandung,t,"{""line"": [{""x"": 107.650351, ""y"": -6.942854}, {...",3344177899140808704,2e68e814c,4.46,32.62,511,564,700,0,Medium,5,4,2,0,0,Thursday,6,2020-10-08,Thursday_6,2e68e814c_2020-10-08,2e68e814c_2020-10-08_6


In [13]:
df.nunique()

detection_date_millis    14830
update_date_millis       14727
street                     408
city                        21
is_highway                   2
line                      7998
s2id_center               1582
s2token_center            1582
speed                     1639
regular_speed             3270
delay_seconds             1753
seconds                   1925
length                    2654
trend                        3
type                         4
severity                     1
jam_level                    4
drivers_count               96
alerts_count                15
n_thumbs_up                 18
hari                         7
hour                        21
date                        84
day_hour                   129
gabungan                  8675
supergabungan            12450
dtype: int64

In [14]:
al = pd.read_csv('alerts.csv')
al = al.drop(columns="id"). drop_duplicates()

In [15]:
al.rename({'s2token_15':'s2token_center'},axis=1,inplace=True)
al.rename({'s2id_15':'s2id_center'},axis=1,inplace=True)

In [16]:
al.drop_duplicates(subset='pub_millis',keep='last',inplace=True)

In [17]:
al['pub_millis'] = pd.to_datetime(al['pub_millis'], unit='ms')
al['hari'] = al['pub_millis'].dt.day_name()
al['hour'] = al['pub_millis'].dt.hour
al['date'] = al['pub_millis'].dt.date
al['day_hour']=[label(row[0], row[1]) for row in zip(al['hari'], al['hour'])]
al['gabungan']=[label(row[0], row[1]) for row in zip(al['s2token_center'], al['date'])]
al['supergabungan']=[label(row[0], row[1]) for row in zip(al['gabungan'], al['hour'])]
al.head()

Unnamed: 0,pub_millis,s2id_center,s2token_center,road_type,street,city,magvar,reliability,report_description,report_rating,confidence,type,subtype,report_by_municipality_user,n_thumbs_up,longitude,latitude,hari,hour,date,day_hour,gabungan,supergabungan
4,2020-10-12 00:59:54,3344466709921660928,2e69eec0c,2,Tanjakan Kembar,Depok,310,5,,3,0,JAM,JAM_HEAVY_TRAFFIC,,,106.79395,-6.365677,Monday,0,2020-10-12,Monday_0,2e69eec0c_2020-10-12,2e69eec0c_2020-10-12_0
5,2020-10-08 02:00:52,3344177632852836352,2e68e7d6c,2,Gatot Subroto,Bandung,0,6,,0,0,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,107.627686,-6.924535,Thursday,2,2020-10-08,Thursday_2,2e68e7d6c_2020-10-08,2e68e7d6c_2020-10-08_2
9,2020-11-15 19:10:06,3344363254896918528,2e6990a94,1,,Cikarang,207,5,,0,0,WEATHERHAZARD,HAZARD_ON_ROAD_POT_HOLE,,,107.097238,-6.366284,Sunday,19,2020-11-15,Sunday_19,2e6990a94_2020-11-15,2e6990a94_2020-11-15_19
11,2020-09-16 02:00:52,3344175938488238080,2e68e64c4,7,Diponegoro,Bandung,0,6,,0,0,ROAD_CLOSED,ROAD_CLOSED_EVENT,,0.0,107.618191,-6.901316,Wednesday,2,2020-09-16,Wednesday_2,2e68e64c4_2020-09-16,2e68e64c4_2020-09-16_2
14,2020-11-14 06:56:09,3344364199789723648,2e6991854,1,Cluster Citrine,Bekasi,97,5,,0,0,JAM,JAM_MODERATE_TRAFFIC,,,107.016695,-6.324721,Saturday,6,2020-11-14,Saturday_6,2e6991854_2020-11-14,2e6991854_2020-11-14_6


In [18]:
al.nunique()

pub_millis                     55193
s2id_center                     4317
s2token_center                  4317
road_type                         10
street                          2138
city                              20
magvar                           360
reliability                        6
report_description                 0
report_rating                      6
confidence                         6
type                               4
subtype                           26
report_by_municipality_user        0
n_thumbs_up                        1
longitude                      45708
latitude                       44942
hari                               7
hour                              24
date                              84
day_hour                         168
gabungan                       32952
supergabungan                  45924
dtype: int64

In [19]:
# df['date'] = df['date'].astype('str')
# df['hour'] = df['hour'].astype('str')

In [20]:
# df['id'] = [label(row[0], row[1]) for row in zip(df['date'], df['hour'])]

In [21]:
df_gabung = pd.merge(df, al.drop_duplicates(subset=['supergabungan'],keep='first'), how='left', on='supergabungan' )
df_gabung.head()

Unnamed: 0,detection_date_millis,update_date_millis,street_x,city_x,is_highway,line,s2id_center_x,s2token_center_x,speed,regular_speed,delay_seconds,seconds,length,trend,type_x,severity,jam_level,drivers_count,alerts_count,n_thumbs_up_x,hari_x,hour_x,date_x,day_hour_x,gabungan_x,supergabungan,pub_millis,s2id_center_y,s2token_center_y,road_type,street_y,city_y,magvar,reliability,report_description,report_rating,confidence,type_y,subtype,report_by_municipality_user,n_thumbs_up_y,longitude,latitude,hari_y,hour_y,date_y,day_hour_y,gabungan_y
0,2020-10-30 03:49:58.572,2020-10-30 03:50:04.624,Dr Setiabudi,Bandung,t,"{""line"": [{""x"": 107.596684, ""y"": -6.851308}, {...",3344176481801601024,2e68e6cac,4.96,30.24,869,1009,1392,1,Large,5,4,16,0,0,Friday,3,2020-10-30,Friday_3,2e68e6cac_2020-10-30,2e68e6cac_2020-10-30_3,2020-10-30 03:17:52,3.344176e+18,2e68e6cac,7.0,Dr Setiabudi,Bandung,354.0,6.0,,1.0,0.0,JAM,JAM_HEAVY_TRAFFIC,,,107.598537,-6.848655,Friday,3.0,2020-10-30,Friday_3,2e68e6cac_2020-10-30
1,2020-09-10 10:26:05.313,2020-09-10 10:26:18.245,Hankam Raya,Bekasi,f,"{""line"": [{""x"": 106.920151, ""y"": -6.294828}, {...",3344365451772690432,2e6992a8c,6.98,15.06,654,860,1670,1,Medium,5,3,10,0,0,Thursday,10,2020-09-10,Thursday_10,2e6992a8c_2020-09-10,2e6992a8c_2020-09-10_10,NaT,,,,,,,,,,,,,,,,,,,,,
2,2020-10-27 05:02:14.741,2020-10-27 05:02:20.154,Jenderal Ibrahim Adjie,Bandung,t,"{""line"": [{""x"": 107.642349, ""y"": -6.939236}, {...",3344178272802963456,2e68e86bc,2.98,9.37,656,812,674,1,Small,5,4,5,0,0,Tuesday,5,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27,2e68e86bc_2020-10-27_5,2020-10-27 05:41:49,3.344178e+18,2e68e86bc,7.0,Jenderal Ibrahim Adjie,Bandung,183.0,5.0,,0.0,0.0,JAM,JAM_HEAVY_TRAFFIC,,,107.642123,-6.942849,Tuesday,5.0,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27
3,2020-11-13 11:53:55.661,2020-11-13 11:54:00.371,Narogong Raya,Bekasi,t,"{""line"": [{""x"": 106.977703, ""y"": -6.346882}, {...",3344365033013379072,2e6992474,14.59,25.48,913,1441,5844,1,Medium,5,3,18,0,0,Friday,11,2020-11-13,Friday_11,2e6992474_2020-11-13,2e6992474_2020-11-13_11,NaT,,,,,,,,,,,,,,,,,,,,,
4,2020-10-08 06:50:51.765,2020-10-08 06:52:12.441,Soekarno-Hatta (Jalur Lambat),Bandung,t,"{""line"": [{""x"": 107.650351, ""y"": -6.942854}, {...",3344177899140808704,2e68e814c,4.46,32.62,511,564,700,0,Medium,5,4,2,0,0,Thursday,6,2020-10-08,Thursday_6,2e68e814c_2020-10-08,2e68e814c_2020-10-08_6,NaT,,,,,,,,,,,,,,,,,,,,,


In [22]:
df_gabung.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14830 entries, 0 to 14829
Data columns (total 48 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   detection_date_millis        14830 non-null  datetime64[ns]
 1   update_date_millis           14830 non-null  datetime64[ns]
 2   street_x                     14724 non-null  object        
 3   city_x                       14830 non-null  object        
 4   is_highway                   14830 non-null  object        
 5   line                         14830 non-null  object        
 6   s2id_center_x                14830 non-null  int64         
 7   s2token_center_x             14830 non-null  object        
 8   speed                        14830 non-null  float64       
 9   regular_speed                14830 non-null  float64       
 10  delay_seconds                14830 non-null  int64         
 11  seconds                      14830 non-nu

In [23]:
df_gabung.isna().sum().sort_values(ascending=False)

report_description             14830
report_by_municipality_user    14830
n_thumbs_up_y                  14343
subtype                        12371
street_y                       12283
day_hour_y                     12207
pub_millis                     12207
s2id_center_y                  12207
s2token_center_y               12207
road_type                      12207
city_y                         12207
magvar                         12207
reliability                    12207
gabungan_y                     12207
report_rating                  12207
latitude                       12207
confidence                     12207
hour_y                         12207
hari_y                         12207
date_y                         12207
longitude                      12207
type_y                         12207
street_x                         106
length                             0
update_date_millis                 0
city_x                             0
is_highway                         0
l

In [24]:
df_gabung.drop(['report_by_municipality_user','report_description','pub_millis','s2id_center_y','s2token_center_y','gabungan_y','latitude','longitude','update_date_millis','s2id_center_x','s2token_center_x','line','gabungan_x','detection_date_millis'],axis=1,inplace=True)

In [25]:
df_gabung.nunique()

street_x           408
city_x              21
is_highway           2
speed             1639
regular_speed     3270
delay_seconds     1753
seconds           1925
length            2654
trend                3
type_x               4
severity             1
jam_level            4
drivers_count       96
alerts_count        15
n_thumbs_up_x       18
hari_x               7
hour_x              21
date_x              84
day_hour_x         129
supergabungan    12450
road_type            8
street_y           337
city_y              17
magvar             341
reliability          6
report_rating        6
confidence           3
type_y               4
subtype             18
n_thumbs_up_y        1
hari_y               7
hour_y              19
date_y              81
day_hour_y         109
dtype: int64

In [26]:
df_gabung

Unnamed: 0,street_x,city_x,is_highway,speed,regular_speed,delay_seconds,seconds,length,trend,type_x,severity,jam_level,drivers_count,alerts_count,n_thumbs_up_x,hari_x,hour_x,date_x,day_hour_x,supergabungan,road_type,street_y,city_y,magvar,reliability,report_rating,confidence,type_y,subtype,n_thumbs_up_y,hari_y,hour_y,date_y,day_hour_y
0,Dr Setiabudi,Bandung,t,4.96,30.24,869,1009,1392,1,Large,5,4,16,0,0,Friday,3,2020-10-30,Friday_3,2e68e6cac_2020-10-30_3,7.0,Dr Setiabudi,Bandung,354.0,6.0,1.0,0.0,JAM,JAM_HEAVY_TRAFFIC,,Friday,3.0,2020-10-30,Friday_3
1,Hankam Raya,Bekasi,f,6.98,15.06,654,860,1670,1,Medium,5,3,10,0,0,Thursday,10,2020-09-10,Thursday_10,2e6992a8c_2020-09-10_10,,,,,,,,,,,,,,
2,Jenderal Ibrahim Adjie,Bandung,t,2.98,9.37,656,812,674,1,Small,5,4,5,0,0,Tuesday,5,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27_5,7.0,Jenderal Ibrahim Adjie,Bandung,183.0,5.0,0.0,0.0,JAM,JAM_HEAVY_TRAFFIC,,Tuesday,5.0,2020-10-27,Tuesday_5
3,Narogong Raya,Bekasi,t,14.59,25.48,913,1441,5844,1,Medium,5,3,18,0,0,Friday,11,2020-11-13,Friday_11,2e6992474_2020-11-13_11,,,,,,,,,,,,,,
4,Soekarno-Hatta (Jalur Lambat),Bandung,t,4.46,32.62,511,564,700,0,Medium,5,4,2,0,0,Thursday,6,2020-10-08,Thursday_6,2e68e814c_2020-10-08_6,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14825,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),f,2.76,36.20,1237,1318,1012,1,Large,5,4,4,0,0,Monday,1,2020-10-26,Monday_1,2e698fddc_2020-10-26_1,,,,,,,,,,,,,,
14826,Exit Baranangsiang,Bogor,f,4.92,62.00,384,418,572,0,Small,5,4,12,0,0,Thursday,4,2020-10-29,Thursday_4,2e69c5d94_2020-10-29_4,,,,,,,,,,,,,,
14827,N6 Cinangka Raya,Depok,t,17.91,32.05,597,906,4512,-1,Small,5,3,8,0,0,Saturday,12,2020-10-17,Saturday_12,2e69ef4a4_2020-10-17_12,,,,,,,,,,,,,,
14828,Dr Djundjunan,Bandung,t,5.23,11.03,1155,1280,1862,1,Small,5,4,22,0,0,Saturday,3,2020-10-24,Saturday_3,2e68e665c_2020-10-24_3,,,,,,,,,,,,,,


In [27]:
df_gabung['street_x'].fillna(df_gabung['street_y'],inplace=True)
df_gabung['city_x'].fillna(df_gabung['city_y'],inplace=True)
df_gabung['n_thumbs_up_x'].fillna(df_gabung['n_thumbs_up_y'],inplace=True)
df_gabung['hari_x'].fillna(df_gabung['hari_y'],inplace=True)
df_gabung['hour_x'].fillna(df_gabung['hour_y'],inplace=True)
df_gabung['date_x'].fillna(df_gabung['date_y'],inplace=True)
df_gabung['day_hour_x'].fillna(df_gabung['day_hour_y'],inplace=True)
df_gabung.drop(['street_y','city_y','n_thumbs_up_y','hari_y','hour_y','date_y','day_hour_y'],axis=1,inplace=True)

In [28]:
df_gabung

Unnamed: 0,street_x,city_x,is_highway,speed,regular_speed,delay_seconds,seconds,length,trend,type_x,severity,jam_level,drivers_count,alerts_count,n_thumbs_up_x,hari_x,hour_x,date_x,day_hour_x,supergabungan,road_type,magvar,reliability,report_rating,confidence,type_y,subtype
0,Dr Setiabudi,Bandung,t,4.96,30.24,869,1009,1392,1,Large,5,4,16,0,0,Friday,3,2020-10-30,Friday_3,2e68e6cac_2020-10-30_3,7.0,354.0,6.0,1.0,0.0,JAM,JAM_HEAVY_TRAFFIC
1,Hankam Raya,Bekasi,f,6.98,15.06,654,860,1670,1,Medium,5,3,10,0,0,Thursday,10,2020-09-10,Thursday_10,2e6992a8c_2020-09-10_10,,,,,,,
2,Jenderal Ibrahim Adjie,Bandung,t,2.98,9.37,656,812,674,1,Small,5,4,5,0,0,Tuesday,5,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27_5,7.0,183.0,5.0,0.0,0.0,JAM,JAM_HEAVY_TRAFFIC
3,Narogong Raya,Bekasi,t,14.59,25.48,913,1441,5844,1,Medium,5,3,18,0,0,Friday,11,2020-11-13,Friday_11,2e6992474_2020-11-13_11,,,,,,,
4,Soekarno-Hatta (Jalur Lambat),Bandung,t,4.46,32.62,511,564,700,0,Medium,5,4,2,0,0,Thursday,6,2020-10-08,Thursday_6,2e68e814c_2020-10-08_6,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14825,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),f,2.76,36.20,1237,1318,1012,1,Large,5,4,4,0,0,Monday,1,2020-10-26,Monday_1,2e698fddc_2020-10-26_1,,,,,,,
14826,Exit Baranangsiang,Bogor,f,4.92,62.00,384,418,572,0,Small,5,4,12,0,0,Thursday,4,2020-10-29,Thursday_4,2e69c5d94_2020-10-29_4,,,,,,,
14827,N6 Cinangka Raya,Depok,t,17.91,32.05,597,906,4512,-1,Small,5,3,8,0,0,Saturday,12,2020-10-17,Saturday_12,2e69ef4a4_2020-10-17_12,,,,,,,
14828,Dr Djundjunan,Bandung,t,5.23,11.03,1155,1280,1862,1,Small,5,4,22,0,0,Saturday,3,2020-10-24,Saturday_3,2e68e665c_2020-10-24_3,,,,,,,


In [29]:
df_gabung['speed_decrease_ratio']=df_gabung['speed']/df_gabung['regular_speed']
df_gabung.drop(['speed','regular_speed'],axis=1,inplace=True)

### Highway

In [30]:
true = list(df_gabung[df_gabung['is_highway']=='t']['street_x'].value_counts().index)
false = list(df_gabung[df_gabung['is_highway']=='f']['street_x'].value_counts().index)

In [31]:
df_gabung.loc[df_gabung['street_x'].isin(true),'is_highway']='t'
df_gabung.loc[df_gabung['street_x'].isin(false),'is_highway']='f'

In [32]:
df_gabung['is_highway'].fillna(df_gabung['is_highway'].mode()[0],inplace=True)

In [33]:
df_gabung['is_highway']=np.where(df_gabung['is_highway']=='t',1,0)

### Road Type

In [34]:
df_gabung['main_street']=np.where(df_gabung['road_type']=='1.0',1,0)

  res_values = method(rvalues)


### Type

In [35]:
df_gabung['badweather']=np.where(df_gabung['type_y']=='WEATHERHAZARD',1,0)
df_gabung['accident']=np.where(df_gabung['type_y']=='ACCIDENT',1,0)

### Relialibility

In [36]:
df_gabung['rely']=np.where((df_gabung['reliability']==5) | (df_gabung['reliability']==6),1,0)

### Report Rating

In [37]:
df_gabung['report']=np.where(df_gabung['report_rating']==0,1,0)

### Weekend

In [38]:
df_gabung['weekend']=np.where((df_gabung['hari_x']=='Friday') | (df_gabung['hari_x']=='Saturday') | (df_gabung['hari_x']=='Sunday'),1,0)

In [39]:
df_gabung.drop(['road_type','magvar','reliability','report_rating','confidence','type_y','subtype'],axis=1,inplace=True)

### Trend

In [40]:
df_gabung['trend_move']=np.where((df_gabung['trend']==1) | (df_gabung['trend']==-1),1,0)

### Type

In [41]:
df_gabung['irr_scale']=np.where((df_gabung['type_x']=='Medium') | (df_gabung['type_x']=='Small'),1,0)

### Jam Level

In [42]:
df_gabung['jam']=np.where((df_gabung['jam_level']==3) | (df_gabung['jam_level']==4),1,0)

### N-thumbs up

In [43]:
df_gabung['thumbs']=np.where(df_gabung['n_thumbs_up_x']==0,1,0)

### Alerts count

In [44]:
df_gabung['alerts']=np.where(df_gabung['alerts_count']==0,1,0)

### Busy Hours

In [45]:
df_gabung['busy_hour']=np.where((df_gabung['hour_x']==8) | (df_gabung['hour_x']==9) | (df_gabung['hour_x']==10) |(df_gabung['hour_x']==16) |(df_gabung['hour_x']==17) |(df_gabung['hour_x']==18) |(df_gabung['hour_x']==19) |(df_gabung['hour_x']==20),1,0)

In [46]:
df_gabung.drop(['delay_seconds','seconds','length','speed_decrease_ratio','severity','drivers_count','trend','type_x','jam_level','alerts_count','n_thumbs_up_x','hour_x','hari_x'],axis=1,inplace=True)

In [47]:
df_gabung.to_csv('almost_clean.csv',index=False)

In [48]:
# df_gabung.drop(['hari_x','hour_x','date_x','supergabungan'],axis=1,inplace=True)

In [49]:
df_gabung.isna().sum()

street_x         98
city_x            0
is_highway        0
date_x            0
day_hour_x        0
supergabungan     0
main_street       0
badweather        0
accident          0
rely              0
report            0
weekend           0
trend_move        0
irr_scale         0
jam               0
thumbs            0
alerts            0
busy_hour         0
dtype: int64

In [50]:
df_gabung['street_x'].fillna(df_gabung['street_x'].mode()[0], inplace=True)

In [51]:
# df_gabung.fillna(method='ffill', inplace=True)

In [52]:
df_gabung.nunique()

street_x           409
city_x              21
is_highway           2
date_x              84
day_hour_x         129
supergabungan    12450
main_street          1
badweather           2
accident             2
rely                 2
report               2
weekend              2
trend_move           2
irr_scale            2
jam                  2
thumbs               2
alerts               2
busy_hour            2
dtype: int64

In [53]:
df_gabung

Unnamed: 0,street_x,city_x,is_highway,date_x,day_hour_x,supergabungan,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,Dr Setiabudi,Bandung,1,2020-10-30,Friday_3,2e68e6cac_2020-10-30_3,0,0,0,1,0,1,1,0,1,1,1,0
1,Hankam Raya,Bekasi,0,2020-09-10,Thursday_10,2e6992a8c_2020-09-10_10,0,0,0,0,0,0,1,1,1,1,1,1
2,Jenderal Ibrahim Adjie,Bandung,0,2020-10-27,Tuesday_5,2e68e86bc_2020-10-27_5,0,0,0,1,1,0,1,1,1,1,1,0
3,Narogong Raya,Bekasi,1,2020-11-13,Friday_11,2e6992474_2020-11-13_11,0,0,0,0,0,1,1,1,1,1,1,0
4,Soekarno-Hatta (Jalur Lambat),Bandung,1,2020-10-08,Thursday_6,2e68e814c_2020-10-08_6,0,0,0,0,0,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14825,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),0,2020-10-26,Monday_1,2e698fddc_2020-10-26_1,0,0,0,0,0,0,1,0,1,1,1,0
14826,Exit Baranangsiang,Bogor,0,2020-10-29,Thursday_4,2e69c5d94_2020-10-29_4,0,0,0,0,0,0,0,1,1,1,1,0
14827,N6 Cinangka Raya,Depok,1,2020-10-17,Saturday_12,2e69ef4a4_2020-10-17_12,0,0,0,0,0,1,1,1,1,1,1,0
14828,Dr Djundjunan,Bandung,1,2020-10-24,Saturday_3,2e68e665c_2020-10-24_3,0,0,0,0,0,1,1,1,1,1,1,0


In [54]:
df_gabung.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14830 entries, 0 to 14829
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   street_x       14830 non-null  object
 1   city_x         14830 non-null  object
 2   is_highway     14830 non-null  int32 
 3   date_x         14830 non-null  object
 4   day_hour_x     14830 non-null  object
 5   supergabungan  14830 non-null  object
 6   main_street    14830 non-null  int32 
 7   badweather     14830 non-null  int32 
 8   accident       14830 non-null  int32 
 9   rely           14830 non-null  int32 
 10  report         14830 non-null  int32 
 11  weekend        14830 non-null  int32 
 12  trend_move     14830 non-null  int32 
 13  irr_scale      14830 non-null  int32 
 14  jam            14830 non-null  int32 
 15  thumbs         14830 non-null  int32 
 16  alerts         14830 non-null  int32 
 17  busy_hour      14830 non-null  int32 
dtypes: int32(13), object(5)
me

In [55]:
df_gabung.drop_duplicates(subset='supergabungan',keep='last',inplace=True)

In [56]:
df_gabung.to_csv('siapdigabung.csv',index=False)

In [57]:
df_ready = pd.merge(train, df_gabung.drop_duplicates(subset=['day_hour_x'],keep='first'), how='left', on='day_hour_x' )
df_ready

Unnamed: 0,Labels,day_hour_x,street_x,city_x,is_highway,date_x,supergabungan,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,1,Tuesday_13,Narogong Raya,Bekasi,1.0,2020-11-10,2e699238c_2020-11-10_13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1,Friday_17,Margonda Raya,Depok,1.0,2020-11-20,2e69ebfb4_2020-11-20_17,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,1,Sunday_19,,,,,,,,,,,,,,,,,
3,1,Saturday_15,Jalan Cibadak,Bandung,0.0,2020-09-05,2e68e618c_2020-09-05_15,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
4,1,Saturday_11,Cinere Raya,Depok,1.0,2020-10-10,2e69ee464_2020-10-10_11,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71331,0,Monday_10,Leuwinanggung,Depok,0.0,2020-09-21,2e69eb25c_2020-09-21_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
71332,1,Tuesday_12,Sawangan Raya,Depok,1.0,2020-10-27,2e69e93a4_2020-10-27_12,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
71333,1,Saturday_14,Dipati Ukur,Bandung,0.0,2020-11-07,2e68e6564_2020-11-07_14,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
71334,0,Wednesday_9,N11 Soekarno-Hatta,Bandung,1.0,2020-10-21,2e68e85d4_2020-10-21_9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
df_ready.nunique()

Labels             2
day_hour_x       168
street_x          68
city_x            11
is_highway         2
date_x            60
supergabungan    129
main_street        1
badweather         1
accident           1
rely               2
report             2
weekend            2
trend_move         2
irr_scale          2
jam                2
thumbs             2
alerts             2
busy_hour          2
dtype: int64

In [59]:
df_ready

Unnamed: 0,Labels,day_hour_x,street_x,city_x,is_highway,date_x,supergabungan,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,1,Tuesday_13,Narogong Raya,Bekasi,1.0,2020-11-10,2e699238c_2020-11-10_13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1,Friday_17,Margonda Raya,Depok,1.0,2020-11-20,2e69ebfb4_2020-11-20_17,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,1,Sunday_19,,,,,,,,,,,,,,,,,
3,1,Saturday_15,Jalan Cibadak,Bandung,0.0,2020-09-05,2e68e618c_2020-09-05_15,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
4,1,Saturday_11,Cinere Raya,Depok,1.0,2020-10-10,2e69ee464_2020-10-10_11,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71331,0,Monday_10,Leuwinanggung,Depok,0.0,2020-09-21,2e69eb25c_2020-09-21_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
71332,1,Tuesday_12,Sawangan Raya,Depok,1.0,2020-10-27,2e69e93a4_2020-10-27_12,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
71333,1,Saturday_14,Dipati Ukur,Bandung,0.0,2020-11-07,2e68e6564_2020-11-07_14,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
71334,0,Wednesday_9,N11 Soekarno-Hatta,Bandung,1.0,2020-10-21,2e68e85d4_2020-10-21_9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [60]:
df_ready['Labels'].value_counts(normalize=True)

1    0.661419
0    0.338581
Name: Labels, dtype: float64

In [61]:
df_ready.corr()['Labels'].sort_values(ascending=False)

Labels         1.000000
weekend        0.088311
thumbs         0.036699
irr_scale      0.031885
alerts         0.012618
rely          -0.006925
jam           -0.006996
report        -0.010266
is_highway    -0.015074
trend_move    -0.018823
busy_hour     -0.036837
main_street         NaN
badweather          NaN
accident            NaN
Name: Labels, dtype: float64

In [62]:
df_ready.drop(['day_hour_x','date_x','supergabungan'],axis=1,inplace=True)


In [63]:
df_ready.reset_index(inplace=True,drop=True)

In [64]:
df_ready

Unnamed: 0,Labels,street_x,city_x,is_highway,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,1,Narogong Raya,Bekasi,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1,Margonda Raya,Depok,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,1,,,,,,,,,,,,,,,
3,1,Jalan Cibadak,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
4,1,Cinere Raya,Depok,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71331,0,Leuwinanggung,Depok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
71332,1,Sawangan Raya,Depok,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
71333,1,Dipati Ukur,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
71334,0,N11 Soekarno-Hatta,Bandung,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [65]:
df_ready.isna().sum()

Labels             0
street_x       25738
city_x         25738
is_highway     25738
main_street    25738
badweather     25738
accident       25738
rely           25738
report         25738
weekend        25738
trend_move     25738
irr_scale      25738
jam            25738
thumbs         25738
alerts         25738
busy_hour      25738
dtype: int64

In [67]:
df_ready['street_x'].fillna(df_ready['street_x'].mode()[0], inplace=True)
df_ready['city_x'].fillna(df_ready['city_x'].mode()[0], inplace=True)
df_ready['is_highway'].fillna(df_ready['is_highway'].mode()[0], inplace=True)
df_ready['main_street'].fillna(df_ready['main_street'].mode()[0], inplace=True)
df_ready['badweather'].fillna(df_ready['badweather'].mode()[0], inplace=True)
df_ready['accident'].fillna(df_ready['accident'].mode()[0], inplace=True)
df_ready['rely'].fillna(df_ready['rely'].mode()[0], inplace=True)
df_ready['report'].fillna(df_ready['report'].mode()[0], inplace=True)
df_ready['weekend'].fillna(df_ready['weekend'].mode()[0], inplace=True)
df_ready['trend_move'].fillna(df_ready['trend_move'].mode()[0], inplace=True)
df_ready['irr_scale'].fillna(df_ready['irr_scale'].mode()[0], inplace=True)
df_ready['jam'].fillna(df_ready['jam'].mode()[0], inplace=True)
df_ready['thumbs'].fillna(df_ready['thumbs'].mode()[0], inplace=True)
df_ready['alerts'].fillna(df_ready['alerts'].mode()[0], inplace=True)
df_ready['busy_hour'].fillna(df_ready['busy_hour'].mode()[0], inplace=True)

In [68]:
# df_ready.fillna(method='ffill', inplace=True)

In [69]:
df_ready.nunique()

Labels          2
street_x       68
city_x         11
is_highway      2
main_street     1
badweather      1
accident        1
rely            2
report          2
weekend         2
trend_move      2
irr_scale       2
jam             2
thumbs          2
alerts          2
busy_hour       2
dtype: int64

In [None]:
# onehot_pipeline = Pipeline([
#     ('onehot', OneHotEncoder(drop='first'))
# ])

# ordinal_pipeline = Pipeline([
#     ('ordinal', OrdinalEncoder())
# ])

# scaler_pipeline = Pipeline([
#     ('scaler', scaler)
# ])

# transformer = ColumnTransformer([
# #     ('imputer',SimpleImputer(strategy='most_frequent'),['street']),
#     ('scaler', scaler_pipeline, ['delay_seconds', 'length', 'speed delay','seconds']),
#     ('ordinal',ordinal_pipeline,['city','date','street','type','is_highway'])
# #     ('one_hot', onehot_pipeline, ['is_highway'])
# ],remainder='passthrough')


In [None]:
# transformer.transformers_

In [None]:
# haha = ['delay_seconds','length','speed_delay','seconds','city','date','street','type','is_highway','Labels','jam_level','trend','drive_count','alerts_count','n_thumbs_up','hour','is_weekend']

In [None]:
# df_clean_preprocessed = transformer.fit_transform(df_clean)
# df_clean_preprocessed = pd.DataFrame(df_clean_preprocessed)
# df_clean_preprocessed.columns = haha
# df_clean_preprocessed

In [None]:
# plt.figure(figsize=(30,15))
# plt.subplot(331)
# plt.title('delay_seconds')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['delay_seconds'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['delay_seconds'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(332)
# plt.title('length')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['length'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['length'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(333)
# plt.title('speed_delay')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['speed_delay'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['speed_delay'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(334)
# plt.title('city')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['city'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['city'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(335)
# plt.title('date')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['date'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['date'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(336)
# plt.title('street')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['street'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['street'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(337)
# plt.title('type')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['type'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['type'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(338)
# plt.title('is_highway')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['is_highway'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['is_highway'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(339)
# plt.title('jam_level')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['jam_level'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['jam_level'], label='Ngga Macet')
# plt.legend(loc= 'upper left')




# plt.show()

In [None]:
# plt.figure(figsize=(30,15))
# plt.subplot(231)
# plt.title('trend')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['trend'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['trend'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(232)
# plt.title('drive_count')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['drive_count'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['drive_count'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(233)
# plt.title('alerts_count')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['alerts_count'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['alerts_count'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(234)
# plt.title('n_thumbs_up')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['n_thumbs_up'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['n_thumbs_up'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(235)
# plt.title('hour')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['hour'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['hour'], label='Ngga Macet')
# plt.legend(loc= 'upper left')

# plt.subplot(236)
# plt.title('is_weekend')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 1]['is_weekend'], label='Macet')
# sns.distplot(df_clean_preprocessed[df_clean_preprocessed['Labels'] == 0]['is_weekend'], label='Ngga Macet')
# plt.legend(loc= 'upper left')


# plt.show()

In [None]:
# df_clean_preprocessed.corr()['Labels'].sort_values(ascending=False)

In [70]:
X = df_ready.drop(['Labels'], axis=1)
y = df_ready['Labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [71]:
X_train

Unnamed: 0,street_x,city_x,is_highway,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
25492,Jenderal Sudirman,Bekasi,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
49861,Dipati Ukur,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
65320,N1 RE Martadinata,Cikarang,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
51477,KH Noer Ali,Bekasi,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
54433,Trans Yogi,Bekasi,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,Dipati Ukur,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6265,N1 RE Martadinata,Cikarang,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
54886,Hankam Raya,Bekasi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
860,Dipati Ukur,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [72]:
rf = RandomForestClassifier(random_state=42)
xg = XGBClassifier(random_state=42)
logreg = LogisticRegression(random_state=42)
dtc = DecisionTreeClassifier(random_state=42)
nb = GaussianNB()
# scaler = MinMa()
smote = SMOTE(random_state=42)

In [73]:
transformer = ColumnTransformer([
#     ('one_hot', OneHotEncoder(), ['hari_x']),
    ('binary',ce.BinaryEncoder(),['street_x','city_x'])
],remainder='passthrough')


In [74]:
pipeline_rf = Pipeline([   
    ('transformer', transformer),
    ('smote',smote),
    ('clf', rf)
])

pipeline_xg = Pipeline([   
    ('transformer', transformer),
    ('smote',smote),
    ('clf', xg)
])

pipeline_logreg = Pipeline([   
    ('transformer', transformer),
    ('smote',smote),
    ('clf', logreg)
])

pipeline_dtc = Pipeline([   
    ('transformer', transformer),
    ('smote',smote),
    ('clf', dtc)
])

pipeline_nb = Pipeline([   
    ('transformer', transformer),
    ('smote',smote),
    ('clf', nb)
])

In [75]:
skf = StratifiedKFold(n_splits=5, random_state=42)

rf_score = cross_val_score(pipeline_rf, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1, verbose=1)
xg_score = cross_val_score(pipeline_xg, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1, verbose=1)
logreg_score = cross_val_score(pipeline_logreg, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1, verbose=1)
dtc_score = cross_val_score(pipeline_dtc, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1, verbose=1)
nb_score = cross_val_score(pipeline_nb, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   41.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   24.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   22.3s finished


In [76]:
print('RF Score:', rf_score)
print('XG Score:', xg_score)
print('Logreg Score:', logreg_score)
print('DTC Score:', dtc_score)
print('NB Score:', nb_score)

RF Score: [0.75465354 0.75349735 0.75498918 0.75169246 0.76045075 0.75565176
 0.76085656 0.75550181 0.76096859 0.7474772 ]
XG Score: [0.75474417 0.7523291  0.75498918 0.75027289 0.76151336 0.7555021
 0.76289398 0.75643038 0.76096859 0.7474772 ]
Logreg Score: [0.67293991 0.68853755 0.69767442 0.68885104 0.71389857 0.69926778
 0.69344609 0.69610661 0.69664972 0.67817313]
DTC Score: [0.75474417 0.75181598 0.75453999 0.75054545 0.76151336 0.7555609
 0.76004813 0.7566393  0.76096859 0.74741578]
NB Score: [0.76955603 0.78133537 0.79200086 0.77987147 0.78764396 0.78843029
 0.78632664 0.78876283 0.78703404 0.7807008 ]


In [77]:
print('RF Score:', rf_score.mean())
print('XG Score:', xg_score.mean())
print('Logreg Score:', logreg_score.mean())
print('DTC Score:', dtc_score.mean())
print('NB Score:', nb_score.mean())

RF Score: 0.7555739192820092
XG Score: 0.7557120969491857
Logreg Score: 0.6925544818232452
DTC Score: 0.7553791671205747
NB Score: 0.7841662278292703


## Tuning

In [78]:
pipeline_nb.get_params()

{'memory': None,
 'steps': [('transformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('binary', BinaryEncoder(),
                                    ['street_x', 'city_x'])])),
  ('smote', SMOTE(random_state=42)),
  ('clf', GaussianNB())],
 'verbose': False,
 'transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('binary', BinaryEncoder(),
                                  ['street_x', 'city_x'])]),
 'smote': SMOTE(random_state=42),
 'clf': GaussianNB(),
 'transformer__n_jobs': None,
 'transformer__remainder': 'passthrough',
 'transformer__sparse_threshold': 0.3,
 'transformer__transformer_weights': None,
 'transformer__transformers': [('binary',
   BinaryEncoder(),
   ['street_x', 'city_x'])],
 'transformer__verbose': False,
 'transformer__binary': BinaryEncoder(),
 'transformer__binary__cols': None,
 'transformer__binary__drop_invariant': False,
 'transformer__binary__handle_missing': 'value',
 'transformer_

In [79]:
hyperparam_space_nb = {
    'smote__k_neighbors':[1,5,10,30,50],
    'clf__priors':[(0.2,0.8),(0.3,0.7),(0.4,0.6)]
    
}


gridsearch_nb = GridSearchCV(pipeline_nb, hyperparam_space_nb, scoring='f1', n_jobs=-1, cv=skf,verbose=1)
gridsearch_nb.fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('binary',
                                                                         BinaryEncoder(),
                                                                         ['street_x',
                                                                          'city_x'])])),
                                       ('smote', SMOTE(random_state=42)),
                                       ('clf', GaussianNB())]),
             n_jobs=-1,
             param_grid={'clf__priors': [(0.2, 0.8), (0.3, 0.7), (0.4, 0.6)],
                         'smote__k_neighbors': [1, 5, 10, 30, 50]},
             scoring='f1', verbose=1)

In [80]:
gridsearch_nb.best_params_

{'clf__priors': (0.2, 0.8), 'smote__k_neighbors': 10}

In [81]:
report_grid = pd.DataFrame(gridsearch_nb.cv_results_).sort_values('rank_test_score', ascending=True)
report_grid.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__priors,param_smote__k_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,25.566013,0.761555,0.034961,0.006276,"(0.2, 0.8)",10,"{'clf__priors': (0.2, 0.8), 'smote__k_neighbor...",0.780355,0.795847,0.793059,0.795614,0.795599,0.796408,0.796408,0.796494,0.795851,0.791782,0.793742,0.004703,1
1,26.210887,1.595076,0.037544,0.009947,"(0.2, 0.8)",5,"{'clf__priors': (0.2, 0.8), 'smote__k_neighbor...",0.78044,0.791698,0.796328,0.795845,0.791751,0.797039,0.796535,0.796494,0.79624,0.791782,0.793415,0.004811,2
6,25.114459,0.563287,0.033579,0.005553,"(0.3, 0.7)",5,"{'clf__priors': (0.3, 0.7), 'smote__k_neighbor...",0.778009,0.790508,0.796018,0.792473,0.791919,0.793269,0.796827,0.796494,0.795726,0.791782,0.792302,0.00522,3
0,27.07563,1.01962,0.039675,0.006837,"(0.2, 0.8)",1,"{'clf__priors': (0.2, 0.8), 'smote__k_neighbor...",0.78078,0.791698,0.796018,0.787069,0.791919,0.796616,0.788705,0.788197,0.79631,0.796946,0.791426,0.005028,4
4,32.567705,0.991971,0.039376,0.006943,"(0.2, 0.8)",50,"{'clf__priors': (0.2, 0.8), 'smote__k_neighbor...",0.779676,0.793948,0.795897,0.780456,0.78971,0.793269,0.797161,0.795044,0.793228,0.787076,0.790547,0.005929,5


In [82]:
gridsearch_nb.best_estimator_

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', BinaryEncoder(),
                                                  ['street_x', 'city_x'])])),
                ('smote', SMOTE(k_neighbors=10, random_state=42)),
                ('clf', GaussianNB(priors=(0.2, 0.8)))])

In [83]:
nb = pipeline_nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test) 
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.41      0.06      0.11      4811
           1       0.67      0.95      0.78      9457

    accuracy                           0.65     14268
   macro avg       0.54      0.51      0.45     14268
weighted avg       0.58      0.65      0.56     14268



In [84]:
f1_score(y_test,y_pred_nb)

0.7849279902536658

In [85]:
nb_tuned = gridsearch_nb.best_estimator_.fit(X_train, y_train)
y_pred_nb_tuned = nb_tuned.predict(X_test)
print(classification_report(y_test, y_pred_nb_tuned))

              precision    recall  f1-score   support

           0       0.59      0.01      0.02      4811
           1       0.66      1.00      0.80      9457

    accuracy                           0.66     14268
   macro avg       0.63      0.50      0.41     14268
weighted avg       0.64      0.66      0.54     14268



In [119]:
y_pred_nb_tuned

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [86]:
f1_score(y_test,y_pred_nb_tuned)

0.7971382609431885

In [87]:
gridsearch_nb.best_estimator_.fit(X,y)

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', BinaryEncoder(),
                                                  ['street_x', 'city_x'])])),
                ('smote', SMOTE(k_neighbors=10, random_state=42)),
                ('clf', GaussianNB(priors=(0.2, 0.8)))])

In [88]:
gridsearch_nb.best_estimator_.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', BinaryEncoder(),
                                                  ['street_x', 'city_x'])])),
                ('smote', SMOTE(k_neighbors=10, random_state=42)),
                ('clf', GaussianNB(priors=(0.2, 0.8)))])

### Tuning RF

In [151]:
pipeline_rf.get_params()

{'memory': None,
 'steps': [('transformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('binary', BinaryEncoder(),
                                    ['street_x', 'city_x'])])),
  ('smote', SMOTE(random_state=42)),
  ('clf', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('binary', BinaryEncoder(),
                                  ['street_x', 'city_x'])]),
 'smote': SMOTE(random_state=42),
 'clf': RandomForestClassifier(random_state=42),
 'transformer__n_jobs': None,
 'transformer__remainder': 'passthrough',
 'transformer__sparse_threshold': 0.3,
 'transformer__transformer_weights': None,
 'transformer__transformers': [('binary',
   BinaryEncoder(),
   ['street_x', 'city_x'])],
 'transformer__verbose': False,
 'transformer__binary': BinaryEncoder(),
 'transformer__binary__cols': None,
 'transformer__binary__drop_invariant': False,
 'transfo

In [152]:
hyperparam_space_rf = {
    'clf__max_depth':[15],
    'smote__k_neighbors':[1,5,10]
#     'clf__n_estimators':[10,30,50,100,150]
}


gridsearch_rf = GridSearchCV(pipeline_rf, hyperparam_space_rf, scoring='f1', n_jobs=-1, cv=skf,verbose=1)
gridsearch_rf.fit(X_train, y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 15.5min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('binary',
                                                                         BinaryEncoder(),
                                                                         ['street_x',
                                                                          'city_x'])])),
                                       ('smote', SMOTE(random_state=42)),
                                       ('clf',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': [1, 8, 15, 22, 30],
                         'smote__k_neighbors': [1, 5, 10, 30, 50]},
             scoring='f1', verbose=1)

In [153]:
gridsearch_rf.best_params_

{'clf__max_depth': 15, 'smote__k_neighbors': 10}

In [154]:
report_grid = pd.DataFrame(gridsearch_rf.cv_results_).sort_values('rank_test_score', ascending=True)
report_grid.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_smote__k_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
12,44.565062,0.990868,0.180305,0.023985,15,10,"{'clf__max_depth': 15, 'smote__k_neighbors': 10}",0.756802,0.751238,0.758315,0.748448,0.763601,0.757842,0.762416,0.754862,0.765629,0.753159,0.757231,0.005244,1
22,49.053557,1.65265,0.219565,0.098995,30,10,"{'clf__max_depth': 30, 'smote__k_neighbors': 10}",0.756802,0.751238,0.758315,0.748448,0.763601,0.757842,0.762416,0.754862,0.765629,0.753159,0.757231,0.005244,1
17,44.795512,1.171391,0.184159,0.054118,22,10,"{'clf__max_depth': 22, 'smote__k_neighbors': 10}",0.756802,0.751238,0.758315,0.748448,0.763601,0.757842,0.762416,0.754862,0.765629,0.753159,0.757231,0.005244,1
21,46.031745,2.767533,0.171715,0.073993,30,5,"{'clf__max_depth': 30, 'smote__k_neighbors': 5}",0.754654,0.753497,0.754989,0.751692,0.760451,0.755652,0.760857,0.755502,0.760969,0.747477,0.755574,0.004086,4
16,43.941946,0.877936,0.174213,0.040325,22,5,"{'clf__max_depth': 22, 'smote__k_neighbors': 5}",0.754654,0.753497,0.754989,0.751692,0.760451,0.755652,0.760857,0.755502,0.760969,0.747477,0.755574,0.004086,4


### Tuning DTC

In [160]:
pipeline_dtc.get_params()

{'memory': None,
 'steps': [('transformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('binary', BinaryEncoder(),
                                    ['street_x', 'city_x'])])),
  ('smote', SMOTE(random_state=42)),
  ('clf', DecisionTreeClassifier(random_state=42))],
 'verbose': False,
 'transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('binary', BinaryEncoder(),
                                  ['street_x', 'city_x'])]),
 'smote': SMOTE(random_state=42),
 'clf': DecisionTreeClassifier(random_state=42),
 'transformer__n_jobs': None,
 'transformer__remainder': 'passthrough',
 'transformer__sparse_threshold': 0.3,
 'transformer__transformer_weights': None,
 'transformer__transformers': [('binary',
   BinaryEncoder(),
   ['street_x', 'city_x'])],
 'transformer__verbose': False,
 'transformer__binary': BinaryEncoder(),
 'transformer__binary__cols': None,
 'transformer__binary__drop_invariant': False,
 'transfo

In [161]:
hyperparam_space_dtc = {
    'clf__max_depth':[1,10,20,30],
    'smote__k_neighbors':[1,5,10,20,30]
#     'clf__n_estimators':[10,30,50,100,150]
}


gridsearch_dtc = GridSearchCV(pipeline_dtc, hyperparam_space_dtc, scoring='f1', n_jobs=-1, cv=skf,verbose=1)
gridsearch_dtc.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  7.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('binary',
                                                                         BinaryEncoder(),
                                                                         ['street_x',
                                                                          'city_x'])])),
                                       ('smote', SMOTE(random_state=42)),
                                       ('clf',
                                        DecisionTreeClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': [1, 10, 20, 30],
                         'smote__k_neighbors': [1, 5, 10, 20, 30]},
             scoring='f1', verbose=1)

In [162]:
gridsearch_dtc.best_params_

{'clf__max_depth': 20, 'smote__k_neighbors': 10}

In [163]:
report_grid = pd.DataFrame(gridsearch_dtc.cv_results_).sort_values('rank_test_score', ascending=True)
report_grid.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_smote__k_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
17,27.006821,0.903753,0.031799,0.003968,30,10,"{'clf__max_depth': 30, 'smote__k_neighbors': 10}",0.756802,0.751238,0.75603,0.748448,0.763453,0.757783,0.760867,0.754803,0.765775,0.752592,0.756779,0.005151,1
12,26.871756,0.927014,0.03493,0.003521,20,10,"{'clf__max_depth': 20, 'smote__k_neighbors': 10}",0.756802,0.751238,0.75603,0.748448,0.763453,0.757783,0.760867,0.754803,0.765775,0.752592,0.756779,0.005151,1
13,27.814845,1.467403,0.033609,0.007765,20,20,"{'clf__max_depth': 20, 'smote__k_neighbors': 20}",0.751931,0.748294,0.754059,0.749057,0.767648,0.754094,0.762075,0.759312,0.76096,0.754622,0.756205,0.005841,3
18,27.782375,1.225974,0.032327,0.005667,30,20,"{'clf__max_depth': 30, 'smote__k_neighbors': 20}",0.751931,0.748294,0.754059,0.749057,0.767648,0.754094,0.762075,0.759312,0.76096,0.754622,0.756205,0.005841,3
16,26.994292,1.09176,0.035207,0.004968,30,5,"{'clf__max_depth': 30, 'smote__k_neighbors': 5}",0.754744,0.751816,0.75454,0.750545,0.761513,0.755561,0.760048,0.756639,0.760969,0.747416,0.755379,0.004392,5


In [89]:
import joblib

In [90]:
joblib.dump(nb_tuned,'nb_macet2')

['nb_macet2']

### Predict new data

In [91]:
test = pd.read_csv('data_test.csv')
test['s2'] = test['Ids'].apply(lambda x: x.split('_')[0])
test['date'] = test['Ids'].apply(lambda x: x.split('_')[1])
test['date'] = test['date'].apply(lambda x: pd.to_datetime(x))
test['hari'] = test['date'].apply(lambda x: x.day_name())
test['hour'] = test['Ids'].apply(lambda x: x.split('_')[-1])
test['day_hour_x'] = [label(row[0], row[1]) for row in zip(test['hari'], test['hour'])]
test['id_day'] = [label(row[0], row[1]) for row in zip(test['s2'], test['hari'])]
test['id_day_hour'] = [label(row[0], row[1]) for row in zip(test['id_day'], test['hour'])]
test.head()

Unnamed: 0,Ids,s2,date,hari,hour,day_hour_x,id_day,id_day_hour
0,2e6992a84_2020-11-25_18,2e6992a84,2020-11-25,Wednesday,18,Wednesday_18,2e6992a84_Wednesday,2e6992a84_Wednesday_18
1,2e68e62f4_2020-11-29_20,2e68e62f4,2020-11-29,Sunday,20,Sunday_20,2e68e62f4_Sunday,2e68e62f4_Sunday_20
2,2e68e81a4_2020-11-27_10,2e68e81a4,2020-11-27,Friday,10,Friday_10,2e68e81a4_Friday,2e68e81a4_Friday_10
3,2e69eec04_2020-11-24_7,2e69eec04,2020-11-24,Tuesday,7,Tuesday_7,2e69eec04_Tuesday,2e69eec04_Tuesday_7
4,2e698e4a4_2020-11-27_8,2e698e4a4,2020-11-27,Friday,8,Friday_8,2e698e4a4_Friday,2e698e4a4_Friday_8


In [92]:
test.drop(['s2', 'date', 'hour','Ids','id_day','hari','id_day','id_day_hour'], axis=1, inplace=True)

In [93]:
test.nunique()

day_hour_x    161
dtype: int64

In [94]:
test.head()

Unnamed: 0,day_hour_x
0,Wednesday_18
1,Sunday_20
2,Friday_10
3,Tuesday_7
4,Friday_8


In [95]:
df2_ready = pd.merge(test, df_gabung.drop_duplicates(subset=['day_hour_x'],keep='first'), how='left', on='day_hour_x' )
df2_ready

Unnamed: 0,day_hour_x,street_x,city_x,is_highway,date_x,supergabungan,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,Wednesday_18,,,,,,,,,,,,,,,,,
1,Sunday_20,N1 RE Martadinata,Cikarang,1.0,2020-11-01,2e6984e2c_2020-11-01_20,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2,Friday_10,Sawangan Raya,Depok,1.0,2020-09-04,2e69e946c_2020-09-04_10,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Tuesday_7,Kemakmuran,Bekasi,0.0,2020-09-01,2e698c3ac_2020-09-01_7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4,Friday_8,Cibaduyut Raya,Bandung,0.0,2020-10-16,2e68e8c34_2020-10-16_8,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13836,Thursday_5,LLRE Martadinata,Bandung,0.0,2020-10-29,2e68e7cbc_2020-10-29_5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
13837,Tuesday_22,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),0.0,2020-09-08,2e698fe74_2020-09-08_22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
13838,Tuesday_10,N8 Jalan Raya Bogor,Bogor,1.0,2020-11-17,2e69c6abc_2020-11-17_10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
13839,Tuesday_18,,,,,,,,,,,,,,,,,


In [96]:
df2_ready.nunique()

day_hour_x       161
street_x          65
city_x            11
is_highway         2
date_x            60
supergabungan    122
main_street        1
badweather         1
accident           1
rely               2
report             2
weekend            2
trend_move         2
irr_scale          2
jam                2
thumbs             2
alerts             2
busy_hour          2
dtype: int64

In [97]:
df2_ready.drop(['day_hour_x','date_x','supergabungan'],axis=1,inplace=True)


In [98]:
df2_ready.reset_index(inplace=True,drop=True)

In [99]:
df2_ready

Unnamed: 0,street_x,city_x,is_highway,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,,,,,,,,,,,,,,,
1,N1 RE Martadinata,Cikarang,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2,Sawangan Raya,Depok,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Kemakmuran,Bekasi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4,Cibaduyut Raya,Bandung,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13836,LLRE Martadinata,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
13837,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
13838,N8 Jalan Raya Bogor,Bogor,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
13839,,,,,,,,,,,,,,,


In [100]:
df2_ready.isna().sum()/len(df2_ready)*100

street_x       34.340004
city_x         34.340004
is_highway     34.340004
main_street    34.340004
badweather     34.340004
accident       34.340004
rely           34.340004
report         34.340004
weekend        34.340004
trend_move     34.340004
irr_scale      34.340004
jam            34.340004
thumbs         34.340004
alerts         34.340004
busy_hour      34.340004
dtype: float64

In [103]:
df2_ready['street_x'].fillna(df2_ready['street_x'].mode()[0], inplace=True)
df2_ready['city_x'].fillna(df2_ready['city_x'].mode()[0], inplace=True)
df2_ready['is_highway'].fillna(df2_ready['is_highway'].mode()[0], inplace=True)
df2_ready['main_street'].fillna(df2_ready['main_street'].mode()[0], inplace=True)
df2_ready['badweather'].fillna(df2_ready['badweather'].mode()[0], inplace=True)
df2_ready['accident'].fillna(df2_ready['accident'].mode()[0], inplace=True)
df2_ready['rely'].fillna(df2_ready['rely'].mode()[0], inplace=True)
df2_ready['report'].fillna(df2_ready['report'].mode()[0], inplace=True)
df2_ready['weekend'].fillna(df2_ready['weekend'].mode()[0], inplace=True)
df2_ready['trend_move'].fillna(df2_ready['trend_move'].mode()[0], inplace=True)
df2_ready['irr_scale'].fillna(df2_ready['irr_scale'].mode()[0], inplace=True)
df2_ready['jam'].fillna(df2_ready['jam'].mode()[0], inplace=True)
df2_ready['thumbs'].fillna(df2_ready['thumbs'].mode()[0], inplace=True)
df2_ready['alerts'].fillna(df2_ready['alerts'].mode()[0], inplace=True)
df2_ready['busy_hour'].fillna(df2_ready['busy_hour'].mode()[0], inplace=True)

In [None]:
# is_NaN = df2_ready.isnull()
# row_has_NaN = is_NaN.any(axis=1)
# rows_with_NaN = df2_ready[row_has_NaN]
# rows_with_NaN

In [None]:
# df2_ready.fillna(method='bfill', inplace=True)

In [None]:
# df2_ready.fillna(method='ffill', inplace=True)

In [None]:
# df2_ready.fillna(method='bfill', inplace=True)

In [104]:
df2_ready.isna().sum()/len(df2_ready)*100

street_x       0.0
city_x         0.0
is_highway     0.0
main_street    0.0
badweather     0.0
accident       0.0
rely           0.0
report         0.0
weekend        0.0
trend_move     0.0
irr_scale      0.0
jam            0.0
thumbs         0.0
alerts         0.0
busy_hour      0.0
dtype: float64

In [105]:
df2_ready

Unnamed: 0,street_x,city_x,is_highway,main_street,badweather,accident,rely,report,weekend,trend_move,irr_scale,jam,thumbs,alerts,busy_hour
0,Sawangan Raya,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,N1 RE Martadinata,Cikarang,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2,Sawangan Raya,Depok,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Kemakmuran,Bekasi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4,Cibaduyut Raya,Bandung,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13836,LLRE Martadinata,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
13837,Exit 24: Cibitung/Kws Industri,Cibitung (Bekasi),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
13838,N8 Jalan Raya Bogor,Bogor,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
13839,Sawangan Raya,Bandung,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [133]:
y_pred_test_nb = gridsearch_nb.best_estimator_.predict(df2_ready)

In [131]:
pipeline_xg.fit(X,y)
y_pred_test_xg = pipeline_xg.predict(df2_ready)





In [155]:
gridsearch_rf.best_estimator_.fit(X,y)
y_pred_test_rf = gridsearch_rf.best_estimator_.predict(df2_ready)

In [164]:
gridsearch_dtc.best_estimator_.fit(X,y)
y_pred_test_dtc = gridsearch_dtc.best_estimator_.predict(df2_ready)

In [165]:
hasil = pd.read_csv('data_test.csv')
hasil['NB']=y_pred_test_nb
hasil['XG']=y_pred_test_xg
hasil['RF']=y_pred_test_rf
hasil['DTC']=y_pred_test_dtc

In [166]:
hasil['NB'].value_counts(normalize=True)

1    0.972401
0    0.027599
Name: NB, dtype: float64

In [167]:
hasil['XG'].value_counts(normalize=True)

1    0.725237
0    0.274763
Name: XG, dtype: float64

In [168]:
hasil['RF'].value_counts(normalize=True)

1    0.719962
0    0.280038
Name: RF, dtype: float64

In [169]:
hasil['DTC'].value_counts(normalize=True)

1    0.711293
0    0.288707
Name: DTC, dtype: float64

In [138]:
hasil['NB'].replace({
    1:True,
    0:False
}, inplace=True)

In [139]:
hasil['XG'].replace({
    1:True,
    0:False
}, inplace=True)

In [140]:
hasil['RF'].replace({
    1:True,
    0:False
}, inplace=True)

In [141]:
hasil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13841 entries, 0 to 13840
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Ids     13841 non-null  object
 1   NB      13841 non-null  bool  
 2   XG      13841 non-null  bool  
 3   RF      13841 non-null  bool  
dtypes: bool(3), object(1)
memory usage: 148.8+ KB


In [145]:
pd.set_option('display.max_rows',1000)

In [148]:
hasil['highest']=gede['Labels']

In [150]:
hasil['highest'].value_counts(normalize=True)

True     0.653999
False    0.346001
Name: highest, dtype: float64

In [172]:
hasil

Unnamed: 0,Ids,NB,XG,Labels,DTC
0,2e6992a84_2020-11-25_18,1,1,1,1
1,2e68e62f4_2020-11-29_20,1,0,0,0
2,2e68e81a4_2020-11-27_10,1,0,0,0
3,2e69eec04_2020-11-24_7,1,1,1,1
4,2e698e4a4_2020-11-27_8,1,1,1,1
...,...,...,...,...,...
13836,2e68dd414_2020-11-26_5,1,0,0,0
13837,2e698541c_2020-11-24_22,1,1,1,0
13838,2e69e8e0c_2020-11-24_10,1,0,0,0
13839,2e699a1cc_2020-11-24_18,1,1,1,1


In [170]:
hasil.rename({
    'RF':'Labels'
},axis=1,inplace=True)

In [173]:
hasil.drop(['NB','XG','DTC'],axis=1,inplace=True)

In [174]:
hasil['Labels'].replace({
    1:True,
    0:False
}, inplace=True)

In [175]:
hasil

Unnamed: 0,Ids,Labels
0,2e6992a84_2020-11-25_18,True
1,2e68e62f4_2020-11-29_20,False
2,2e68e81a4_2020-11-27_10,False
3,2e69eec04_2020-11-24_7,True
4,2e698e4a4_2020-11-27_8,True
...,...,...
13836,2e68dd414_2020-11-26_5,False
13837,2e698541c_2020-11-24_22,True
13838,2e69e8e0c_2020-11-24_10,False
13839,2e699a1cc_2020-11-24_18,True


In [176]:
hasil.to_csv('submission_jo3.csv', index=False)