In [47]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Descriptive analysis

In [73]:
data = pd.read_csv(r'E:\Data science\Model_compress\Model\Traffic.csv').drop(['delay_min_per_vehicle', 'Ships_per_day'] , axis=1)

In [74]:
data.describe(include = 'all')

Unnamed: 0,DateTime,Bridge_Type,Bridge_Name,ship_present,Closure_min,vehicle_flow_veh_hr
count,1000,1000,1000,1000,1000.0,1000.0
unique,1000,3,5,2,,
top,2025-05-01 06:51,Floating,Martyr-El-Sayed,YES,,
freq,1,414,209,774,,
mean,,,,,24.534,3249.631
std,,,,,20.029929,302.00851
min,,,,,0.0,2233.0
25%,,,,,13.0,3053.75
50%,,,,,20.0,3229.0
75%,,,,,37.0,3452.0


In [75]:
# Sperate time from date
data['DateTime'] = pd.to_datetime(data['DateTime'])
hour = data['DateTime'].dt.hour
data['Date'] = pd.to_datetime(data['DateTime'].dt.date)
data.drop('DateTime' , axis =1 , inplace=True )

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Bridge_Type          1000 non-null   object        
 1   Bridge_Name          1000 non-null   object        
 2   ship_present         1000 non-null   object        
 3   Closure_min          1000 non-null   int64         
 4   vehicle_flow_veh_hr  1000 non-null   int64         
 5   Date                 1000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 47.0+ KB


In [77]:
data['Date'] = pd.to_datetime(data['Date'])

print(f'Min day: {data['Date'].min()}')
print(f'Max day: {data['Date'].max()}')

Min day: 2025-05-01 00:00:00
Max day: 2028-01-25 00:00:00


In [78]:
### Grouping the time to four groups Morning, Afternoon, Evening, Night
def period_of_day(h):
    if 5 <= h < 12: return "Morning"
    elif 12 <= h < 17: return "Afternoon"
    elif 17 <= h < 21: return "Evening"
    else: return "Night"

data['period_of_day'] =  hour.apply(period_of_day)
data['rush_hour'] = hour.apply(lambda h: 1 if h in [7, 8, 17, 18] else 0)

In [81]:
numerical_data = data.select_dtypes('number').drop('rush_hour',axis = 1).columns
numerical_data

Index(['Closure_min', 'vehicle_flow_veh_hr'], dtype='object')

In [82]:
categorical_data = data.select_dtypes('O').drop(['Bridge_Name' , 'ship_present' ] , axis = 1).columns
categorical_data

Index(['Bridge_Type', 'period_of_day'], dtype='object')

In [83]:
preprocessor = ColumnTransformer(
 [
     ('numircal' , StandardScaler() , numerical_data),
     ('categorical' , OneHotEncoder() , categorical_data)
 ]   
)

In [84]:
pipeline = Pipeline(
    [
        ('preprocessor' , preprocessor),
        ('model', RandomForestClassifier(random_state=42 , class_weight='balanced'))
    ]
)

In [85]:
X = data.drop(['Bridge_Name' , 'ship_present' , 'Date' ], axis = 1) 
y = pd.get_dummies(data['ship_present'] , dtype=int)['YES']


In [86]:
X

Unnamed: 0,Bridge_Type,Closure_min,vehicle_flow_veh_hr,period_of_day,rush_hour
0,Pontoon,0,3392,Morning,0
1,Floating,22,3367,Evening,1
2,Pontoon,15,3256,Evening,0
3,Pontoon,17,3061,Morning,0
4,Swing,0,3429,Night,0
...,...,...,...,...,...
995,Pontoon,44,3340,Afternoon,0
996,Floating,42,3458,Afternoon,0
997,Pontoon,16,3392,Night,0
998,Floating,0,3337,Morning,1


In [87]:
y

0      0
1      1
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: YES, Length: 1000, dtype: int64

In [88]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 ,
                                                      stratify=y)

from sklearn.model_selection import cross_val_score
# Checking the overfitting or underfitting

cross_val_score(pipeline , X_train , y_train)

array([1., 1., 1., 1., 1.])

In [89]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numircal', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
y_pred = pipeline.predict(X_test)

y_pred

array([1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [91]:
pipeline.named_steps['preprocessor'].get_feature_names_out()

array(['numircal__Closure_min', 'numircal__vehicle_flow_veh_hr',
       'categorical__Bridge_Type_Floating',
       'categorical__Bridge_Type_Pontoon',
       'categorical__Bridge_Type_Swing',
       'categorical__period_of_day_Afternoon',
       'categorical__period_of_day_Evening',
       'categorical__period_of_day_Morning',
       'categorical__period_of_day_Night'], dtype=object)

In [92]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test , y_pred)

1.0

In [93]:
X_test

Unnamed: 0,Bridge_Type,Closure_min,vehicle_flow_veh_hr,period_of_day,rush_hour
29,Floating,18,3135,Evening,1
301,Pontoon,0,3168,Evening,1
864,Pontoon,0,3653,Night,0
291,Floating,20,2738,Morning,0
184,Pontoon,56,3263,Night,0
...,...,...,...,...,...
372,Pontoon,38,3405,Morning,0
798,Pontoon,19,3682,Evening,1
309,Pontoon,18,3499,Morning,0
602,Pontoon,22,3265,Afternoon,0


In [94]:
import joblib 

joblib.dump(pipeline , r'E:\Data science\Model_compress\Model\Finalmodel.pki')

['E:\\Data science\\Model_compress\\Model\\Finalmodel.pki']

In [95]:
X_test.columns

Index(['Bridge_Type', 'Closure_min', 'vehicle_flow_veh_hr', 'period_of_day',
       'rush_hour'],
      dtype='object')