In [295]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Descriptive analysis

In [296]:
data = pd.read_csv(r'E:\Data science\Model_compress\Model\Traffic.csv')

In [297]:
data.describe(include = 'all')

Unnamed: 0,DateTime,Bridge_Type,Bridge_Name,Ships_per_day,ship_present,Closure_min,vehicle_flow_veh_hr,delay_min_per_vehicle,period_of_day,rush_hour
count,1000,1000,1000,1000.0,1000,1000.0,1000.0,1000.0,1000,1000
unique,1000,3,5,,2,,,,6,2
top,2025-05-01 06:51,Pontoon,Martyr-El-Sayed,,YES,,,,Night,NO
freq,1,402,210,,670,,,,320,885
mean,,,,1.223,,22.101,3249.906,0.012253,,
std,,,,1.192767,,21.885469,288.888799,0.012138,,
min,,,,0.0,,0.0,2308.0,-0.01,,
25%,,,,0.0,,0.0,3059.75,0.0,,
50%,,,,1.0,,18.0,3238.5,0.012,,
75%,,,,2.0,,36.0,3441.0,0.022,,


In [298]:
# Sperate time from date
data['DateTime'] = pd.to_datetime(data['DateTime'])
hour = data['DateTime'].dt.hour
data['Date'] = pd.to_datetime(data['DateTime'].dt.date)
data.drop('DateTime' , axis =1 , inplace=True )

In [299]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Bridge_Type            1000 non-null   object        
 1   Bridge_Name            1000 non-null   object        
 2   Ships_per_day          1000 non-null   int64         
 3   ship_present           1000 non-null   object        
 4   Closure_min            1000 non-null   int64         
 5   vehicle_flow_veh_hr    1000 non-null   int64         
 6   delay_min_per_vehicle  1000 non-null   float64       
 7   period_of_day          1000 non-null   object        
 8   rush_hour              1000 non-null   object        
 9   Date                   1000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 78.3+ KB


In [300]:
data['Date'] = pd.to_datetime(data['Date'])

print(f'Min day: {data['Date'].min()}')
print(f'Max day: {data['Date'].max()}')

Min day: 2025-05-01 00:00:00
Max day: 2028-01-25 00:00:00


In [301]:
### Grouping the time to four groups Morning, Afternoon, Evening, Night
def period_of_day(h):
    if 5 <= h < 12: return "Morning"
    elif 12 <= h < 17: return "Afternoon"
    elif 17 <= h < 21: return "Evening"
    else: return "Night"

data['period_of_day'] =  hour.apply(period_of_day)
data['rush_hour'] = hour.apply(lambda h: 1 if h in [7, 8, 17, 18] else 0)

In [302]:
numerical_data = data.select_dtypes('number').drop('rush_hour',axis = 1).columns
numerical_data

Index(['Ships_per_day', 'Closure_min', 'vehicle_flow_veh_hr',
       'delay_min_per_vehicle'],
      dtype='object')

In [303]:
zero_fix_cols = ['delay_min_per_vehicle', 'vehicle_flow_veh_hr', 'Ships_per_day', 'Closure_min']

from sklearn.base import BaseEstimator, TransformerMixin

class ZeroToMeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        self.means_ = {}
    
    def fit(self, X, y=None):
        X = X.copy()
        for col in self.cols:
            non_zero = X[col].replace(0, np.nan)
            self.means_[col] = non_zero.mean()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols:
            X[col] = X[col].replace(0, np.nan)
            X[col] = X[col].fillna(self.means_[col])
        return X
    def get_feature_names_out(self , input_features = None):
        return np.array(
            self.cols if input_features is None else input_features
        )

numerical_pipeline = Pipeline(
    [
        ('zero_to_fix' , ZeroToMeanImputer(zero_fix_cols)),
        ('scaler' , StandardScaler())
    ]
)

In [304]:
categorical_data = data.select_dtypes('O').drop(['Bridge_Name' , 'ship_present' ] , axis = 1).columns
categorical_data

Index(['Bridge_Type', 'period_of_day'], dtype='object')

In [305]:
preprocessor = ColumnTransformer(
 [
     ('numerical' , numerical_pipeline , numerical_data),
     ('categorical' , OneHotEncoder() , categorical_data)
 ]   
)

In [306]:
pipeline = Pipeline(
    [
        ('preprocessor' , preprocessor),
        ('model', SVC(random_state=42 , class_weight='balanced' , probability=True))
    ]
)

In [307]:
X = data.drop(['Bridge_Name' , 'ship_present' , 'Date' ], axis = 1) 
y = pd.get_dummies(data['ship_present'] , dtype=int)['YES']


In [308]:
X

Unnamed: 0,Bridge_Type,Ships_per_day,Closure_min,vehicle_flow_veh_hr,delay_min_per_vehicle,period_of_day,rush_hour
0,Pontoon,1,16,3221,0.009,Morning,0
1,Pontoon,0,0,3092,0.000,Night,0
2,Pontoon,0,0,3081,0.000,Morning,0
3,Floating,2,35,3241,-0.002,Afternoon,0
4,Pontoon,0,0,3534,0.000,Evening,0
...,...,...,...,...,...,...,...
995,Pontoon,2,27,3601,0.011,Afternoon,0
996,Swing,2,27,3191,0.029,Evening,1
997,Floating,3,61,2670,0.046,Morning,0
998,Swing,1,20,3532,0.011,Night,0


In [309]:
X.columns

Index(['Bridge_Type', 'Ships_per_day', 'Closure_min', 'vehicle_flow_veh_hr',
       'delay_min_per_vehicle', 'period_of_day', 'rush_hour'],
      dtype='object')

In [310]:
y

0      1
1      0
2      0
3      1
4      0
      ..
995    1
996    1
997    1
998    1
999    0
Name: YES, Length: 1000, dtype: int64

In [311]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 ,
                                                      stratify=y)

from sklearn.model_selection import cross_val_score
# Checking the overfitting or underfitting

cross_val_score(pipeline , X_train , y_train)

array([0.9375 , 0.88125, 0.8875 , 0.9375 , 0.93125])

In [312]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,cols,"['delay_min_per_vehicle', 'vehicle_flow_veh_hr', ...]"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [332]:
y_pred = pipeline.predict(X_test)

y_pred

array([1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1])

In [333]:
X_test.reset_index()

Unnamed: 0,index,Bridge_Type,Ships_per_day,Closure_min,vehicle_flow_veh_hr,delay_min_per_vehicle,period_of_day,rush_hour
0,257,Pontoon,1,19,3000,0.034,Morning,0
1,774,Floating,1,17,3088,0.031,Morning,1
2,13,Pontoon,0,0,3244,0.000,Night,0
3,75,Floating,0,0,3813,0.000,Night,0
4,735,Floating,1,18,3125,0.003,Night,0
...,...,...,...,...,...,...,...,...
195,964,Pontoon,0,0,3275,0.000,Night,0
196,442,Swing,2,38,3290,0.030,Night,0
197,341,Pontoon,0,0,3136,0.000,Night,0
198,127,Floating,1,22,3197,0.011,Evening,1


In [334]:
X_test[y_pred == 0]

Unnamed: 0,Bridge_Type,Ships_per_day,Closure_min,vehicle_flow_veh_hr,delay_min_per_vehicle,period_of_day,rush_hour
13,Pontoon,0,0,3244,0.000,Night,0
75,Floating,0,0,3813,0.000,Night,0
753,Pontoon,0,0,3614,0.000,Night,0
994,Floating,2,28,3169,0.019,Afternoon,0
140,Swing,2,37,3338,0.017,Morning,1
...,...,...,...,...,...,...,...
371,Pontoon,0,0,3479,0.000,Night,0
330,Floating,0,0,3584,0.000,Evening,0
86,Floating,0,0,3279,0.000,Night,0
964,Pontoon,0,0,3275,0.000,Night,0


In [339]:
X_test[y_pred == 1]

Unnamed: 0,Bridge_Type,Ships_per_day,Closure_min,vehicle_flow_veh_hr,delay_min_per_vehicle,period_of_day,rush_hour
257,Pontoon,1,19,3000,0.034,Morning,0
774,Floating,1,17,3088,0.031,Morning,1
735,Floating,1,18,3125,0.003,Night,0
780,Swing,4,65,3285,0.005,Morning,0
71,Pontoon,2,32,3491,0.037,Afternoon,0
...,...,...,...,...,...,...,...
6,Pontoon,1,17,3111,0.030,Morning,1
794,Pontoon,2,28,3245,0.008,Morning,0
442,Swing,2,38,3290,0.030,Night,0
127,Floating,1,22,3197,0.011,Evening,1


In [335]:
from sklearn.metrics import accuracy_score , f1_score , classification_report

(accuracy_score(y_test , y_pred),
f1_score(y_test , y_pred))

(0.905, 0.9236947791164659)

In [336]:
print(classification_report(y_test ,y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.87        66
           1       1.00      0.86      0.92       134

    accuracy                           0.91       200
   macro avg       0.89      0.93      0.90       200
weighted avg       0.93      0.91      0.91       200



In [337]:
import joblib 

joblib.dump(pipeline , r'E:\Data science\Model_compress\Model\Finalmodel.pki')

['E:\\Data science\\Model_compress\\Model\\Finalmodel.pki']

In [338]:
X_test.columns

Index(['Bridge_Type', 'Ships_per_day', 'Closure_min', 'vehicle_flow_veh_hr',
       'delay_min_per_vehicle', 'period_of_day', 'rush_hour'],
      dtype='object')