In [107]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from lightgbm import LGBMClassifier     # pip install lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [53]:
df=pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv')

In [54]:
df=df.sample(700000)

In [None]:
df1=pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv')

In [56]:
class1_df = df1[df1['Severity'] == 1].copy()

Severity
1    67366
Name: count, dtype: int64

In [57]:
df0 = pd.concat([df, class1_df], ignore_index=True)

In [60]:
df0['Severity'].value_counts()

Severity
2    557925
3    117393
1     73428
4     18620
Name: count, dtype: int64

In [61]:
df=df0

In [62]:
df.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [63]:
df.drop(columns={'End_Lat','End_Lng','Wind_Chill(F)','Precipitation(in)','Wind_Speed(mph)'},inplace=True)

In [64]:
df.drop('ID',axis=1,inplace=True)

In [65]:
df = df.drop([ 'Description', 'Start_Time', 'End_Time', 'Weather_Timestamp',
    'Street', 'Zipcode'
], axis=1)

In [66]:
df.shape

(767366, 34)

In [67]:
df.duplicated().sum()

17325

In [68]:
df.drop_duplicates(inplace=True)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750041 entries, 0 to 767365
Data columns (total 34 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Source                 750041 non-null  object 
 1   Severity               750041 non-null  int64  
 2   Start_Lat              750041 non-null  float64
 3   Start_Lng              750041 non-null  float64
 4   Distance(mi)           750041 non-null  float64
 5   City                   750025 non-null  object 
 6   County                 750041 non-null  object 
 7   State                  750041 non-null  object 
 8   Country                750041 non-null  object 
 9   Timezone               749376 non-null  object 
 10  Airport_Code           748043 non-null  object 
 11  Temperature(F)         734897 non-null  float64
 12  Humidity(%)            733868 non-null  float64
 13  Pressure(in)           737285 non-null  float64
 14  Visibility(mi)         733853 non-null  f

In [70]:
df.isna().sum()

Source                       0
Severity                     0
Start_Lat                    0
Start_Lng                    0
Distance(mi)                 0
City                        16
County                       0
State                        0
Country                      0
Timezone                   665
Airport_Code              1998
Temperature(F)           15144
Humidity(%)              16173
Pressure(in)             12756
Visibility(mi)           16188
Wind_Direction           16286
Weather_Condition        15879
Amenity                      0
Bump                         0
Crossing                     0
Give_Way                     0
Junction                     0
No_Exit                      0
Railway                      0
Roundabout                   0
Station                      0
Stop                         0
Traffic_Calming              0
Traffic_Signal               0
Turning_Loop                 0
Sunrise_Sunset            2260
Civil_Twilight            2260
Nautical

In [71]:
categorical_cols = [ 'City',  'Timezone', 'Airport_Code',
    'Wind_Direction', 'Weather_Condition', 'Sunrise_Sunset',
    'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'
]

for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

numeric_cols = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)']

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

df.isna().sum()

Source                   0
Severity                 0
Start_Lat                0
Start_Lng                0
Distance(mi)             0
City                     0
County                   0
State                    0
Country                  0
Timezone                 0
Airport_Code             0
Temperature(F)           0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Direction           0
Weather_Condition        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
Sunrise_Sunset           0
Civil_Twilight           0
Nautical_Twilight        0
Astronomical_Twilight    0
dtype: int64

In [72]:
le = LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'bool':
        df[col] = le.fit_transform(df[col].astype(str))

In [74]:
y = df['Severity']
X = df.drop('Severity', axis=1)

In [75]:
num_cols = X.select_dtypes(include=['int64', 'float64', 'bool']).columns

st = StandardScaler()
X[num_cols] = st.fit_transform(X[num_cols])

In [76]:
num_classes = y.nunique()
print("Number of classes in Severity:", num_classes)

Number of classes in Severity: 4


In [77]:
y = le.fit_transform(df['Severity']) 

In [78]:
df['Severity'].value_counts()

Severity
2    552002
3    117019
1     62459
4     18561
Name: count, dtype: int64

In [79]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y    
)

# XGBoost

In [80]:
xgb_clf = XGBClassifier(
    objective='multi:softmax', 
    num_class=num_classes,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',       
    eval_metric='mlogloss',
    random_state=42
)

In [81]:
print("Original classes:", le.classes_)
print("Encoded classes:", sorted(set(y)))

Original classes: [1 2 3 4]
Encoded classes: [0, 1, 2, 3]


In [82]:
num_classes = len(le.classes_)
print("Number of classes:", num_classes)

Number of classes: 4


In [83]:
xgb_clf.fit(X_train, y_train)

In [84]:
y_pred = xgb_clf.predict(X_test)

In [85]:
print("\nClassification Report (encoded labels):\n")
print(classification_report(y_test, y_pred))


Classification Report (encoded labels):

              precision    recall  f1-score   support

           0       0.72      0.51      0.60     12492
           1       0.86      0.93      0.89    110401
           2       0.70      0.60      0.65     23404
           3       0.67      0.09      0.15      3712

    accuracy                           0.83    150009
   macro avg       0.74      0.53      0.57    150009
weighted avg       0.82      0.83      0.81    150009



In [88]:
!pip install imbalanced-learn==0.11.0

Collecting imbalanced-learn==0.11.0
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.11.0


In [93]:
from imblearn.over_sampling import SMOTE
X = df.drop(columns=['Severity'])
y = df['Severity']

y=le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 2) SMOTE on train only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [94]:
xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    min_child_weight=3,
    reg_alpha=0.5,
    reg_lambda=1,
    objective='multi:softprob',
    eval_metric='mlogloss',
    tree_method='hist'
)

xgb_model.fit(X_train_res, y_train_res)

NameError: name 'X_test_prep' is not defined

In [95]:
preds = xgb_model.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.49      0.79      0.61     12492
           1       0.91      0.76      0.83    110401
           2       0.55      0.72      0.63     23404
           3       0.18      0.33      0.23      3712

    accuracy                           0.75    150009
   macro avg       0.53      0.65      0.57    150009
weighted avg       0.80      0.75      0.76    150009



In [96]:
xgb_best = XGBClassifier(
    n_estimators=700,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=1,
    gamma=2,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1,
    reg_lambda=1.5,
    objective='multi:softprob',
    eval_metric='mlogloss',
    tree_method='hist'
)

xgb_best.fit(X_train_res, y_train_res)
preds = xgb_best.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.43      0.81      0.56     12492
           1       0.91      0.67      0.78    110401
           2       0.51      0.70      0.59     23404
           3       0.12      0.41      0.19      3712

    accuracy                           0.68    150009
   macro avg       0.49      0.65      0.53    150009
weighted avg       0.79      0.68      0.71    150009



In [98]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.03, 0.05, 0.1],
    'n_estimators': [400, 600, 800],
    'subsample': [0.7, 0.9, 1],
    'colsample_bytree': [0.7, 0.9, 1],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 1, 2],
    'reg_alpha': [0, 1],
    'reg_lambda': [1, 2]
}


In [99]:
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train_res, y_train_res)
best_xgb = search.best_estimator_


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=1, gamma=1, learning_rate=0.03, max_depth=3, min_child_weight=5, n_estimators=400, reg_alpha=1, reg_lambda=2, subsample=0.7; total time= 5.9min
[CV] END colsample_bytree=1, gamma=0, learning_rate=0.03, max_depth=7, min_child_weight=3, n_estimators=600, reg_alpha=1, reg_lambda=2, subsample=1; total time=12.3min




[CV] END colsample_bytree=1, gamma=1, learning_rate=0.03, max_depth=3, min_child_weight=5, n_estimators=400, reg_alpha=1, reg_lambda=2, subsample=0.7; total time= 5.8min
[CV] END colsample_bytree=0.7, gamma=2, learning_rate=0.03, max_depth=7, min_child_weight=3, n_estimators=600, reg_alpha=0, reg_lambda=1, subsample=1; total time=12.6min
[CV] END colsample_bytree=1, gamma=1, learning_rate=0.03, max_depth=3, min_child_weight=5, n_estimators=400, reg_alpha=1, reg_lambda=2, subsample=0.7; total time= 5.8min
[CV] END colsample_bytree=0.7, gamma=2, learning_rate=0.03, max_depth=7, min_child_weight=3, n_estimators=600, reg_alpha=0, reg_lambda=1, subsample=1; total time=12.7min
[CV] END colsample_bytree=0.7, gamma=2, learning_rate=0.03, max_depth=7, min_child_weight=3, n_estimators=600, reg_alpha=0, reg_lambda=1, subsample=1; total time=12.4min
[CV] END colsample_bytree=1, gamma=0, learning_rate=0.03, max_depth=7, min_child_weight=3, n_estimators=600, reg_alpha=1, reg_lambda=2, subsample=1; t

KeyboardInterrupt: 

In [104]:

input_dim = X_train_res.shape[1]
num_classes = len(le.classes_)

model = Sequential([
    Dense(256, activation='relu', input_shape=(input_dim,)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ===========================
# 7) TRAINING
# ===========================

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train_res,
    y_train_res,
    validation_split=0.2,
    epochs=50,
    batch_size=512,
    callbacks=[early_stop],
    verbose=1
)

# ===========================
# 8) EVALUATION
# ===========================

# Predict probabilities then classes
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Decode labels back to original names
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

print("\nClassification Report (DNN):")
print(classification_report(y_test_labels, y_pred_labels))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.4044 - loss: 1.2723 - val_accuracy: 4.5290e-05 - val_loss: 3.2056
Epoch 2/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.5875 - loss: 0.9598 - val_accuracy: 0.0185 - val_loss: 1.6506
Epoch 3/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.6140 - loss: 0.9120 - val_accuracy: 0.0722 - val_loss: 3.2958
Epoch 4/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.6254 - loss: 0.8895 - val_accuracy: 0.1432 - val_loss: 1.6832
Epoch 5/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.6460 - loss: 0.8515 - val_accuracy: 0.1364 - val_loss: 1.6163
Epoch 6/50
[1m2761/2761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.6550 - loss: 0.8327 - val_accuracy: 0.0242 - val_loss: 4.8589
Epoch 7/50


In [108]:
models = {

    "LightGBM": LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=400,
        max_depth=25,
        min_samples_split=4,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42
    ),

    "Logistic Regression": LogisticRegression(
        max_iter=500,
        multi_class="multinomial",
        solver="lbfgs",
        n_jobs=-1
    )
}

# ===========================
# 7) TRAIN + EVALUATE
# ===========================

results = {}



In [None]:
for name, model in models.items():
    print("\n" + "="*40)
    print(f" TRAINING: {name}")
    print("="*40)

    model.fit(X_train_res, y_train_res)

    y_pred_enc = model.predict(X_test)

    # رجّع labels لأسمائها الأصلية
    y_test_labels = le.inverse_transform(y_test)
    y_pred_labels = le.inverse_transform(y_pred_enc)

    print(f"\nClassification Report ({name}):")
    print(classification_report(y_test_labels, y_pred_labels))



 TRAINING: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2767
[LightGBM] [Info] Number of data points in the train set: 1766404, number of used features: 31
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294

Classification Report (LightGBM):
              precision    recall  f1-score   support

           1       0.53      0.73      0.61     12492
           2       0.89      0.84      0.86    110401
           3       0.61      0.65      0.63     23404
           4       0.26      0.21      0.23      3712

    accuracy                           0.79    150009
   macro avg       0.57      0.61      0.58    

In [59]:
import joblib

bundle = {
    "model": xgb_clf,   # حط هنا اسم الموديل اللي دربته
    "label_encoder": le  # حط هنا الـ LabelEncoder اللي استخدمته على y
}

joblib.dump(bundle, "severity_xgb_bundle.pkl")

print("Model saved successfully as severity_xgb_bundle.pkl")

Model saved successfully as severity_xgb_bundle.pkl
