In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier 
from sklearn.impute import SimpleImputer

In [6]:
df = pd.read_csv("D:\\ml project\\data\\EDA_aviation_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72889 entries, 0 to 72888
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0.1            72889 non-null  int64  
 1   Unnamed: 0              72889 non-null  int64  
 2   event_id                72889 non-null  object 
 3   investigation_type      72889 non-null  object 
 4   event_date              72889 non-null  object 
 5   location                72860 non-null  object 
 6   country                 72677 non-null  object 
 7   injury_severity         72889 non-null  object 
 8   aircraft_damage         70773 non-null  object 
 9   aircraft_category       72889 non-null  object 
 10  make                    72889 non-null  object 
 11  model                   72889 non-null  object 
 12  amateur_built           72888 non-null  object 
 13  number_of_engines       68973 non-null  float64
 14  engine_type             68359 non-null

  df = pd.read_csv("D:\\ml project\\data\\EDA_aviation_data.csv")


In [7]:
df = df.drop(columns = ["Unnamed: 0","event_id" ,"model","location"],errors = "ignore")
df.columns.tolist()

['Unnamed: 0.1',
 'investigation_type',
 'event_date',
 'country',
 'injury_severity',
 'aircraft_damage',
 'aircraft_category',
 'make',
 'amateur_built',
 'number_of_engines',
 'engine_type',
 'purpose_of_flight',
 'total_fatal_injuries',
 'total_serious_injuries',
 'total_minor_injuries',
 'total_uninjured',
 'weather_condition',
 'broad_phase_of_flight']

In [8]:
df["event_date"] = pd.to_datetime(df["event_date"],errors = "coerce")
df["year"] = df["event_date"].dt.year
df["month"] =  df["event_date"].dt.month
df["day"] = df["event_date"].dt.day

In [9]:
df = df.drop(columns = ["event_date"])

In [10]:
df["target"]= df["injury_severity"].map({"fatal":1,"non-fatal":0}) 
df = df.drop(columns = ["injury_severity"])

In [11]:
low_cardinality = [
    "investigation_type",
    "aircraft_damage",
    "aircraft_category",
    "amateur_built",
    "purpose_of_flight",
    "weather_condition",
    "broad_phase_of_flight",
    "engine_type"
]

high_cardinality = ["make", "country"]

# NUMERIC
numeric_features = [
    "number_of_engines",
    "year", "month", "day"
]

In [12]:
cols_to_drop = [
    "total_fatal_injuries",
    "total_serious_injuries",
    "total_minor_injuries",
    "total_uninjured"
]
df = df.drop(columns=cols_to_drop, errors="ignore")

In [13]:
x = df[low_cardinality + high_cardinality + numeric_features]
y = df["target"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2 , random_state = 42 ,stratify = y)

In [14]:
preprocess = ColumnTransformer(
    transformers=[
        ("onehot", Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("encode", OneHotEncoder(handle_unknown="ignore"))
        ]), low_cardinality),

        ("target", Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("encode", TargetEncoder())
        ]), high_cardinality),

        ("numeric", Pipeline([
            ("impute", SimpleImputer(strategy="mean"))
        ]), numeric_features)
    ]
)

In [15]:
df.to_csv("D:\\ml project\\data\\final_aviation_data.csv", index=False)

In [16]:
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        random_state=42
    ))
])

In [17]:
model.fit(x_train,y_train)

0,1,2
,steps,"[('preprocess', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('target', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93     11642
           1       0.76      0.63      0.69      2936

    accuracy                           0.88     14578
   macro avg       0.83      0.79      0.81     14578
weighted avg       0.88      0.88      0.88     14578



## TRAINING XGboost 

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
    --------------------------------------- 1.6/72.0 MB 14.1 MB/s eta 0:00:05
   -- ------------------------------------- 4.5/72.0 MB 14.2 MB/s eta 0:00:05
   ---- ----------------------------------- 8.9/72.0 MB 16.8 MB/s eta 0:00:04
   ------- -------------------------------- 13.4/72.0 MB 17.9 MB/s eta 0:00:04
   ---------- ----------------------------- 18.1/72.0 MB 19.4 MB/s eta 0:00:03
   ----------- ---------------------------- 20.4/72.0 MB 18.2 MB/s eta 0:00:03
   ------------ --------------------------- 21.8/72.0 MB 16.2 MB/s eta 0:00:04
   ------------ --------------------------- 23.1/72.0 MB 14.9 MB/s eta 0:00:04
   -------------- ------------------------- 25.4/72.0 MB 14.4 MB/s eta 0:00:04
   --------------- ------------------------ 28.3/72.0 MB 14.3 MB/s eta 0:00



In [2]:
from xgboost import XGBClassifier

In [21]:
xgb = XGBClassifier(n_estimators = 500, max_depth = 6, learning_rate = 0.05,subsample = 0.8,colsample_bytree = 0.8,scale_pos_weight = (len(y_train)/y_train.sum()),eval_metrix = "logloss",random_state = 42)
model = Pipeline(steps = [("prepocessor",preprocess),("classifier",xgb)])


In [22]:
model.fit(x_train,y_train)
preds  = model.predict(x_test)
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91     11642
           1       0.62      0.81      0.70      2936

    accuracy                           0.86     14578
   macro avg       0.78      0.84      0.80     14578
weighted avg       0.88      0.86      0.87     14578

