In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump

In [2]:
# Load the training CSV and split it into the target (Severity) and features.
data = pd.read_csv("/root/volume/SKL2SQL/dataset/US_Accidents_March23_train.csv")
y = data['Severity']

# Feature groups, keyed by the kind of preprocessing each one receives later.
binary_encoder_cols = ['Airport_Code']
frequency_encoder_cols = ['Zipcode']
onehot_encoder_cols = ['Source', 'Timezone', 'Country']
numerical_cols = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)']
udf_cols = ['Description', 'Start_Time', 'Weather_Condition']

# Drop the target, then keep only the selected feature columns in group order.
X = data.drop(columns='Severity').loc[
    :,
    binary_encoder_cols
    + frequency_encoder_cols
    + onehot_encoder_cols
    + numerical_cols
    + udf_cols,
]

In [3]:
# Keep only the first 10000 rows of the features and the matching targets.
X = X.iloc[:10000]
y = y.iloc[:10000]

In [4]:
# clean data
# Work on an independent copy: if X is a slice of a larger frame, writing into
# it raises SettingWithCopyWarning, and a chained `X[col].fillna(..., inplace=True)`
# does not update X at all under pandas copy-on-write.
X = X.copy()
for col in X.columns:
    missing = X[col].isna()
    # Impute only columns that mix missing and present values; a fully missing
    # column has no most-common value to fill with and is left untouched.
    if missing.any() and not missing.all():
        most_common_value = X[col].value_counts().idxmax()  # most frequent value in the column
        X[col] = X[col].fillna(most_common_value)  # replace NaN with that value

# Sanity check: print every column that still contains missing values.
for col in X.columns:
    if X[col].isna().any():
        print(col)

In [5]:
# preprocess
# Build both category encoders up front, then apply them one after the other.
binary_encoder = ce.BinaryEncoder(cols=binary_encoder_cols)
counter_encoder = ce.CountEncoder(cols=frequency_encoder_cols)

# Binary-encode the airport code column(s).
X = binary_encoder.fit_transform(X)

# Show the frequency-encoded column(s) before and after count encoding.
print(X.loc[:, frequency_encoder_cols])
X = counter_encoder.fit_transform(X)
print(X.loc[:, frequency_encoder_cols])

         Zipcode
0          75227
1          91607
2          45215
3          29169
4     94580-2454
...          ...
9995  85749-9097
9996  33174-1630
9997  91950-7833
9998       92705
9999       60517

[10000 rows x 1 columns]
      Zipcode
0           3
1           3
2           3
3           1
4           1
...       ...
9995        1
9996        1
9997        1
9998        2
9999        3

[10000 rows x 1 columns]


In [6]:
# Scale the count-encoded column(s) by the number of rows to get relative
# frequencies. Note: running this cell again divides the values a second time.
X[frequency_encoder_cols] = X[frequency_encoder_cols].div(len(X))
print(X.loc[:, frequency_encoder_cols])

      Zipcode
0      0.0003
1      0.0003
2      0.0003
3      0.0001
4      0.0001
...       ...
9995   0.0001
9996   0.0001
9997   0.0001
9998   0.0002
9999   0.0003

[10000 rows x 1 columns]


In [7]:
# Display the column labels currently present in X.
X.columns

Index(['Airport_Code_0', 'Airport_Code_1', 'Airport_Code_2', 'Airport_Code_3',
       'Airport_Code_4', 'Airport_Code_5', 'Airport_Code_6', 'Airport_Code_7',
       'Airport_Code_8', 'Airport_Code_9', 'Airport_Code_10', 'Zipcode',
       'Source', 'Timezone', 'Country', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Description',
       'Start_Time', 'Weather_Condition'],
      dtype='object')

In [8]:
# Display the Description column of X.
X['Description']

0       #4 lane blocked due to accident on I-30 Westbo...
1       Slow traffic on Ventura Fwy W - US-101 N from ...
2       Center lane blocked due to accident on I-75 So...
3                 Accident on I-26 Eastbound at Exit 111.
4                               At Bockman Rd - Accident.
                              ...                        
9995    Incident on N SOLDIER TR near E FORT LOWELL RD...
9996    Stationary traffic on FL-985 from SW 105th Pl ...
9997               At Plaza Bonita Center Way - Accident.
9998                        At Seventeenth St - Accident.
9999    Incident on I-355 SB near 71ST ST Left lane bl...
Name: Description, Length: 10000, dtype: object

In [9]:
# Replace each description with its number of space-separated tokens
# (one more than the number of single-space characters in the text).
X['Description'] = X['Description'].map(lambda text: str(text).count(' ') + 1)

In [10]:
# Display the Description column of X (token counts once the preceding cell has run).
X['Description']

0       14
1       20
2       12
3        7
4        5
        ..
9995    13
9996    15
9997     7
9998     5
9999    10
Name: Description, Length: 10000, dtype: int64

In [11]:
# Display the Start_Time column of X.
X['Start_Time']

0                 2020-09-08 20:23:48
1                 2023-01-20 22:02:00
2                 2021-12-20 14:36:18
3                 2020-09-17 16:01:59
4                 2019-10-16 15:59:00
                    ...              
9995              2022-08-18 18:27:41
9996              2021-03-22 06:31:41
9997              2020-01-27 15:24:00
9998              2017-11-07 11:14:44
9999    2022-07-29 22:57:49.000000000
Name: Start_Time, Length: 10000, dtype: object

In [12]:
# Collapse each timestamp string to a month index: year * 12 + month, where the
# year is characters 0-3 and the month is characters 5-6 of the text.
X['Start_Time'] = [
    int(stamp[0:4]) * 12 + int(stamp[5:7])
    for stamp in map(str, X['Start_Time'])
]

In [13]:
# Display the Start_Time column of X (month indices once the preceding cell has run).
X['Start_Time']

0       24249
1       24277
2       24264
3       24249
4       24238
        ...  
9995    24272
9996    24255
9997    24241
9998    24215
9999    24271
Name: Start_Time, Length: 10000, dtype: int64

In [14]:
# List the distinct values found in the Weather_Condition column.
X['Weather_Condition'].unique()

array(['Fair', 'Heavy Rain', 'Cloudy', 'Clear', 'Fog', 'Fair / Windy',
       'Scattered Clouds', 'Mostly Cloudy', 'Partly Cloudy', 'Overcast',
       'Light Snow', 'Light Rain', 'Heavy T-Storm', 'Thunder',
       'Cloudy / Windy', 'Snow', 'Smoke', 'N/A Precipitation',
       'Mostly Cloudy / Windy', 'Partly Cloudy / Windy', 'Haze',
       'Light Thunderstorms and Rain', 'Rain', 'Light Freezing Drizzle',
       'Light Rain with Thunder', 'Patches of Fog', 'Wintry Mix',
       'Thunder in the Vicinity', 'Blowing Snow / Windy',
       'Heavy Thunderstorms and Rain', 'Light Snow / Windy',
       'Light Freezing Rain', 'Thunderstorms and Rain', 'Mist',
       'Heavy Snow / Windy', 'T-Storm', 'Light Rain / Windy',
       'Rain / Windy', 'Snow / Windy', 'Light Drizzle', 'Heavy Snow',
       'Thunderstorm', 'Drizzle', 'Heavy Rain / Windy', 'Blowing Dust',
       'Shallow Fog', 'Showers in the Vicinity', 'Fog / Windy',
       'Light Drizzle / Windy', 'Haze / Windy', 'Freezing Drizzle',
       

In [15]:
def f(weather: str):
    """Score a weather description by how many adverse keywords it contains.

    The match is a case-insensitive substring test, so one description can
    match several keywords (e.g. a thunderstorm matches both 'thunder' and
    'storm').

    Args:
        weather: Free-text weather description.

    Returns:
        The number of adverse keywords found in the text, as an int.
    """
    keywords = ('rain', 'snow', 'fog', 'wind', 'thunder', 'storm', 'drizzle', 'sand', 'whirlwind')
    lowered = weather.lower()
    return sum(keyword in lowered for keyword in keywords)

In [16]:
# Replace each weather description with its adverse-keyword score from f().
X['Weather_Condition'] = [f(str(weather)) for weather in X['Weather_Condition']]

In [17]:
# Display the Weather_Condition column of X (keyword scores once the preceding cell has run).
X['Weather_Condition']

0       0
1       0
2       0
3       1
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: Weather_Condition, Length: 10000, dtype: int64

In [18]:
# Display the feature frame X.
X

Unnamed: 0,Airport_Code_0,Airport_Code_1,Airport_Code_2,Airport_Code_3,Airport_Code_4,Airport_Code_5,Airport_Code_6,Airport_Code_7,Airport_Code_8,Airport_Code_9,...,Timezone,Country,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Description,Start_Time,Weather_Condition
0,0,0,0,0,0,0,0,0,0,0,...,US/Central,US,76.0,76.0,82.0,29.40,10.0,14,24249,0
1,0,0,0,0,0,0,0,0,0,1,...,US/Pacific,US,46.0,46.0,68.0,29.42,10.0,20,24277,0
2,0,0,0,0,0,0,0,0,0,1,...,US/Eastern,US,43.0,39.0,56.0,29.68,10.0,12,24264,0
3,0,0,0,0,0,0,0,0,1,0,...,US/Eastern,US,78.0,78.0,87.0,29.43,3.0,7,24249,1
4,0,0,0,0,0,0,0,0,1,0,...,US/Pacific,US,64.0,64.0,65.0,29.88,10.0,5,24238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,1,1,1,0,0,...,US/Mountain,US,83.0,83.0,64.0,27.06,10.0,13,24272,0
9996,0,0,0,0,0,0,1,1,1,1,...,US/Eastern,US,60.0,60.0,80.0,30.00,10.0,15,24255,0
9997,0,1,0,0,1,0,0,0,0,0,...,US/Pacific,US,65.0,65.0,61.0,30.15,8.0,7,24241,0
9998,0,0,0,0,1,1,1,1,1,0,...,US/Pacific,US,73.0,73.0,48.0,30.09,10.0,5,24215,0


In [19]:
# define pipeline
# Column-wise preprocessing: scale the numeric columns (without centring) and
# one-hot encode the categorical ones, ignoring categories unseen during fit.
std_scalar = StandardScaler(with_mean=False)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
pipeline_transforms = [
    ('StandardScaler', std_scalar, numerical_cols),
    ('OneHotEncoder', onehot_encoder, onehot_encoder_cols),
]
# Order the transformers by name, descending.
pipeline_transforms.sort(key=lambda step: step[0], reverse=True)
# Wrap them in a ColumnTransformer step; all other columns pass through unchanged.
pipeline_transforms = (
    'pipeline_transforms',
    ColumnTransformer(transformers=pipeline_transforms, remainder='passthrough'),
)

# Final estimator: a small random forest with a fixed seed.
rf = RandomForestClassifier(n_estimators=4, max_depth=10, random_state=24)
pipeline_estimator = ('RandomForestClassifier', rf)

pipeline = Pipeline(steps=[pipeline_transforms, pipeline_estimator])

In [20]:
# Restrict the target to its first 10000 entries, then display it.
y = y.iloc[:10000]
y

0       3
1       2
2       3
3       2
4       2
       ..
9995    2
9996    2
9997    2
9998    2
9999    2
Name: Severity, Length: 10000, dtype: int64

In [23]:
# Fit the preprocessing steps and the random forest on the prepared features and target.
pipeline.fit(X=X, y=y)

Pipeline(memory=None,
         steps=[('pipeline_transforms',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('StandardScaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=False,
                                                                 with_std=True),
                                                  ['Temperature(F)',
                                                   'Wind_Chill(F)',
                                                   'Humidity(%)',
                                                   'Pressure(in)',
                                                   'Visibility(mi)']),
                                                 ('OneHotEncoder',
                                

In [27]:
# Predict severity for the first ten feature rows.
pipeline.predict(X.head(10))

array([3, 2, 3, 2, 2, 2, 2, 3, 2, 3])