In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import category_encoders as ce
from joblib import dump

# Load the training split and separate the target ('Severity') from the features.
data = pd.read_csv("/root/volume/SKL2SQL/dataset/US_Accidents_March23_train.csv")
X = data.drop('Severity', axis=1)
y = data['Severity']

# Feature columns, grouped by the encoding applied to them later in the notebook.
binary_encoder_cols = ['Airport_Code']
frequency_encoder_cols = ['Zipcode']
onehot_encoder_cols = ['Source', 'Timezone', 'Country']
numerical_cols = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)']
udf_cols = ['Description', 'Start_Time', 'Weather_Condition']

# Keep only the selected columns. The explicit .copy() makes X an independent
# frame so the imputation below writes to X itself and not to a view/copy of
# the intermediate frame.
X = X[binary_encoder_cols + frequency_encoder_cols + onehot_encoder_cols + numerical_cols + udf_cols].copy()

# Clean data: impute missing values with the most frequent value of each column.
for col in X.columns:
    missing = X[col].isna()
    # Only columns that contain both missing and non-missing entries are
    # imputed; a column that is entirely NaN has no "most common" value and is
    # left untouched.
    if missing.any() and not missing.all():
        # Most frequent (non-NaN) value in this column.
        most_common_value = X[col].value_counts().idxmax()
        # Assign the result back instead of calling fillna(inplace=True) on
        # X[col]: the in-place call operates on a temporary Series and does
        # not update X when pandas Copy-on-Write is active.
        X[col] = X[col].fillna(most_common_value)

In [2]:
# Fit a category_encoders BinaryEncoder on the columns in binary_encoder_cols
# and replace X with the transformed frame.
binary_encoder = ce.BinaryEncoder(cols=binary_encoder_cols)
X = binary_encoder.fit_transform(X)
# Fit a CountEncoder on the columns in frequency_encoder_cols. The raw values
# are captured in `before` first, because fit_transform replaces X; `after`
# holds the same column(s) once encoded.
counter_encoder = ce.CountEncoder(cols=frequency_encoder_cols)
before = X[frequency_encoder_cols]
X = counter_encoder.fit_transform(X)
after = X[frequency_encoder_cols]
# Put the raw and encoded columns side by side, one row per input row, and
# label them 'before' / 'after'.
count_encode_map = pd.concat((before, after), axis=1)
count_encode_map.columns = ['before', 'after']
# NOTE(review): printing a GroupBy object only emits its repr (see this cell's
# output), not the grouped values - presumably an aggregation was intended.
print(count_encode_map.groupby('before'))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f84f025ef50>


In [5]:
# Build the {raw zip code -> encoded count} lookup from the before/after table.
# zip() pairs the two columns positionally, so this does not depend on the
# frame's index labels and avoids a per-row .loc lookup in a Python loop.
# As with the explicit loop, a key that occurs on several rows keeps the
# position of its first occurrence and the value of its last occurrence.
cem = dict(zip(count_encode_map['before'], count_encode_map['after']))
cem

{'75227': 439,
 '91607': 466,
 '45215': 193,
 '29169': 440,
 '94580-2454': 11,
 '11798-4413': 2,
 '30318-4407': 2,
 '33462': 312,
 '28262': 1763,
 '94607': 1802,
 '33778-3339': 2,
 '20755': 49,
 '10509-2327': 38,
 '90230': 1129,
 '49099-9166': 1,
 '32821-8031': 3,
 '78722': 293,
 '85006': 868,
 '91761': 3916,
 '23320-2605': 1,
 '32926': 350,
 '06076-3208': 1,
 '28205': 1401,
 '10512': 118,
 '95488': 15,
 '85286-6801': 1,
 '84116': 734,
 '36612': 48,
 '34234': 96,
 '19720-2381': 1,
 '96080-9728': 6,
 '33126': 1667,
 '61554': 107,
 '18974': 27,
 '60148': 534,
 '23005': 621,
 '49325-9621': 2,
 '55733': 63,
 '92833': 191,
 '29153-5018': 2,
 '33016': 1155,
 '61953': 13,
 '90640': 554,
 '94080-6510': 2,
 '97814': 427,
 '94558': 679,
 '43232-4808': 3,
 '95045': 338,
 '78753': 1300,
 '19720-3135': 91,
 '20744': 332,
 '70802': 1540,
 '77045-1237': 2,
 '44273-9371': 1,
 '90041-1727': 11,
 '28217-2155': 1,
 '95841': 920,
 '74105': 131,
 '50323': 83,
 '33166': 1563,
 '45417-1131': 1,
 '78731': 429

In [6]:
# Number of distinct raw zip code values captured in the mapping.
len(cem)

393507

In [11]:
# Flatten the mapping into a list of two-element [zip code, count] lists.
l = [list(entry) for entry in cem.items()]

In [13]:
# Turn the [zip code, count] pairs into a two-column DataFrame; no column
# names are given, so the columns get the default labels 0 and 1.
l = pd.DataFrame(l)
# Display the resulting frame.
l

Unnamed: 0,0,1
0,75227,439
1,91607,466
2,45215,193
3,29169,440
4,94580-2454,11
...,...,...
393502,27606-1173,1
393503,13478-3012,1
393504,85040-3618,1
393505,20902-2831,1


In [14]:
# Persist the zip code / count table as a bare two-column CSV:
# neither the index nor a header row is written.
l.to_csv(
    '/root/volume/SKL2SQL/dataset/US_Accidents_March23_train_zipcode_counter.csv',
    index=False,
    header=False,
)