In [2]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression

In [3]:
test_data = pd.read_csv("Classifying_accidents-test.csv")
train_data = pd.read_csv("Classifying_accidents-train.csv")

In [4]:
binary_features = ['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop','Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight']

d = {False : 0, True : 1, 'Day' : 0, 'Night' : 1}
for feature in binary_features:
    train_data.dropna(subset=[feature], inplace=True)
    train_data[feature] = train_data[feature].map(d)

    test_data[feature] = test_data[feature].map(d)

In [5]:
train_data['Weather_Timestamp'] = pd.to_datetime(train_data['Weather_Timestamp'])
train_data['Weather_Year'] = train_data['Weather_Timestamp'].dt.year
train_data['Weather_Month'] = train_data['Weather_Timestamp'].dt.month
train_data['Weather_Day'] = train_data['Weather_Timestamp'].dt.day
train_data['Weather_Hour'] = train_data['Weather_Timestamp'].dt.hour
train_data.drop(columns=['Weather_Timestamp'], inplace=True)

test_data['Weather_Timestamp'] = pd.to_datetime(test_data['Weather_Timestamp'])
test_data['Weather_Year'] = test_data['Weather_Timestamp'].dt.year
test_data['Weather_Month'] = test_data['Weather_Timestamp'].dt.month
test_data['Weather_Day'] = test_data['Weather_Timestamp'].dt.day
test_data['Weather_Hour'] = test_data['Weather_Timestamp'].dt.hour
test_data.drop(columns=['Weather_Timestamp'], inplace=True)

In [6]:
categorical_features = ['Street','City', 'County', 'State','Zipcode','Timezone','Airport_Code', 'Weather_Condition']

for feature in categorical_features:
    train_freq_map = train_data[feature].value_counts(normalize=True)
    test_freq_map = test_data[feature].value_counts(normalize=True)
    
    train_data[feature] = train_data[feature].map(train_freq_map)
    test_data[feature] = test_data[feature].map(test_freq_map)

In [7]:
d1 = {'S' : 0,
     'WNW' : 1,
     'CALM' : 2,
     'SE' : 3,
     'W' : 4,
     'WSW' : 5,
     'SW' : 8,
     'ENE' : 9,
     'SSE' : 10,
     'West' : 4,
     'Calm' : 2,
     'N' : 11,
     'South' : 0,
     'ESE' : 12,
     'North' : 11,
     'E' : 13,
     'NE' : 14,
     'SSW' : 15,
     'East' : 13,
     'VAR' : 16,
     'NNE' : 17,
     'Variable' : 16,
     'NNW' : 18}
train_data['Wind_Direction'] = train_data['Wind_Direction'].map(d1)

test_data['Wind_Direction'] = test_data['Wind_Direction'].map(d1)

d2 = {'Source1' : 0, 'Source2' : 1}
train_data['Class'] = train_data['Class'].map(d2)



In [8]:
X = train_data.drop(['ID', 'Country', 'Class'], axis=1)
Y = train_data['Class']


test_id = test_data['ID']
test_data = test_data.drop(['ID','Country'], axis=1)



for col in X.columns:
    X_mean = X[col].value_counts(normalize=True).mean()
    test_mean = test_data[col].value_counts(normalize=True).mean()

    X[col] = X[col].fillna(X_mean)
    test_data[col] = test_data[col].fillna(test_mean)

print(X.head())
print(Y.head())
print(test_data.head())

   Severity  Start_Lat   Start_Lng       End_Lat       End_Lng  Distance(mi)  \
0         2  38.770702  -90.275398  9.698879e-07  9.534825e-07         0.000   
1         2  44.971004  -93.419037  9.698879e-07  9.534825e-07         0.000   
2         2  45.843022 -121.061216  4.584133e+01 -1.210522e+02         0.451   
3         2  35.300686  -80.806839  9.698879e-07  9.534825e-07         0.000   
4         2  34.064580 -117.796451  3.407003e+01 -1.178076e+02         0.741   

     Street      City    County     State  ...  Traffic_Signal  Turning_Loop  \
0  0.000005  0.002226  0.002222  0.009751  ...               0             0   
1  0.000301  0.000416  0.007611  0.024873  ...               0             0   
2  0.000002  0.000025  0.000043  0.014001  ...               0             0   
3  0.000009  0.017908  0.019015  0.043807  ...               0             0   
4  0.000126  0.001306  0.067095  0.222341  ...               0             0   

   Sunrise_Sunset  Civil_Twilight  Nau

RANDOM UNDER SAMPLING

In [9]:
rus = RandomUnderSampler(sampling_strategy="not minority")

X_res, Y_res = rus.fit_resample(X, Y)

model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_res, Y_res)

prediction = model.predict(test_data)

submission = pd.DataFrame({'ID': test_id, 'Source': prediction})

submission.to_csv('RUS_submission.csv', index=False)

RANDOM OVER SAMPLING

In [10]:
ros = RandomOverSampler(random_state=42)

X_res, Y_res = ros.fit_resample(X,Y)

model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_res, Y_res)

prediction = model.predict(test_data)

submission = pd.DataFrame({'ID': test_id, 'Source': prediction})

submission.to_csv('ROS_submission.csv', index=False)