## Data mining - Classification 2

We try to predict whether a traffic accident (in Vancouver) caused death or not, based on temperature, humidity, is_nighttime.
The classification algorithm used is SVM with polynomial kernel.

ref: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
engine = create_engine('postgresql://yliu359:PASSWORD@www.eecs.uottawa.ca:15432/group_21')

In [3]:
sql = "select weather_main, temperature, humidity, crime_type, is_nighttime from \
crime_data_mart.weather as W, crime_data_mart.crimefact as CF, crime_data_mart.crime as C where \
W.weather_key = CF.weather_key and \
CF.crime_key = C.crime_key and \
crime_type in ('traffic-accident-fatal', 'traffic-accident-injury')"

In [4]:
df = pd.read_sql(sql, engine)

In [5]:
# Handling class imbalance by under sampling
# is_nighttime == True is the minor class
minor_class_count = df[df.crime_type == 'traffic-accident-fatal'].shape[0]
tmp = df[df['crime_type']=='traffic-accident-injury'].sample(n=minor_class_count)
df.drop(df[df['crime_type'] == 'traffic-accident-injury'].index, inplace=True)
df = pd.concat([df, tmp])
df = shuffle(df)

In [6]:
X = df[['temperature', 'humidity', 'weather_main', 'is_nighttime']]
y = df['crime_type']

In [7]:
# Split train/test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
numeric_features = ['temperature', 'humidity']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['weather_main', 'is_nighttime']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

svm_clf = svm.SVC(kernel='poly')
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', svm_clf)])

In [9]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [10]:
clf.score(X_test, y_test)

0.6333333333333333