In [None]:
import copy
import itertools
import os
from math import ceil
from urllib.parse import quote as url_quote

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sqlalchemy import create_engine, text as sql_text

from phik.report import plot_correlation_matrix

from catboost import CatBoostClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import torch
from torch import nn

import tqdm

# Set matplotlib's global `agg.path.chunksize` rc parameter; it applies to every figure drawn below.
mpl.rcParams['agg.path.chunksize'] = 10000

In [None]:
# Database connection settings.
# Every value can be overridden through an environment variable so that real
# credentials do not have to live in the notebook; the literals are only
# fallbacks that keep the notebook runnable exactly as before.
# NOTE(review): the fallback password is still stored in plain text here —
# rotate it and drop the default once the environment variables are configured.
db_config = {
    'user': os.environ.get('DB_USER', 'praktikum_student'),
    'pwd': os.environ.get('DB_PASSWORD', 'Sdf4$2;d-d30pp'),
    'host': os.environ.get('DB_HOST', 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net'),
    'port': int(os.environ.get('DB_PORT', 6432)),
    'db': os.environ.get('DB_NAME', 'data-science-vehicle-db'),
}

# User name and password are percent-encoded because they are embedded in a URL:
# characters such as '@', ':' or '/' would otherwise break the connection string.
connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(
    url_quote(db_config['user'], safe=''),
    url_quote(db_config['pwd'], safe=''),
    db_config['host'],
    db_config['port'],
    db_config['db']
)

# Despite its name, `connection` is the SQLAlchemy engine; connections are opened from it on demand.
connection = create_engine(connection_string)

In [None]:
def get_query(query):
    """Run a SQL statement against the project database and return the rows as a DataFrame.

    Args:
        query: SQL text; it is wrapped with SQLAlchemy's ``text`` before execution.

    Returns:
        pandas.DataFrame with the query result.
    """
    # The context manager releases the connection as soon as the data is read;
    # previously every call opened a new connection that was never closed.
    with connection.connect() as conn:
        return pd.read_sql_query(sql=sql_text(query), con=conn)

In [None]:
# Load the whole `case_ids` table and preview its first rows.
case_ids_query = '''
SELECT *
FROM case_ids;
'''

case_ids_df = get_query(case_ids_query)
display(case_ids_df.head())

In [None]:
# Load the `parties` rows whose party type is 'car' and preview them.
parties_query = '''
SELECT *
FROM parties
WHERE party_type = 'car';
'''

# Fixed: `case_ids_query` was executed here, so `parties_df` actually held the case_ids table.
parties_df = get_query(parties_query)
display(parties_df.head())

In [None]:
# Load the whole `collisions` table and preview its first rows.
collisions_query = '''
SELECT *
FROM collisions;
'''

# Fixed: `case_ids_query` was executed here, so `collisions_df` actually held the case_ids table.
collisions_df = get_query(collisions_query)
display(collisions_df.head())

In [None]:
# Load the whole `vehicles` table and preview its first rows.
vehicles_query = '''
SELECT *
FROM vehicles;
'''

# Fixed: `case_ids_query` was executed here, so `vehicles_df` actually held the case_ids table.
vehicles_df = get_query(vehicles_query)
display(vehicles_df.head())

In [None]:
# Number of collisions per month name, counted over every row of `collisions`.
query = '''
SELECT to_char(collision_date::date, 'MONTH') AS month,
    COUNT(case_id)
FROM collisions
GROUP BY month;
'''

# Use the shared helper so the connection is handled in one place.
query_df = get_query(query)

# Order in which the months are shown (starts with December).
months = ["DECEMBER", "JANUARY", "FEBRUARY",
          "MARCH",  "APRIL", "MAY",
          "JUNE", "JULY", "AUGUST",
          "SEPTEMBER", "OCTOBER", "NOVEMBER"]

# Month names may come back padded with blanks, so strip them before matching.
query_df["month"] = query_df["month"].str.strip()

# Sort the rows into the `months` order. Unlike the previous index-by-index copy,
# this does not raise IndexError when a month has no collisions at all.
month_position = {name: position for position, name in enumerate(months)}
query_df = (
    query_df
    .sort_values("month", key=lambda names: names.map(month_position))
    .reset_index(drop=True)
)

display(query_df)

In [None]:
# Bar chart of the monthly collision counts held in `query_df`.
fig, ax = plt.subplots(figsize=(7, 7))
ax.bar(query_df['month'], query_df['count'], color='red')

ax.set_title("Количество аварий по месяцам", fontsize=16)
ax.set_xlabel("Месяц", fontsize=14)
ax.set_ylabel("Количество аварий", fontsize=14)

# Tick labels: rotate the month names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
plt.show()

In [None]:
# Count car parties for every (party_drug_physical, collision_damage) pair.
# The window COUNT attaches the pair total to each row; the outer GROUP BY
# collapses the repeated rows into one row per pair.
query = '''
WITH collision_damage_table AS 
    (SELECT case_id,
        collision_damage
    FROM collisions),
number_of_coll AS 
    (SELECT p.case_id,
        p.party_drug_physical,
        cdt.collision_damage,
        COUNT(*) OVER (PARTITION BY p.party_drug_physical, cdt.collision_damage)
    FROM parties AS p
    JOIN collision_damage_table AS cdt ON p.case_id = cdt.case_id
    WHERE p.party_type = 'car')


SELECT party_drug_physical,
    collision_damage,
    count
FROM number_of_coll
WHERE party_drug_physical NOT IN ('G', 'not applicable', 'None')
GROUP BY party_drug_physical, collision_damage, count;
'''

query_df = get_query(query)

# Display labels for the raw category values (whole-cell matches only).
state_labels = {
    'impairment - physical': 'Ухудшение состояния',
    'sleepy/fatigued': 'Сонный/Усталый',
    'under drug influence': 'Под воздействием лекарств',
}
damage_labels = {
    'fatal': 'Не подлежит восстановлению',
    'middle damage': 'Машина в целом на ходу',
    'scratch': 'Царапина',
    'severe damage': 'Серьезное повреждение',
    # Fixed typo in the label ('Отедльный' -> 'Отдельный').
    'small damage': 'Отдельный элемент под замену/покраску',
}

# One replace call instead of eight in-place ones; no label is itself a key,
# so applying the mappings together gives the same result as applying them in turn.
query_df = query_df.replace({**state_labels, **damage_labels})

display(query_df)

In [None]:
# Bubble chart: participant state against damage severity, marker area = pair count.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(
    query_df['party_drug_physical'],
    query_df['collision_damage'],
    s=query_df['count'],
    alpha=0.85,
)

ax.set_title('Зависимость между состоянием участника и серьезностью повреждения', fontsize=16)
ax.set_xlabel('Состояние участника', fontsize=14)
ax.set_ylabel('Серьезность происшествия', fontsize=14)
plt.setp(ax.get_xticklabels(), fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
plt.show()

In [None]:
# Count car parties for every (primary_collision_factor, party_sobriety) pair;
# the outer GROUP BY collapses the rows repeated by the window COUNT.
query = '''
WITH primary_factor_table AS 
    (SELECT case_id,
        primary_collision_factor
    FROM collisions),
collision_sobriety AS 
    (SELECT p.case_id,
        pft.primary_collision_factor,
        p.party_sobriety,
        COUNT(*) OVER (PARTITION BY pft.primary_collision_factor, p.party_sobriety)
    FROM parties AS p
    JOIN primary_factor_table AS pft ON p.case_id = pft.case_id
    WHERE p.party_type = 'car')


SELECT primary_collision_factor,
    party_sobriety,
    count
FROM collision_sobriety
WHERE primary_collision_factor NOT IN ('unknown', 'None')
    AND party_sobriety NOT IN ('not applicable')
GROUP BY primary_collision_factor, party_sobriety, count;
'''

# Display labels for the raw category values; cells are matched as whole values,
# so 'impairment unknown' does not touch 'had been drinking, impairment unknown'.
display_labels = {
    'fell asleep': 'Уснул',
    'other improper driving': 'Другое неправильное вождение',
    'other than driver': 'Кроме водителя',
    'vehicle code violation': 'Нарушение правил ПДД',
    'had not been drinking': 'Не пил',
    'had been drinking, not under influence': 'Был пьян, не под влиянием',
    'had been drinking, impairment unknown': 'Был пьян, ухудшение неизвестно',
    'had been drinking, under influence': 'Был пьян, под влиянием',
    'impairment unknown': 'Неизвестно ухудшение',
}

query_df = get_query(query).replace(display_labels)

display(query_df)

In [None]:
def sobriety_collision_graph(df, size=0.005):
    """Draw a bubble chart of primary collision factor against participant sobriety.

    Args:
        df: frame with 'primary_collision_factor', 'party_sobriety' and 'count' columns.
        size: multiplier applied to 'count' to obtain the marker area.
    """
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(
        df['primary_collision_factor'],
        df['party_sobriety'],
        s=size * df['count'],
        color='c',
        alpha=0.85,
    )

    ax.set_title('Зависимость между основным фактором ДТП и трезвостью участника', fontsize=16)
    ax.set_xlabel('Основной фактор аварии', fontsize=14)
    ax.set_ylabel('Трезвость участника', fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
    plt.setp(ax.get_yticklabels(), fontsize=12)
    plt.show()

# Chart for all sobriety categories, with the default marker scale.
sobriety_collision_graph(query_df)

# Same chart without the 'Не пил' rows, drawn with a larger marker scale (size=0.125).
query_test = query_df[query_df['party_sobriety'] != 'Не пил']
print('\n\nГрафик зависимости между основным фактором ДТП и трезвостью участника БЕЗ значения "Не пил":\n')
sobriety_collision_graph(query_test, size=0.125)

In [None]:
# Parameters required by the model: the target `at_fault` plus party, collision and
# vehicle attributes, restricted to car parties, collisions dated 2012 and damage
# other than 'scratch'.
# NOTE(review): `vehicles` is joined on `case_id` only. If a case can hold several
# vehicles, each party row is paired with every vehicle of that case — confirm
# whether the join should also match a party-level key.
parameters_query = '''
SELECT p.at_fault,
    p.cellphone_in_use,
    p.party_sobriety,
    c.collision_damage,
    EXTRACT(MONTH FROM c.collision_date)::int AS month,
    EXTRACT(DAY FROM c.collision_date)::int AS day,
    EXTRACT(HOUR FROM c.collision_time)::int AS hour,
    c.control_device,
    c.county_city_location,
    c.direction,
    c.intersection,
    c.lighting,
    c.location_type,
    c.road_condition_1 AS road_condition,
    c.road_surface,
    v.vehicle_age,
    v.vehicle_transmission,
    v.vehicle_type,
    c.weather_1 AS weather
FROM parties AS p
JOIN collisions AS c ON p.case_id = c.case_id
JOIN vehicles AS v ON p.case_id = v.case_id
WHERE p.party_type = 'car'
    AND EXTRACT(YEAR FROM c.collision_date)::int = 2012
    AND collision_damage != 'scratch';
'''

parameters_df = get_query(parameters_query)
display(parameters_df.head())

In [None]:
# Report the (rows, columns) shape of the modelling frame.
print('Размер полученного набора данных:', parameters_df.shape)

In [None]:
# Number of missing values in every column (typo 'стобцам' in the message fixed).
print('Пропущенные значения по столбцам:\n\n', parameters_df.isna().sum())

In [None]:
# Impute missing `cellphone_in_use` values with 0 and show the distribution before/after.
print('Количество значений больше 1000, поэтому их нельзя удалить.\n')
print("Распределение значений параметров 'cellphone_in_use' ДО:")
display(parameters_df['cellphone_in_use'].value_counts())

print('Заметно, что в подавляющем большинстве случаев в автомобиле отсутствует телефон')
print('Ввиду этого заполним cellphone_in_use значением "0"')
# `df[col].fillna(..., inplace=True)` works on an intermediate Series and does not
# update the frame under pandas Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'cellphone_in_use': 0})

print("\nРаспределение значений параметров 'cellphone_in_use' ПОСЛЕ:")
display(parameters_df['cellphone_in_use'].value_counts())

In [None]:
# Keep only the rows where `hour` is present.
parameters_df = parameters_df.dropna(subset=['hour'])

In [None]:
# Keep only the rows where `control_device` is present.
parameters_df = parameters_df.dropna(subset=['control_device'])

In [None]:
# Show the `direction` distribution, then label missing values as 'None'.
print("Распределение значений параметров 'direction':")
display(parameters_df['direction'].value_counts())

# Column-level `fillna(..., inplace=True)` does not update the frame under pandas
# Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'direction': 'None'})

In [None]:
# Keep only the rows where `intersection` is present.
parameters_df = parameters_df.dropna(subset=['intersection'])

In [None]:
# Keep only the rows where `lighting` is present.
parameters_df = parameters_df.dropna(subset=['lighting'])

In [None]:
# Show the `location_type` distribution, then label missing values as 'none'.
print("Распределение значений параметров 'location_type':")
display(parameters_df['location_type'].value_counts())

# Column-level `fillna(..., inplace=True)` does not update the frame under pandas
# Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'location_type': 'none'})

In [None]:
# Keep only the rows where `road_condition` is present.
parameters_df = parameters_df.dropna(subset=['road_condition'])

In [None]:
# Keep only the rows where `road_surface` is present.
parameters_df = parameters_df.dropna(subset=['road_surface'])

In [None]:
# Show the `vehicle_age` distribution, then impute missing ages with the column median.
print("Распределение значений параметров 'vehicle_age':")
display(parameters_df['vehicle_age'].value_counts())

# Column-level `fillna(..., inplace=True)` does not update the frame under pandas
# Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'vehicle_age': parameters_df['vehicle_age'].median()})

In [None]:
# Show the `vehicle_transmission` distribution, then label missing values as 'none'.
print("Распределение значений параметров 'vehicle_transmission':")
display(parameters_df['vehicle_transmission'].value_counts())

# Column-level `fillna(..., inplace=True)` does not update the frame under pandas
# Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'vehicle_transmission': 'none'})

In [None]:
# Keep only the rows where `weather` is present.
parameters_df = parameters_df.dropna(subset=['weather'])

In [None]:
# Impute missing `party_sobriety` with "impairment unknown" and show the distribution before/after.
print('Количество значений больше 1000, поэтому их нельзя удалить.\n')
print("Распределение значений параметров 'party_sobriety' ДО:")
display(parameters_df['party_sobriety'].value_counts())

print('Заметно, что в подавляющем большинстве случаев пользователь не пил')
# Typo 'значеннием' in the message fixed.
print('Однако, наверное, заполним party_sobriety значением "impairment unknown", так как достаточно большая доля есть с этим значением')
# Column-level `fillna(..., inplace=True)` does not update the frame under pandas
# Copy-on-Write; build a new frame explicitly instead.
parameters_df = parameters_df.fillna({'party_sobriety': "impairment unknown"})

print("\nРаспределение значений параметров 'party_sobriety' ПОСЛЕ:")
display(parameters_df['party_sobriety'].value_counts())

In [None]:
# phik correlation matrix over all columns of the modelling frame.
# The rounded matrix is now kept: the bare `phik_overview.round(2)` statement
# computed a rounded copy and threw it away.
phik_overview = parameters_df.phik_matrix().round(2)

plot_correlation_matrix(
    phik_overview.values,
    x_labels=phik_overview.columns,
    y_labels=phik_overview.index,
    vmin=0,
    vmax=1,
    figsize=(14, 10)
)

In [None]:
# Summary statistics before the outlier handling in the next cell.
parameters_df.describe()

In [None]:
# Vehicle ages above 135 are treated as outliers and their rows are removed.
outlier_mask = parameters_df["vehicle_age"] > 135

# Report how many rows are affected (the previous version printed the sum of the ages).
print(outlier_mask.sum())

# In theory the value could simply be replaced with 16, since there are very few outliers.
# Filtering replaces the chained-indexing assignment `df[col][mask] = None`, which does
# not modify the frame under pandas Copy-on-Write. `dropna` is kept because it also
# removes rows with a missing value in any other column, exactly as before.
parameters_df = parameters_df[~outlier_mask].dropna(axis=0)

In [None]:
# Summary statistics after the outlier rows were removed.
parameters_df.describe()

In [None]:
# Number of fully duplicated rows before de-duplication.
parameters_df.duplicated().sum()

In [None]:
# Keep the first occurrence of every row and discard its exact repeats.
parameters_df = parameters_df[~parameters_df.duplicated()]

In [None]:
# Number of fully duplicated rows after de-duplication.
parameters_df.duplicated().sum()

In [None]:
# Seed shared by the data splits, the models and the NumPy/PyTorch generators below.
RANDOM_SEED = 12345

In [None]:
# Turn off pandas' chained-assignment warning for the column assignments below.
pd.options.mode.chained_assignment = None

numeric_col = ['vehicle_age']
categorical_cols = ['cellphone_in_use', 'collision_damage', 'month', 'day', 'hour', 'control_device', 
                    'county_city_location','direction', 'intersection','lighting', 'location_type', 
                    'road_condition', 'road_surface', 'vehicle_transmission', 'vehicle_type', 'weather', "party_sobriety"]

# Target and feature matrix.
y = parameters_df['at_fault']
X = parameters_df.drop(columns='at_fault')

# First split: 75% train / 25% hold-out; second split: the hold-out is divided
# again into 75% test / 25% validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.25, random_state=RANDOM_SEED
)

# Ordinal-encode the categorical columns; categories unseen in training map to -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X_train[categorical_cols])
for features in (X_train, X_test, X_val):
    features[categorical_cols] = encoder.transform(features[categorical_cols])

# Standardise every column with statistics learned on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test, X_val = [scaler.transform(features) for features in (X_train, X_test, X_val)]

In [None]:
# Metric names. NOTE(review): nothing else in the visible notebook reads `scoring` — confirm it is still needed.
scoring = ['fbeta', 'roc_auc']

In [None]:
# Hyper-parameter search for CatBoost using the estimator's own `grid_search` method.
catboost = CatBoostClassifier(task_type="GPU", devices="0:1", early_stopping_rounds=5)

# Candidate values: tree depth and learning rate vary, the other two entries are fixed.
grid = {
    "n_estimators":[1000],
    "depth":[3, 5, 7, 9],
    "random_seed":[RANDOM_SEED],
    'learning_rate': [0.03, 0.1],
}

grid_search_result = catboost.grid_search(grid,
                                       X=X_train,
                                       y=y_train)

In [None]:
# Show the "params" entry of the grid-search result.
grid_search_result["params"]

In [None]:
# Final CatBoost model with explicitly fixed hyper-parameters.
# NOTE(review): learning_rate and depth are hard-coded — presumably copied from the
# grid-search output above; confirm they still match after a re-run.
catboost = CatBoostClassifier(task_type="GPU", devices="0:1", random_seed=RANDOM_SEED, learning_rate=0.1, depth=9, use_best_model=True)

In [None]:
# Fit on the training split, passing the validation split as `eval_set`.
# NOTE(review): the same validation split is used for the metrics printed below,
# so those numbers are not fully independent of model selection.
catboost.fit(X_train, y_train, eval_set=(X_val, y_val))

In [None]:
# CatBoost class predictions for the validation split.
catboost_prediction = catboost.predict(X_val)

In [None]:
# Validation metrics of CatBoost: F-beta (beta=2), precision and recall.
# The label 'Prescision' was misspelled in the printed report.
print(f"""Fbeta_score = {fbeta_score(y_val, catboost_prediction, beta=2):0.2f}
Precision = {precision_score(y_val, catboost_prediction,):0.2f}
Recall = {recall_score(y_val, catboost_prediction,):0.2f}""")

In [None]:
# Base random forest used as the estimator for the grid search below.
rfc = RandomForestClassifier(random_state=RANDOM_SEED)

In [None]:
# Random-forest search space: 3 depths x 3 forest sizes.
param_grid = {
    'max_depth': [5, 10, 15], 
    'n_estimators': [50, 100, 150]
}

In [None]:
# Cross-validated search over `param_grid`, scored and refit by 'f1'.
# NOTE(review): the models below are compared by F-beta with beta=2, while this
# search optimises F1 — confirm the mismatch is intended.
# NOTE: `grid` previously named the CatBoost parameter dict; it is rebound here.
grid = GridSearchCV(rfc, param_grid, scoring='f1', refit='f1', n_jobs=4)
grid.fit(X_train, y_train)

In [None]:
# Show the estimator selected by the grid search.
print(grid.best_estimator_)

In [None]:
# Final random forest with explicitly fixed hyper-parameters.
# NOTE(review): max_depth and n_estimators are hard-coded — presumably copied from
# `grid.best_estimator_`; confirm they still match after a re-run.
rfc = RandomForestClassifier(max_depth=5, n_estimators=50, random_state=RANDOM_SEED, n_jobs=4)

In [None]:
# Train the random forest on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Random-forest class predictions for the validation split.
rfc_prediction = rfc.predict(X_val)

In [None]:
# Validation metrics of the random forest: F-beta (beta=2), precision and recall.
# The label 'Prescision' was misspelled in the printed report.
print(f"""Fbeta_score = {fbeta_score(y_val, rfc_prediction, beta=2):0.2f}
Precision = {precision_score(y_val, rfc_prediction, ):0.2f}
Recall = {recall_score(y_val, rfc_prediction, ):0.2f}""")

In [None]:
# Choose the compute device and seed every random generator used by the network code.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(RANDOM_SEED)
else:
    device = torch.device("cpu")

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Convert the train/validation splits to tensors on `device`; targets are cast to float64.
# The names are rebound from arrays/Series to tensors, so the cells above that expect
# the original objects must be re-run before this cell is executed again.
# `X_test` / `y_test` are left untouched.
X_train = torch.tensor(X_train, device=device)
X_val = torch.tensor(X_val, device=device)
y_train = torch.tensor(y_train.values, device=device, dtype=torch.float64)
y_val = torch.tensor(y_val.values, device=device, dtype=torch.float64)


In [None]:
def cross_validate(params):
    """Yield one dict per combination of the candidate values in ``params``.

    ``params`` maps a parameter name to an iterable of candidate values; the
    combinations are produced in ``itertools.product`` order (last name varies fastest).
    """
    names = list(params)
    candidate_lists = [params[name] for name in names]

    for values in list(itertools.product(*candidate_lists)):
        yield dict(zip(names, values))

In [None]:
def custom_nn(params: dict, X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val):
    """Build and train a fully connected binary classifier, keeping the best validation F-beta.

    Args:
        params: hyper-parameters, every key optional:
            ``input_neurons`` (default ``X_train.shape[1]``), ``output_neurons`` (1),
            ``middle_layers`` (widths of one group of hidden layers; no hidden
            layers when missing), ``n_combinated_layers`` (how many times the group
            is repeated, 5), ``function_activation`` (``nn.LeakyReLU(0.2)``),
            ``drop_out_every_layer`` (3), ``drop_out_part`` (0.25),
            ``num_epochs`` (1000), ``batch_size`` (10),
            ``device`` (the device of ``X_train``).
        X_train, y_train: training tensors; the defaults are the module-level
            tensors as they exist when this function is defined.
        X_val, y_val: validation tensors used to pick the best network.

    Returns:
        ``(best_net, best_optimizer, best_fbeta)``: a snapshot of the network at its
        best validation F-beta (beta=2), the optimizer used for training (tied to
        the live network, not to the snapshot) and that best score. ``best_net`` is
        ``None`` only when ``num_epochs`` is 0.
    """
    input_neurons = params.get("input_neurons", X_train.shape[1])
    output_neurons = params.get("output_neurons", 1)
    n_combinated_layers = params.get("n_combinated_layers", 5)
    function_activation = params.get("function_activation", nn.LeakyReLU(0.2))
    drop_out_every_layer = params.get("drop_out_every_layer", 3)
    drop_out_part = params.get("drop_out_part", 0.25)
    num_epochs = params.get("num_epochs", 1000)
    batch_size = params.get("batch_size", 10)
    # Follow the data instead of assuming CUDA, so the function also runs on CPU-only machines.
    device = params.get("device", X_train.device)
    # A missing "middle_layers" used to raise TypeError; treat it as "no hidden layers".
    middle_layers = params.get("middle_layers") or []

    # Layer widths: input, the hidden group repeated n_combinated_layers times, output.
    n_neruons = [input_neurons] + list(middle_layers) * n_combinated_layers + [output_neurons]

    # Hidden part: Linear + activation per layer, preceded by Dropout on every
    # drop_out_every_layer-th layer (never before the first one).
    net_layers = []
    for i in range(1, len(n_neruons) - 1):
        if i != 1 and i % drop_out_every_layer == 0:
            net_layers.append(nn.Dropout(drop_out_part))
        net_layers.append(nn.Linear(n_neruons[i - 1], n_neruons[i]))
        net_layers.append(function_activation)

    # Output layer with a sigmoid so BCELoss receives probabilities.
    net_layers.append(nn.Linear(n_neruons[-2], n_neruons[-1]))
    net_layers.append(nn.Sigmoid())

    net = nn.Sequential(*net_layers).to(dtype=torch.float64, device=device)

    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    loss = nn.BCELoss().to(device=device)

    num_batches = ceil(len(X_train)/batch_size)
    y_val_true = y_val.detach().to("cpu").numpy()

    best_fbeta = 0
    best_optimizer = None
    best_net = None

    for epoch in tqdm.tqdm(range(num_epochs)):
        # Re-enable dropout: the evaluation below switches the network to eval mode,
        # and previously it was never switched back.
        net.train()
        order = np.random.permutation(len(X_train))
        for batch_idx in range(num_batches):
            start_index = batch_idx * batch_size
            optimizer.zero_grad()

            batch_indexes = order[start_index:start_index+batch_size]
            X_batch = X_train[batch_indexes]
            y_batch = y_train[batch_indexes]

            preds = net(X_batch).flatten()
            loss_value = loss(preds, y_batch)
            loss_value.backward()
            optimizer.step()

        # Validate once per checked epoch (this used to run after every batch).
        if epoch % 10 == 0 or epoch == num_epochs - 1:
            net.eval()
            with torch.no_grad():
                val_preds = net(X_val).flatten().round()
            # fbeta_score expects (y_true, y_pred); the arguments used to be swapped,
            # which exchanges precision and recall in the beta=2 weighting.
            fbeta = fbeta_score(y_val_true, val_preds.to("cpu").numpy(), beta=2)

            if best_net is None or fbeta > best_fbeta:
                best_fbeta = fbeta
                best_optimizer = optimizer
                # Snapshot the weights: `best_net = net` only aliased the live
                # network, so the "best" model was always the last one.
                best_net = copy.deepcopy(net)

    return best_net, best_optimizer, best_fbeta

In [None]:
# Width of the network input: the second dimension of `X_train`.
input_neruons = X_train.shape[1]

In [None]:
# Search space for `custom_nn`: each key maps to the list of values to try.
# `middle_layers` has one candidate (a group of three hidden-layer widths) and
# `batch_size` has one candidate (a tenth of the training rows, floor division).
params = {
    "middle_layers":[[input_neruons * 2, input_neruons * 2, input_neruons],],
    "n_combinated_layers":[1, 5],
    "num_epochs":[1000, 1500],
    "batch_size":[X_train.shape[0]//10], 
}

In [None]:
# Number of parameter combinations: the product of the candidate-list lengths.
iterations = 1

for candidate_values in params.values():
    iterations *= len(candidate_values)

In [None]:
# Rebind `params` from the search-space dict to a generator of parameter combinations.
# The cell that computes `iterations` must run before this one, because it needs the dict.
params = cross_validate(params)

In [None]:
# Best combination found so far and its validation F-beta; updated by the search loop below.
best_params = None
best_fbeta = 0

In [None]:
# Train one network per parameter combination and remember the best-scoring combination.
for _ in range(iterations):
    param = next(params)
    net, optimizer, fbeta = custom_nn(param)

    if not fbeta > best_fbeta:
        continue

    best_fbeta, best_params = fbeta, param
    print(f"{best_fbeta:0.2f}")

In [None]:
# Show the winning parameter combination.
best_params

In [None]:
# Layer widths of the final network: input, one hidden group (36, 36, 18), single output.
n_neruons = [input_neruons, 36, 36, 18, 1]

In [None]:
# Assemble the final network: Linear + LeakyReLU per hidden layer, with Dropout
# inserted before every even-numbered hidden layer, then a sigmoid output layer.
net_layers = []

for layer_no in range(1, len(n_neruons) - 1):
    if layer_no % 2 == 0:
        net_layers.append(nn.Dropout(0.25))
    net_layers.extend([
        nn.Linear(n_neruons[layer_no - 1], n_neruons[layer_no]),
        nn.LeakyReLU(0.2),
    ])

net_layers.extend([nn.Linear(n_neruons[-2], n_neruons[-1]), nn.Sigmoid()])

net = nn.Sequential(*net_layers).to(dtype=torch.float64, device=device)

In [None]:
# Train the final network and record its validation F-beta (beta=2) every 10 epochs.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
loss = nn.BCELoss().to(device=device)

batch_size = 7581
epochs = 1500

num_batches = ceil(len(X_train)/batch_size)

best_fbeta = 0
best_optimizer = None
best_net = None

# Validation F-beta history and the epochs at which it was measured.
accs = []
epoch_ = []

for epoch in tqdm.tqdm(range(epochs)):
    # Re-enable dropout: the evaluation below switches the network to eval mode,
    # and previously it was never switched back.
    net.train()
    order = np.random.permutation(len(X_train))
    for batch_idx in range(num_batches):
        start_index = batch_idx * batch_size
        optimizer.zero_grad()

        batch_indexes = order[start_index:start_index+batch_size]
        X_batch = X_train[batch_indexes]
        y_batch = y_train[batch_indexes]

        preds = net(X_batch).flatten()
        loss_value = loss(preds, y_batch)
        loss_value.backward()
        optimizer.step()

    # Validate once per checked epoch (this used to run after every batch, so
    # `accs` held several points per epoch).
    if epoch % 10 == 0 or epoch == epochs - 1:
        net.eval()
        with torch.no_grad():
            test_preds = net(X_val).flatten().round()
        # fbeta_score expects (y_true, y_pred); the arguments used to be swapped.
        fbeta = fbeta_score(y_val.to("cpu").detach().numpy(), test_preds.to("cpu").numpy(), beta=2)
        accs.append(fbeta)
        epoch_.append(epoch)

        if best_net is None or fbeta > best_fbeta:
            # The score used to be stored in an unrelated name (`best_f1`), so
            # `best_fbeta` never changed and every checkpoint counted as "best".
            best_fbeta = fbeta
            best_optimizer = optimizer
            # Snapshot the weights; `best_net = net` only aliased the live network.
            best_net = copy.deepcopy(net)

In [None]:
# Validation F-beta values recorded during training, plotted against the epoch number.
# NOTE(review): the title says "FbetaLoss" although the plotted values are F-beta scores.
plt.plot(epoch_, accs)
plt.title("FbetaLoss")

In [None]:
# Network outputs for the validation split, rounded to the nearest integer, and the true labels as NumPy arrays.
nn_prediction = best_net(X_val).to("cpu").detach().numpy().round()
y_true = y_val.to("cpu").detach().numpy()

In [None]:
# Validation metrics of the neural network: F-beta (beta=2), precision and recall.
# The label 'Prescision' was misspelled in the printed report.
print(f"""Fbeta_score = {fbeta_score(y_true, nn_prediction, beta=2):0.2f}
Precision = {precision_score(y_true, nn_prediction,):0.2f}
Recall = {recall_score(y_true, nn_prediction,):0.2f}""")

In [None]:
# Random-forest predictions for the held-out test split.
test_preds = rfc.predict(X_test)

In [None]:
# Test-split metrics of the random forest: F-beta (beta=2), precision and recall.
# The label 'Prescision' was misspelled in the printed report.
print(f"""Fbeta_score = {fbeta_score(y_test, test_preds, beta=2):0.2f}
Precision = {precision_score(y_test, test_preds,):0.2f}
Recall = {recall_score(y_test, test_preds,):0.2f}""")

In [None]:
def display_confusion_matrix(model, title, y_test, y_pred):
    """Plot the confusion matrix of a binary classifier and print precision and recall.

    Args:
        model: fitted estimator; only ``model.classes_`` is used, for the axis labels.
        title: plot title.
        y_test: true labels.
        y_pred: predicted labels.
    """
    cm = confusion_matrix(y_test, y_pred)

    # scikit-learn puts true classes on rows and predicted classes on columns, with
    # labels in sorted order, so a binary 0/1 matrix reads [[TN, FP], [FN, TP]].
    # The previous code took TP from cm[0][0] and TN from cm[1][1], so the printed
    # "precision" and "recall" described the negative class.
    tn, fp, fn, tp = cm.ravel()

    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=model.classes_
    )
    disp.plot()

    plt.title(title)
    plt.show()

    # Precision and recall of the positive class; 0.0 when the denominator is empty.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    print('Precision =', round(precision, 3))
    print('Recall = ', round(recall, 3))

In [None]:
# Confusion matrix of CatBoost on the validation split (labels converted from the tensor to NumPy).
display_confusion_matrix(catboost, y_pred=catboost_prediction, title='Матрица ошибок Catboost', y_test=y_val.to("cpu").detach().numpy())

In [None]:
# CatBoost `predict_proba` output for the validation split, wrapped in a DataFrame.
predict_proba = pd.DataFrame(catboost.predict_proba(X_val.to("cpu").detach().numpy(),))

In [None]:
# Precision-recall curve for CatBoost, computed from column 1 of `predict_proba`.
precision, recall, thresholds = precision_recall_curve(y_val.to("cpu").detach().numpy(), predict_proba[1])

In [None]:
# Precision-recall curve of CatBoost on the validation split.
# Precision is drawn on the x-axis and recall on the y-axis (the order passed to
# plt.plot); the axes were previously unlabeled.
plt.plot(precision, recall)
plt.title("Precision-Recall Catboost")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.show()

In [None]:
# Confusion matrix of the neural network on the validation split.
# `catboost` is passed only because the helper reads `classes_` from its first argument.
display_confusion_matrix(catboost, y_pred=nn_prediction, title='Матрица ошибок NN', y_test=y_true)

In [None]:
# Rebind `nn_prediction` to the raw (unrounded) network outputs, used as scores for the curve below.
nn_prediction = best_net(X_val).to("cpu").detach().numpy()

In [None]:
# Precision-recall curve for the neural network on the validation split.
precision, recall, thresholds = precision_recall_curve( y_true, nn_prediction)

In [None]:
# Precision-recall curve of the neural network on the validation split.
# Precision is drawn on the x-axis and recall on the y-axis (the order passed to
# plt.plot); the axes were previously unlabeled.
plt.plot(precision, recall)
plt.title("Precision-Recall NN")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.show()

In [None]:
# Confusion matrix of the random forest on the validation split.
# The title used to say "LogisticRegression" although `rfc` is a RandomForestClassifier.
display_confusion_matrix(rfc, y_pred=rfc_prediction, title='Матрица ошибок RandomForest', y_test=y_val.to("cpu").detach().numpy())

In [None]:
# Column 1 of the random forest's `predict_proba` output for the validation split.
predict_proba = pd.DataFrame(rfc.predict_proba(X_val.to("cpu").detach().numpy()))[1]

In [None]:
# Precision-recall curve for the random forest on the validation split.
precision, recall, thresholds = precision_recall_curve(y_true, predict_proba)

In [None]:
# Precision-recall curve of the random forest on the validation split.
# The title used to say "LogisticRegression" although the scores come from `rfc`,
# a RandomForestClassifier; the axes were also unlabeled. Precision is on the
# x-axis and recall on the y-axis (the order passed to plt.plot).
plt.plot(precision, recall)
plt.title("Precision-Recall RandomForest")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.show()

In [None]:
# Random-forest feature importances, labelled with the feature column names.
feature_names = parameters_df.drop('at_fault', axis=1).columns.values
importances = pd.Series(rfc.feature_importances_, feature_names)

fig, ax = plt.subplots(figsize=(10, 7))
importances.plot(kind='bar', grid=True, color='m', ax=ax)

ax.set_title('Важность факторов', fontsize=16)
ax.set_xlabel('Факторы', fontsize=14)
ax.set_ylabel('Степень значимости', fontsize=14)
plt.setp(ax.get_xticklabels(), fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
plt.show()

In [None]:
# phik correlation between `at_fault` and `hour`.
# The rounded matrix is now kept: the bare `phik_overview.round(2)` statement
# computed a rounded copy and threw it away.
phik_overview = parameters_df[['at_fault', 'hour']].phik_matrix().round(2)

plot_correlation_matrix(
    phik_overview.values,
    x_labels=phik_overview.columns,
    y_labels=phik_overview.index,
    vmin=0,
    vmax=1,
    figsize=(7, 5)
)

# Explicit '\n' breaks: the triple-quoted title used to carry the source
# indentation into the rendered text.
plt.title('График зависимости\nвремени (hour)\nи\nВиновности участника (at_fault)',
         fontsize=14)
plt.show()

In [None]:
# phik correlation between `at_fault` and `vehicle_type`.
# The rounded matrix is now kept: the bare `phik_overview.round(2)` statement
# computed a rounded copy and threw it away.
phik_overview = parameters_df[['at_fault', 'vehicle_type']].phik_matrix().round(2)

plot_correlation_matrix(
    phik_overview.values,
    x_labels=phik_overview.columns,
    y_labels=phik_overview.index,
    vmin=0,
    vmax=1,
    figsize=(7, 5)
)

# 'Типа' is spelled with Cyrillic letters only, so the title renders in one script.
plt.title('График зависимости\nТипа кузова (vehicle_type)\nи\nВиновности участника (at_fault)',
         fontsize=14)
plt.show()