In [88]:
import pandas as pd
import glob
import os
import tqdm
import gc
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import seaborn as sns
import numpy as np
from scipy.spatial import Voronoi, voronoi_plot_2d
import math
from sklearn import datasets, linear_model, model_selection
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from tqdm import tqdm


##Для отображения плотов на лету
%matplotlib inline

In [89]:
path = 'data/athletes_sochi.txt'
dfs = pd.read_csv(path).dropna()

In [90]:
dfs["BMI"] = dfs["weight"] / (dfs["height"]**2)

In [91]:
dfs

Unnamed: 0,age,birthdate,gender,height,name,weight,gold_medals,silver_medals,bronze_medals,total_medals,sport,country,BMI
0,17,1996-04-12,Male,1.72,Aaron Blunck,68.0,0,0,0,0,Freestyle Skiing,United States,22.985398
1,27,1986-05-14,Male,1.85,Aaron March,85.0,0,0,0,0,Snowboard,Italy,24.835646
2,21,1992-06-30,Male,1.78,Abzal Azhgaliyev,68.0,0,0,0,0,Short Track,Kazakhstan,21.461937
4,21,1992-07-30,Male,1.86,Adam Barwood,82.0,0,0,0,0,Alpine Skiing,New Zealand,23.702162
5,21,1992-12-18,Male,1.75,Adam Cieslar,57.0,0,0,0,0,Nordic Combined,Poland,18.612245
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2851,28,1985-04-30,Male,1.93,Ziga Pavlin,98.0,0,0,0,0,Ice Hockey,Slovenia,26.309431
2853,31,1982-12-05,Female,1.70,Zina Kocher,60.0,0,0,0,0,Biathlon,Canada,20.761246
2854,28,1985-06-14,Female,1.68,Zoe Gillings,65.0,0,0,0,0,Snowboard,Great Britain,23.030045
2856,22,1991-03-01,Male,1.76,Zongyang Jia,68.0,0,0,1,1,Freestyle Skiing,China,21.952479


In [118]:
def get_metrics_by_lin(src: pd.DataFrame, factors: list, target: str) -> dict:
    df = src.sample(frac=1)
        
    inp = df[factors]
    out = df[target]

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(inp, out, test_size=0.2)
    
    model = linear_model.LogisticRegression()
    model.fit(X_train, Y_train)
    
    Y_test_predicted = model.predict(X_test)
    
    [tn, fp], [fn, tp] = confusion_matrix(Y_test, Y_test_predicted)
    confusion_matrix(Y_test, Y_test_predicted)
    
    precision = tp / (tp + fp)
    recall = tp / tp + fn
    
    return {
        "Precision": tp / (tp + fp),
        "Recall": tp / (tp + fn),
        "Accuracy": (tn + tp) / (tp + fp + tn + fn),
        "F1": (precision * recall) / ((precision) + recall)
    }

In [119]:
# dfs
prepared_data = dfs.copy()
prepared_data["gender"].apply(lambda x: int(x == "Male"))
get_metrics_by_lin(prepared_data, ['weight', 'height', 'BMI'], 'gender')

{'Precision': 0.896774193548387,
 'Recall': 0.9205298013245033,
 'Accuracy': 0.8870967741935484,
 'F1': 0.8657199800697557}

In [120]:
f1_by_param = dict.fromkeys([
    ('weight', 'height', 'BMI'), 
    ('weight', 'height', 'BMI', 'gold_medals'),
    ('weight', 'height', 'BMI', 'gold_medals', 'total_medals'),
    ('weight', 'height', 'BMI', 'total_medals')
])

best_param = None
for param in f1_by_param:
    f1 = get_metrics_by_lin(prepared_data, list(param), 'gender')['F1']
    f1_by_param[param] = f1
    if not best_param or f1 > f1_by_param[best_param]:
        best_param = param
print(f'best_param {best_param}')
f1_by_param

best_param ('weight', 'height', 'BMI', 'gold_medals', 'total_medals')


{('weight', 'height', 'BMI'): 0.876410888741345,
 ('weight', 'height', 'BMI', 'gold_medals'): 0.8954434499593166,
 ('weight',
  'height',
  'BMI',
  'gold_medals',
  'total_medals'): 0.8992588525940158,
 ('weight', 'height', 'BMI', 'total_medals'): 0.8792294393677451}