##### Imports

In [6]:
# operations with data
import pandas as pd
from sklearn.model_selection import train_test_split
# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# ensembles
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
# visualization
import seaborn as sns
import matplotlib as plt
# preprocessing
import numpy as np

#### Loading dataset

In [2]:
dataset = pd.read_csv('smoking.csv')
dataset.shape

(55692, 27)

#### Explotary Analysis

#### Creating new hypotheses

#### Preprocessing

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   55692 non-null  int64  
 1   gender               55692 non-null  object 
 2   age                  55692 non-null  int64  
 3   height(cm)           55692 non-null  int64  
 4   weight(kg)           55692 non-null  int64  
 5   waist(cm)            55692 non-null  float64
 6   eyesight(left)       55692 non-null  float64
 7   eyesight(right)      55692 non-null  float64
 8   hearing(left)        55692 non-null  float64
 9   hearing(right)       55692 non-null  float64
 10  systolic             55692 non-null  float64
 11  relaxation           55692 non-null  float64
 12  fasting blood sugar  55692 non-null  float64
 13  Cholesterol          55692 non-null  float64
 14  triglyceride         55692 non-null  float64
 15  HDL                  55692 non-null 

In [4]:
#### Binary Label Encoder
dataset['gender'] = np.where(dataset['gender'] == 'F', np.int64(1), np.int64(0))
dataset['oral'] = np.where(dataset['oral'] == 'Y', np.int64(1), np.int64(0))
dataset['tartar'] = np.where(dataset['tartar'] == 'Y', np.int64(1), np.int64(0))

#### Hypertuning 

In [122]:
#### Некоторые переменные для гипертюнинга, по идее можно что-то добавить, но это будет пиздец)))))))) Хотя почему бы и нет...
splits = [0.2, 0.3, 0.4]
coeffs_svc = [1, 5, 10, 15]
knn_n = [5, 10, 15, 20, 25]
weigths_knn = ['uniform', 'distance']
algo_knn = ['ball_tree', 'kd_tree', 'brute', 'auto']
p_knn = [1, 2]
metric_knn = ['minkowski', 'euclidean']
tree_criteria = ['gini', 'entropy']
split_tree = ['best', 'random']
tree_max_depth = [10, 20, 30, 40, None]
estimator_bagging = [5, 10, 15, 20]



In [121]:
#### Тут надо подумать о том какие фичи будут участвовать в обучении моделей................
# del dataset['ID']
# X = dataset
# Y = dataset['smoking']
# del X['smoking']

In [28]:
bagging_results = []
for i in splits:
    for j in estimator_bagging:
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=i)
        tune_model = BaggingClassifier(n_estimators=j)
        tune_model.fit(X_train, y_train)
        bagging_results.append([i, j, f1_score(y_true = y_test, y_pred=tune_model.predict(X_test)), accuracy_score(y_true = y_test, y_pred=tune_model.predict(X_test))])

In [29]:
tree_results = []
for i in splits:
    for j in tree_criteria:
        for k in split_tree:
            for z in tree_max_depth:
                X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=i)
                tune_model = DecisionTreeClassifier(criterion=j, splitter=k, max_depth=z)
                tune_model.fit(X_train, y_train)
                tree_results.append([i, j, k, z, f1_score(y_true = y_test, y_pred=tune_model.predict(X_test)), accuracy_score(y_true = y_test, y_pred=tune_model.predict(X_test))])

In [30]:
knn_results = []
for i in splits:
    for j in knn_n:
        for k in weigths_knn:
            for p in algo_knn:
                for z in p_knn:
                    for m in metric_knn:
                        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=i)
                        tune_model = KNeighborsClassifier(n_neighbors=j, weights=k, algorithm=p, p=z, metric=m)
                        tune_model.fit(X_train, y_train)
                        knn_results.append([i, j, k, p, z, m, f1_score(y_true = y_test, y_pred=tune_model.predict(X_test)), accuracy_score(y_true = y_test, y_pred=tune_model.predict(X_test))])
        

In [31]:
svc_results = []
for i in splits:
    for j in coeffs_svc:
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=i)
        tune_model = SVC(C= j)
        tune_model.fit(X_train, y_train)
        svc_results.append([i, j, f1_score(y_true = y_test, y_pred=tune_model.predict(X_test)), accuracy_score(y_true = y_test, y_pred=tune_model.predict(X_test))])
        

In [34]:
knn_results[-1]

[0.2, 20, 'distance', 'kd_tree', 1, 'minkowski', 0.8114732022623216]

In [38]:
sorted(svc_results, key=lambda x: x[2], reverse=True)[0]

[0.2, 10, 0.7492593590088877]

In [39]:
sorted(tree_results, key=lambda x: x[4], reverse=True)[0]

[0.2, 'entropy', 'best', None, 0.7936978184756261]

In [40]:
sorted(bagging_results, key=lambda x: x[2], reverse=True)[0]

[0.2, 15, 0.8191938235030074]

#### Models

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
svc_model = SVC(C=10)
knn_model = KNeighborsClassifier(n_neighbors=20, weights='distance', algorithm='kd_tree', p=1, metric='minkowski')
tree_model = DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=None)
bagging_model = BaggingClassifier(n_estimators=15)
boost_model = AdaBoostClassifier()

#### Verifying hypothesis

In [5]:
X = dataset[['eyesight(left)', 'eyesight(right)', 'dental caries', 'tartar']]
y = dataset['smoking']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Здесь дальше идёт код лучшей модели, на данный момент лучшая Bagging(Adaboost не трогал, так как хз ч там трогать)
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=None),n_estimators=15)
bagging_model.fit(X_train, y_train)
predicted_X = bagging_model.predict(X_test)
score = f1_score(y_true=y_test, y_pred=predicted_X)
print('F1 measure:',score)

F1 measure: 0.2607428987618354


#### Evaluting results

In [None]:
##### Здесь сравнение моделей


#### Collecting Results & Submission

In [None]:
##### Сборка результирующих csv-шников с результатами проверки гипотез из лучшей модели


result_df_example.to_csv('hypothesis_name.csv')