In [15]:
import numpy as np
from sklearn.utils import Bunch
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import random
import pandas as pd

In [20]:
# Build a training dataset (split features and target) from a DataFrame.
def load_my_fancy_dataset(df):
    """Convert a DataFrame into a scikit-learn style ``Bunch``.

    The last column of ``df`` is taken as the target; every preceding column
    is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the label and whose other columns hold
        values convertible to ``float``.

    Returns
    -------
    Bunch
        ``data``: float array of shape ``(n_rows, n_columns - 1)``;
        ``target``: integer array of shape ``(n_rows,)``;
        ``feature_names``: ``df.columns[:-1]``.

    Raises
    ------
    ValueError
        If a feature value cannot be converted to ``float`` or a label cannot
        be converted to ``int`` (e.g. a missing label).
    """
    # Convert whole columns at once instead of looping over ``iterrows``:
    # much faster on large frames, and an empty frame still yields a 2-D
    # ``(0, n_features)`` matrix rather than a 1-D empty array.
    data = df.iloc[:, :-1].to_numpy(dtype=float)
    # ``astype(int)`` truncates floats like ``int()`` does and raises on NaN
    # instead of silently producing garbage integers.
    target = df.iloc[:, -1].astype(int).to_numpy()
    return Bunch(data=data, target=target, feature_names=df.columns[:-1])


In [17]:
# Integer codes used to encode the textual localization values.
# Note: 'genital' deliberately shares code 6 with 'oral/genital'.
localization_dict_val = dict(
    [
        ("lower extremity", 0),
        ("torso", 1),
        ("upper extremity", 2),
        ("head/neck", 3),
        ("unknown", 4),
        ("palms/soles", 5),
        ("oral/genital", 6),
        ("scalp", 7),
        ("ear", 8),
        ("face", 9),
        ("back", 10),
        ("trunk", 11),
        ("chest", 12),
        ("abdomen", 13),
        ("genital", 6),
        ("neck", 14),
        ("hand", 15),
        ("foot", 16),
        ("acral", 17),
    ]
)

In [18]:
# Load the training dataset from CSV and preview its first five rows.
result = pd.read_csv(filepath_or_buffer='dataset/merged_for_sklearn.csv')
result.head(n=5)

Unnamed: 0,advantage,sex,age,localization,target
0,1.896287,0.0,60,11,1
1,2.1562,1.0,40,1,0
2,1.93657,0.0,45,1,0
3,0.86949,0.0,50,9,0
4,2.413886,1.0,43,2,1


In [19]:
# Shuffle the rows. The fixed random_state keeps the row order -- and hence
# the train/test split derived from it -- identical after a kernel restart,
# so metrics of the different models are computed on the same test set.
from sklearn.utils import shuffle
result = shuffle(result, random_state=42)

In [21]:
dataset = load_my_fancy_dataset(result) # prepare the dataset for training (features + target)

In [22]:
# Hold out 25% of the samples as a test set; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=2,
)

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier on the training split.
gb = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    random_state=42,
)
gb.fit(X_train, y_train)

In [24]:
# Score the gradient-boosting model on the held-out test split.
y_pred_gb = gb.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_gb))
print(classification_report(y_true=y_test, y_pred=y_pred_gb))

Accuracy: 0.8722970855531181
              precision    recall  f1-score   support

           0       0.85      0.93      0.89      3511
           1       0.91      0.80      0.85      2871

    accuracy                           0.87      6382
   macro avg       0.88      0.87      0.87      6382
weighted avg       0.88      0.87      0.87      6382



In [None]:
import pickle
from pathlib import Path

# Persist the fitted gradient-boosting model, then read it back as a
# round-trip check.
model_path = Path('models/model_sklearn_1_1.pkl')
# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

# save
with open(model_path, 'wb') as f:
    pickle.dump(gb, f)

# load
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open(model_path, 'rb') as f:
    gb2 = pickle.load(f)

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier and score it on the held-out test split.
forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=100,
    random_state=42,
)
forest.fit(X_train, y_train)

y_pred_forest = forest.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_forest))
print(classification_report(y_true=y_test, y_pred=y_pred_forest))

Accuracy: 0.850987151363209
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      3513
           1       0.84      0.82      0.83      2869

    accuracy                           0.85      6382
   macro avg       0.85      0.85      0.85      6382
weighted avg       0.85      0.85      0.85      6382



In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier and score it on the held-out test split.
# NOTE(review): the features are passed in unscaled; a distance-based model
# may be dominated by the columns with the largest numeric range -- consider
# standardising them first. TODO confirm against the data.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_knn))
print(classification_report(y_true=y_test, y_pred=y_pred_knn))

Accuracy: 0.8423691632717016
              precision    recall  f1-score   support

           0       0.81      0.93      0.87      3513
           1       0.89      0.74      0.81      2869

    accuracy                           0.84      6382
   macro avg       0.85      0.83      0.84      6382
weighted avg       0.85      0.84      0.84      6382

