In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_table('Skin_NonSkin.txt', header=None)
data = data.rename(index=str, columns={0: "0", 1: "1", 2: "2", 3:"3"})

In [3]:
#первые 10 строчек
data.head(10)

Unnamed: 0,0,1,2,3
0,74,85,123,1
1,73,84,122,1
2,72,83,121,1
3,70,81,119,1
4,70,81,119,1
5,69,80,118,1
6,70,81,119,1
7,70,81,119,1
8,76,87,125,1
9,76,87,125,1


In [4]:
#второй столбец
data["0"].head(10)

0    74
1    73
2    72
3    70
4    70
5    69
6    70
7    70
8    76
9    76
Name: 0, dtype: int64

In [5]:
#2-я строчка
data.loc['1']

0     73
1     84
2    122
3      1
Name: 1, dtype: int64

In [13]:
#какое то условие для определенного признака
data[data['3'] == 2]

Unnamed: 0,0,1,2,3
50859,198,198,158,2
50860,198,198,158,2
50861,198,198,158,2
50862,198,198,158,2
50863,198,198,158,2
50864,198,198,158,2
50865,198,198,158,2
50866,198,198,158,2
50867,198,198,158,2
50868,198,198,158,2


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1
], test_size=0.3, random_state=0)

In [8]:
#Сделаем предсказния разными методами
methods = [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
for method in methods:
    method.fit(X_train, y_train)
    y_predict = method.predict(X_test)
    acc = sum(y_test == y_predict)/len(y_predict)
    print(f'Метод: {str(method)}.\n Качество предсказания: {acc}.\n')

Метод: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False).
 Качество предсказания: 0.9186457738240975.

Метод: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False).
 Качество предсказания: 0.9994287113359993.

Метод: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform').
 Качество предсказания: 0.9994967218

In [48]:
#преобразования для признаков
for i in range(4):
    for j in range(i+1,4):
        data[f"|{i} - {j}|"] = abs(data[f'{i}'] - data[f'{j}'])
        data[f"|{i} * {j}|"] = data[f'{i}'] * data[f'{j}']
data.head(10)

Unnamed: 0,0,1,2,3,|0 - 1|,|0 * 1|,|0 - 2|,|0 * 2|,|0 - 3|,|0 * 3|,|1 - 2|,|1 * 2|,|1 - 3|,|1 * 3|,|2 - 3|,|2 * 3|
0,74,85,123,1,11,6290,49,9102,73,74,38,10455,84,85,122,123
1,73,84,122,1,11,6132,49,8906,72,73,38,10248,83,84,121,122
2,72,83,121,1,11,5976,49,8712,71,72,38,10043,82,83,120,121
3,70,81,119,1,11,5670,49,8330,69,70,38,9639,80,81,118,119
4,70,81,119,1,11,5670,49,8330,69,70,38,9639,80,81,118,119
5,69,80,118,1,11,5520,49,8142,68,69,38,9440,79,80,117,118
6,70,81,119,1,11,5670,49,8330,69,70,38,9639,80,81,118,119
7,70,81,119,1,11,5670,49,8330,69,70,38,9639,80,81,118,119
8,76,87,125,1,11,6612,49,9500,75,76,38,10875,86,87,124,125
9,76,87,125,1,11,6612,49,9500,75,76,38,10875,86,87,124,125


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, data.columns != 3], data.iloc[:, 3
], test_size=0.3, random_state=0)

In [12]:
methods = [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
for method in methods:
    method.fit(X_train, y_train)
    y_predict = method.predict(X_test)
    acc = sum(y_test == y_predict)/len(y_predict)
    print(f'Метод: {str(method)}.\n Качество предсказания: {acc}.\n')

Метод: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False).
 Качество предсказания: 1.0.

Метод: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False).
 Качество предсказания: 1.0.

Метод: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform').
 Качество предсказания: 0.9994967218912375.



Преобразования помогли нам делать более точные предсказания.