Wczytanie bibliotek

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from sklearn import grid_search
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
import warnings

Ignorowanie ostrzeżeń

In [2]:
warnings.filterwarnings('ignore')

Wczytanie danych

In [3]:
# Read the semicolon-separated summary table; the listed tokens are parsed as missing values.
df = pd.read_csv(
    'all_summary.txt',
    sep=";",
    na_values=["NA", "NaN", "nan", "n/a", ""],
    low_memory=False,
)
# Keep rows holding at least one non-missing value ...
df = df.dropna(thresh=1)
# ... and, of those, only rows where res_name is present.
df = df[df["res_name"].notnull()]

Filtrowanie danych: odrzucenie wybranych wartości res_name oraz duplikatów par (pdb_code, res_name)

In [4]:
# res_name values that are excluded from the analysis.
excluded_res_names = [
    'DA', 'DC', 'DT', 'DU', 'DG', 'DI',
    'UNK', 'UNX', 'UNL',
    'PR', 'PD', 'Y1', 'EU', 'N', '15P', 'UQ', 'PX4', 'NAN',
]
is_excluded = df["res_name"].isin(excluded_res_names)
# Drop the excluded rows, then keep a single row per (pdb_code, res_name) pair.
df = df[~is_excluded].drop_duplicates(subset=['pdb_code', 'res_name'])

Usuwanie rzadkich klas (wartości res_name występujących nie więcej niż 5 razy)

In [5]:
# Count how often each res_name occurs and keep only the classes seen more than 5 times.
# `values` holds the counts of the retained classes (classes at or below the threshold
# become NaN in where() and are removed by dropna()).
res_name_counts = df["res_name"].value_counts()
values = res_name_counts.where(res_name_counts > 5).dropna()
# Pass the index straight to isin(): Index.get_values() is deprecated in newer pandas
# releases and later removed, while isin() accepts an Index on old and new versions alike.
df = df[df["res_name"].isin(values.index)]

Przygotowanie danych pod klasyfikację

In [6]:
# dtype names that select the feature columns.
# NOTE(review): 'float64' is not in this list, so float64 columns are left out of X —
# confirm whether that is intentional.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32',
]
# Target: the res_name label; features: every column whose dtype is listed above.
Y = df["res_name"]
X = df.select_dtypes(include=numerics)

Klasyfikacja [res_name]

In [7]:
# Grid search over the number of neighbours for a k-NN classifier predicting res_name.
# The n_neighbors passed to the constructor is only a placeholder: the grid below
# overrides it for every candidate.
clf = KNeighborsClassifier(n_neighbors=7)
param = dict(n_neighbors=range(1, 20))  # candidate values 1..19
# res_name is a multiclass target, so the averaging mode is requested explicitly.
# Plain 'precision' only worked through a deprecated implicit fallback to weighted
# averaging (the warning is hidden by the global filter) and is rejected for
# multiclass targets by newer scikit-learn releases.
result1 = grid_search.GridSearchCV(clf, param_grid=param, scoring='precision_weighted')
result1.fit(X, Y)
print(result1.grid_scores_)

[mean: 0.10786, std: 0.00639, params: {'n_neighbors': 1}, mean: 0.10099, std: 0.01094, params: {'n_neighbors': 2}, mean: 0.09018, std: 0.00457, params: {'n_neighbors': 3}, mean: 0.09043, std: 0.00677, params: {'n_neighbors': 4}, mean: 0.08456, std: 0.00659, params: {'n_neighbors': 5}, mean: 0.08106, std: 0.00700, params: {'n_neighbors': 6}, mean: 0.07853, std: 0.00670, params: {'n_neighbors': 7}, mean: 0.07791, std: 0.00494, params: {'n_neighbors': 8}, mean: 0.07256, std: 0.00362, params: {'n_neighbors': 9}, mean: 0.07071, std: 0.00378, params: {'n_neighbors': 10}, mean: 0.06843, std: 0.00116, params: {'n_neighbors': 11}, mean: 0.06443, std: 0.00073, params: {'n_neighbors': 12}, mean: 0.06685, std: 0.00348, params: {'n_neighbors': 13}, mean: 0.06629, std: 0.00443, params: {'n_neighbors': 14}, mean: 0.06705, std: 0.00190, params: {'n_neighbors': 15}, mean: 0.06798, std: 0.00179, params: {'n_neighbors': 16}, mean: 0.06457, std: 0.00133, params: {'n_neighbors': 17}, mean: 0.06587, std: 0.

Wczytywanie grup

In [8]:
# Read the comma-separated group table; the listed tokens are parsed as missing values.
groups = pd.read_csv(
    'grouped_res_name.txt',
    sep=",",
    na_values=["NA", "NaN", "nan", "n/a", ""],
    low_memory=False,
)
# Keep rows with at least one non-missing value and name the two columns.
groups = groups.dropna(thresh=1)
cols = ['nr', 'res_name_group']
groups.columns = cols
# Convert the group labels to fixed-width byte strings (6 bytes each) and cut the
# array to the number of rows in df before attaching it as a new column.
# NOTE(review): labels are attached by position, not joined on a key — confirm that
# the row order of the group file matches df.
group_labels = np.asarray(groups["res_name_group"], dtype="|S6")
df['group'] = group_labels[:len(df)]
# New target: the group label; features are re-selected from the updated frame.
Y = df["group"]
X = df.select_dtypes(include=numerics)

Klasyfikacja [group]

In [9]:
# Repeat the grid search over n_neighbors, this time with the group label as target
# (reuses the `clf` estimator and `param` grid defined for the res_name search).
# The averaging mode is spelled out: plain 'precision' only worked for a non-binary
# target through a deprecated implicit fallback to weighted averaging and is
# rejected by newer scikit-learn releases.
result2 = grid_search.GridSearchCV(clf, param_grid=param, scoring='precision_weighted')
result2.fit(X, Y)
print(result2.grid_scores_)

[mean: 0.04285, std: 0.00275, params: {'n_neighbors': 1}, mean: 0.04879, std: 0.00313, params: {'n_neighbors': 2}, mean: 0.04853, std: 0.00501, params: {'n_neighbors': 3}, mean: 0.04842, std: 0.00211, params: {'n_neighbors': 4}, mean: 0.04777, std: 0.00054, params: {'n_neighbors': 5}, mean: 0.04743, std: 0.00190, params: {'n_neighbors': 6}, mean: 0.04314, std: 0.00369, params: {'n_neighbors': 7}, mean: 0.04350, std: 0.00194, params: {'n_neighbors': 8}, mean: 0.04450, std: 0.00186, params: {'n_neighbors': 9}, mean: 0.04343, std: 0.00162, params: {'n_neighbors': 10}, mean: 0.04314, std: 0.00095, params: {'n_neighbors': 11}, mean: 0.04320, std: 0.00040, params: {'n_neighbors': 12}, mean: 0.04264, std: 0.00163, params: {'n_neighbors': 13}, mean: 0.04383, std: 0.00166, params: {'n_neighbors': 14}, mean: 0.04280, std: 0.00151, params: {'n_neighbors': 15}, mean: 0.04254, std: 0.00045, params: {'n_neighbors': 16}, mean: 0.04302, std: 0.00131, params: {'n_neighbors': 17}, mean: 0.04336, std: 0.

Zapisywanie do pliku

In [10]:
# Persist the per-candidate grid-search scores of both searches.
# The last call is left as a bare expression so the notebook shows the list of
# written files as the cell output.
joblib.dump(value=result1.grid_scores_, filename='name_results.pkl')
joblib.dump(value=result2.grid_scores_, filename='group_results.pkl')

['group_results.pkl',
 'group_results.pkl_01.npy',
 'group_results.pkl_02.npy',
 'group_results.pkl_03.npy',
 'group_results.pkl_04.npy',
 'group_results.pkl_05.npy',
 'group_results.pkl_06.npy',
 'group_results.pkl_07.npy',
 'group_results.pkl_08.npy',
 'group_results.pkl_09.npy',
 'group_results.pkl_10.npy',
 'group_results.pkl_11.npy',
 'group_results.pkl_12.npy',
 'group_results.pkl_13.npy',
 'group_results.pkl_14.npy',
 'group_results.pkl_15.npy',
 'group_results.pkl_16.npy',
 'group_results.pkl_17.npy',
 'group_results.pkl_18.npy',
 'group_results.pkl_19.npy']