In [2]:
import igraph 
from sklearn.cross_validation import StratifiedKFold
import pandas as pd
from sklearn.grid_search import ParameterGrid
from sklearn.base import BaseEstimator
import numpy as np
import sklearn.cluster
import sys
sys.path.insert(0, "/home/nex/Dropbox (GMUM)/ujDOK1/SCFP/mol2vec//mol2vec")
from training_data.datasets import CVBaseChemDataset, BaseChemDataset
import scipy


In [2]:
# similarity - similarity array
# predicted - predicted clusters
def stagnieszko_score(similarity, predicted):
    assert similarity.shape[0]==predicted.shape[0]
    assert similarity.shape[1]==predicted.shape[0]
    
    thresholds = np.unique(similarity)
    known = zip(*similarity.nonzero()) # to nie jest prawda! bo może być znany i mieć 0! reprezentacja!
    best = 0
    
    for threshold in thresholds:
        score = 0
        n_checked = 0
        for a, b in known:
            n_checked+=1
            if a >= b: # dzięki temu sprawdzamy każdą parę jeden raz
                continue
            if similarity[a, b] >= threshold and predicted[a] == predicted[b]:
                score += 1
            elif similarity[a, b] < threshold and predicted[a] != predicted[b]:
                score += 1
        if score > best:
            best = score
    
    # wszystkie pary, które wylądowały w dobrym klastrze przez liczba par (bez par związek sam ze sobą)
    # sprawdzić, czy przyjmuje wartości z przedziału [0, 1]
    return (2.*best)/n_checked

# problem: co jak known is None? ZAPEWNIC ZE SIE NIE ZDARZY

In [3]:
def load_miu(file_name, seed):
    np.random.seed(seed)
    miu = scipy.sparse.csr_matrix((n_samples, n_samples))
    # diagonal can be done better with identity
    for i in xrange(n_samples):
        miu[i,i] = 1
    for i in xrange(int(0.1*n_samples)):
        a = np.random.randint(0, n_samples)
        b = np.random.randint(0, n_samples)
        if a!=b:
            miu[a, b] = np.random.rand()
            miu[b, a] = np.random.rand()
    return miu
# loads miu defining its final structure based on chembls and X

In [4]:
# data_X - samples to be clustered
# model , model_params - BaseEstimator, ParameterGrid
# miu - array with similarities
# threshold - how similar must be two compounds to be similar
# seed - seed for experiment
def run_experiment(data_X, model, model_params, param_types, miu, seed):    
    assert isinstance(model_params, ParameterGrid)
    assert isinstance(model, BaseEstimator)
    
    np.random.seed(seed) # setting the seed for numpy.random methods DO I EVEN USE IT?
    n_folds = 3
    hyperparams_names = sorted(param_types.keys())
    
    data_X = data_X.todense()
    test_precentage = 0.1  # one tenth will be for final testing TO BE CHANGED
    test_start_index = test_precentage*data_X.shape[0]
    test_X = data_X[test_start_index:,:]
    # can't do so with miu OR CAN?
    miu = np.array(miu.todense())
    miu_train_val = miu[:test_start_index, :test_start_index]
    miu_test = miu[test_start_index:, test_start_index:]
    
    # createnumpy temp array for this experiment
    n_param_sets = len(list(model_params))
    n_hyperparams = len(list(model_params)[0].keys())
    results = np.ndarray((n_folds*n_param_sets, n_hyperparams+2))
    # columns are: all parameters along with their names + fold number + score 
    
    idx = -1
    fold = -1
    # tu powinien byc jakis podzial najlepszy wyznaczony
    skf = StratifiedKFold(data_y[:test_start_index], n_folds=n_folds, shuffle=False, random_state=seed)
    for tr_idx, val_idx in skf:
        fold += 1
        not_val_X = data_X[tr_idx]
        val_X = data_X[val_idx]
        tr_X = np.vstack((not_val_X, val_X))
        miu_val = miu[val_idx]
        miu_val = miu_val[:, val_idx]
        
        for params in list(model_params):
            mod = model.set_params(**params)
            mod.fit(tr_X)
            predictions = mod.predict(val_X)
            score = stagnieszko_score(miu_val, predictions)
            
            # saving results to an array
            idx+=1         
            temp = [params[key] for key in hyperparams_names]
            temp.append(fold)
            temp.append(score)
            results[idx] = tuple(temp)
            
    # pandise and save the cross validation results
    cols = list(hyperparams_names)  # list() on list to have a copy
    cols.append('fold')
    cols.append('score')
    df = pd.DataFrame(data=results, columns=cols)
    for key in param_types.keys():
        df[key] = df[key].astype(param_types[key])
    
    df.to_csv('cv_results')
    print df
           
    # averaging over folds
    df2 = df.groupby(hyperparams_names).mean()
    # choosing model that performed best
    df3 = df2[df2['score'] == df2['score'].max()]
    print df3
    best_params = dict(zip(hyperparams_names, df3.index.tolist()[0]))
    print 'best params are', best_params

    # training final model
    final_model = model.set_params(**best_params)
    final_model.fit(data_X)
    preds = final_model.predict(test_X)
    final_score = stagnieszko_score(miu_test, preds)
    
    cols = list(hyperparams_names)
    final_results = [best_params[key] for key in cols]
    cols.append('score')
    final_results.append(final_score)
    final_df = pd.DataFrame.from_items(zip(cols, [np.array([item]) for item in final_results]))
    print final_df
    final_df.to_csv('final_results')
    return final_df


In [5]:
(data_X, data_y), _ = BaseChemDataset(representation="KR", compound='5-HT1a', valid_size=0.0).get_data()
n_samples = data_X.shape[0]
model = sklearn.cluster.KMeans()
params = ParameterGrid({'n_clusters':[4, 5], 'max_iter':[100, 200]})
param_types = {'n_clusters': 'int', 'max_iter':'int'}

seed = 43
np.random.seed(seed)

miu = load_miu('miu.libsvm or any other format please', seed)
threshold = .5 # what do we do with thresholding?

df = run_experiment(data_X, model, params, param_types, miu, seed)



    max_iter  n_clusters  fold     score
0        100           4   0.0  0.000000
1        100           5   0.0  0.000000
2        200           4   0.0  0.000000
3        200           5   0.0  0.000000
4        100           4   1.0  0.011173
5        100           5   1.0  0.011173
6        200           4   1.0  0.011173
7        200           5   1.0  0.011173
8        100           4   2.0  0.000000
9        100           5   2.0  0.000000
10       200           4   2.0  0.000000
11       200           5   2.0  0.000000
                     fold     score
max_iter n_clusters                
100      4            1.0  0.003724
         5            1.0  0.003724
200      4            1.0  0.003724
         5            1.0  0.003724
best params are {'max_iter': 100, 'n_clusters': 4}
   max_iter  n_clusters     score
0       100           4  0.090281


In [6]:
print n_samples

5321


In [7]:
df2.index.tolist()

NameError: name 'df2' is not defined

In [None]:
df2.iloc[0]

In [None]:
print df
hn = ['max_iter', 'n_clusters']
df2 = df.groupby(hn).mean()
# choosing model that performed best
df3 = df2[df2['score'] == df2['score'].max()]
print dict(zip(hn, df3.index.tolist()[0]))

In [None]:
a = df['score']

In [None]:
a = df['score']

In [None]:
print a

In [None]:
print miu