In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2
from trainer import train_evaluate

from feature_selection import get_k_best
import json

In [3]:
def get_data(files):
    """Load one tab-separated integer matrix per file and stack them into a labelled dataset.

    Every file is read with ``np.loadtxt`` and transposed, so each column of a
    file becomes one row of the result. Rows that come from ``files[i]`` get
    the label ``i``; the labels therefore depend on the order of ``files``.

    Parameters
    ----------
    files : sequence of str
        Paths of the input files. After transposition all files must have the
        same number of columns, otherwise the final concatenation fails.

    Returns
    -------
    X : numpy.ndarray
        All transposed matrices stacked vertically, without the label column.
    Y : numpy.ndarray
        The label (position of the source file in ``files``) of each row of ``X``.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    if len(files) == 0:
        raise ValueError("get_data() needs at least one input file")

    labelled_parts = []
    for label, path in enumerate(files):
        # ndmin=2 keeps single-row / single-column files two-dimensional, so a
        # file with one column is transposed into one row instead of being
        # treated as a 1-D vector of separate rows.
        samples = np.loadtxt(path, dtype='i', delimiter='\t', ndmin=2).T
        labels = [label] * len(samples)
        labelled_parts.append(np.column_stack((samples, labels)))

    # Concatenate once instead of re-copying the growing matrix per file.
    data_matrix = np.concatenate(labelled_parts, axis=0)

    X = data_matrix[:, :-1]
    Y = data_matrix[:, -1]
    return X, Y

In [4]:
# Sort the paths: glob returns them in a filesystem-dependent order, and
# get_data labels each file by its position in this list, so an unsorted list
# would make the class labels differ between machines and runs.
files = sorted(glob.glob('dataset/*.txt'))
X, Y = get_data(files)

# Last expression of the cell: displayed as the cell output.
get_k_best(X,Y,10)

array([[  5,  20, 135, ...,   1,   1,   0],
       [  2,  18, 127, ...,   1,   1,   1],
       [  1,  12, 164, ...,   0,   1,   0],
       ...,
       [  3,  15, 191, ...,   0,   0,   1],
       [  6,  19, 208, ...,   0,   1,   0],
       [  1,   1, 191, ...,   1,   1,   0]])

In [9]:
# Class balance: the distinct labels in Y and how many rows carry each of them.
class_labels, class_counts = np.unique(Y, return_counts=True)
class_labels, class_counts

(array([0, 1, 2, 3, 4]), array([ 68, 142, 230, 263, 198], dtype=int64))

In [4]:
# Fit a chi-squared univariate selector on the whole dataset.
fvalue_selector = SelectKBest(chi2, k=10).fit(X, Y)

# Per-feature scores produced by the fit.
rank = fvalue_selector.scores_

# Positions of the ten highest scores, highest first ...
indexes = np.argsort(rank)[::-1][:10]
# ... and the corresponding score values, in the same order.
top_rank = [rank[position] for position in indexes]

In [5]:
# Hyper-parameter grid walked by the search loop.
layer_sizes = [10, 20, 30]
momentum_values = [0, 0.9]

# Number of consecutive non-improving feature counts tolerated before the
# search over feature counts is stopped for one configuration.
max_patience = 100

# CSV file the per-run results are written to.
filename = "resultaty5.csv"

# 2 splits repeated 5 times -> 10 train/test splits, seeded for repeatability.
rkf = RepeatedStratifiedKFold(
    n_splits=2,
    n_repeats=5,
    random_state=3,
)

In [6]:
# Grid search: for every cross-validation split from `rkf`, every layer size and
# every momentum value, try an increasing number of selected features and record
# the score of each run.
result_columns = ["fold", "layer_size", "momentum_value", "feature_number", "score"]
records = []  # one dict per train_evaluate call; converted to a DataFrame once at the end
best_score = 0
best_matrix = []
best_params = {}
n_features = np.shape(X)[1]

for fold, (train, test) in enumerate(rkf.split(X, Y)):
    for layer_size in layer_sizes:
        for momentum_value in momentum_values:
            # Early stopping over the feature count: stop once `max_patience`
            # feature counts in a row fail to beat this configuration's best.
            patience = max_patience
            temp_best_score = 0
            temp_best_matrix = []
            temp_best_params = {}
            for feature_number in range(1, n_features + 1):
                # NOTE(review): get_k_best receives the full X/Y, not only the
                # `train` rows - if it selects features from the labels, the
                # test rows influence the selection. Confirm this is intended.
                score, matrix = train_evaluate(X=get_k_best(X, Y, feature_number),
                                               Y=Y,
                                               momentum_value=momentum_value,
                                               layer_size=layer_size,
                                               train=train,
                                               test=test)
                params = {"fold": fold,
                          "layer_size": layer_size,
                          "momentum_value": momentum_value,
                          "feature_number": feature_number,
                          "score": score}
                # Appending to a list is O(1); DataFrame.append copied the whole
                # frame on every call and no longer exists in pandas >= 2.0.
                records.append(params)

                if score > temp_best_score:
                    temp_best_matrix = matrix
                    temp_best_score = score
                    patience = max_patience
                    temp_best_params = params
                else:
                    patience -= 1
                    if patience == 0:
                        break

            if temp_best_score > best_score:
                # Track the running maximum; without this assignment every
                # configuration with a positive score overwrote the "best"
                # result, leaving the last one instead of the best one.
                best_score = temp_best_score
                best_matrix = temp_best_matrix
                best_params = temp_best_params
    print(f"koniec_folda{fold}")

results = pd.DataFrame(records, columns=result_columns)

koniec_folda0
koniec_folda1
koniec_folda2
koniec_folda3
koniec_folda4
koniec_folda5
koniec_folda6
koniec_folda7
koniec_folda8
koniec_folda9




In [8]:
# Store the best parameter set as JSON. Mode 'x' creates the file exclusively,
# so this raises FileExistsError instead of overwriting an earlier result.
with open("best_params4.json", 'x') as params_file:
    params_file.write(json.dumps(best_params))

# Store the matrix that belongs to the best run as plain text.
np.savetxt("best_matrix4.txt", best_matrix)

In [7]:
# Write every recorded run to the CSV named by `filename`. With to_csv's
# defaults the DataFrame index is written as an unnamed first column.
results.to_csv(filename)

In [10]:
# Reload the saved results. The file was written by results.to_csv(filename)
# with the default index column, so read that first column back as the index
# instead of letting it appear as a spurious "Unnamed: 0" data column.
resultaty = pd.read_csv(filename, index_col=0)