In [2]:
from src.data_reader import Data_reader
from src.feature_provider import Feature_provider
from src.data_reader import get_labels

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import scipy
import time

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [191]:
# Read the training matrix and wrap it in a Feature_provider.
conf = {'file_path': 'data/dorothea_train.data'}
reader = Data_reader(conf)
feature_provider = Feature_provider(reader.get_matrix())


# Same for the validation split. NOTE: `conf` and `reader` are rebound here, so
# after this cell `reader` points at the validation file, not the training one.
conf = {'file_path': 'data/dorothea_valid.data'}
reader = Data_reader(conf)
valid_feature_provider = Feature_provider(reader.get_matrix())



# Raw labels for both splits, as returned by get_labels.
y_train = get_labels('data/dorothea_train.labels')
y_val = get_labels('data/dorothea_valid.labels')

def transform_labels(x):
    """Return a list with every label ``a`` of ``x`` mapped to ``int((a + 1) / 2)``.

    For labels encoded as -1 / +1 this produces 0 / 1.
    """
    return list(map(lambda label: int((label + 1) / 2), x))

# Rebind both label lists to their remapped form (see transform_labels).
y_train = transform_labels(y_train)
y_val = transform_labels(y_val)

In [196]:
# Per-feature (column) totals over the training matrix. A 2-D sparse matrix only
# accepts axis 0 or 1 (or their negative forms); axis=0 collapses the rows and
# leaves one total per column, which is what the column selection below needs.
licznosci = feature_provider.data.tocsc().sum(axis = 0)

# Indices of the columns whose total exceeds 30. np.flatnonzero always returns a
# 1-D result, so a single surviving column still yields a one-element list
# (squeeze() would collapse it to a bare int and change the slice below).
indexy = np.flatnonzero(np.asarray(licznosci).ravel() > 30.).tolist()


# Keep the same columns in both splits so feature indices stay aligned.
feature_provider = Feature_provider(feature_provider.data[:, indexy])
valid_feature_provider = Feature_provider(valid_feature_provider.data[:, indexy])

In [197]:
# Column count of the training matrix; read as a global by mutate,
# create_population and get_importances.
num_features = feature_provider.data.shape[1]

In [198]:
def eval_feature_set(features):
    """Score a feature subset by validation accuracy.

    Slices the columns listed in ``features`` out of the global training and
    validation providers, fits a default ``LogisticRegression`` on the training
    slice with ``y_train`` and returns ``accuracy_score`` of its predictions on
    the validation slice against ``y_val``.
    """
    train_slice = feature_provider.get_slice(features, sparse_output = True)
    valid_slice = valid_feature_provider.get_slice(features, sparse_output = True)

    classifier = LogisticRegression().fit(train_slice, y_train)
    predictions = classifier.predict(valid_slice)
    return accuracy_score(y_val, predictions)

In [199]:
# Rough timing: 100 evaluations of one fixed subset (feature indices 0..999);
# prints the elapsed wall-clock seconds.
beg = time.time()
for i in range(100):
    eval_feature_set(list(range(1000)))
print(time.time() - beg)

1.621340036392212


In [200]:
# NOTE(review): if the cells ran in the order shown, `reader` was last bound to
# the validation file, so this is the validation matrix. `mat` is not used by
# any cell visible in this notebook.
mat = reader.get_matrix()

In [201]:
def mate(x, y):
    """Cross two individuals and return two complementary children.

    A copy of ``x.arr`` is shuffled first; then, independently for every
    position, a coin flip decides which child receives the gene from the
    shuffled copy and which receives the gene from ``y.arr``. Neither parent
    is modified.

    Returns:
        A pair of new ``ind`` objects.
    """
    donor = x.arr.copy()
    np.random.shuffle(donor)

    # 1 where the first child takes its gene from the shuffled copy of x.
    pick = (np.random.random(size = x.arr.shape) > .5).astype(int)
    rest = 1 - pick

    first = pick*donor + rest*y.arr
    second = rest*donor + pick*y.arr
    return ind(first), ind(second)
    

In [202]:

def mutate(x: "ind", mut_proba = 0.1):
    """Return a mutated copy of the individual ``x``.

    Every gene of ``x.arr`` is kept when its uniform draw exceeds
    ``mut_proba``; otherwise it is replaced by a random feature index drawn
    from ``[0, num_features)``. ``x`` itself is left untouched.

    Args:
        x: individual exposing an ``arr`` attribute (an ``ind``, not a bare
            ndarray — the body reads ``x.arr``).
        mut_proba: threshold for the per-gene uniform draw; draws at or below
            it trigger a replacement.

    Returns:
        A new ``ind`` holding the mutated genome.
    """
    # Candidate replacements are drawn for every position; only masked-out
    # positions actually use them.
    new_features = np.random.randint(num_features, size = x.arr.shape)

    # 1 = keep the original gene, 0 = take the replacement.
    keep = np.where(np.random.random(size = x.arr.shape) > mut_proba, 1, 0)
    return ind(keep*x.arr + (1-keep)*new_features)

In [203]:
class ind:
    """One individual of the genetic algorithm: a genome plus its fitness.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, arr):
        # Genome exactly as passed in (the GA helpers store feature indices here).
        self.arr = arr
        # Starts at 0; overwritten by evaluate_pop.
        self.fitness = 0

    def __mul__(self, other):
        """Return a NEW individual whose genome is ``self.arr * other``.

        The operand is not modified: a binary ``*`` that mutates its left-hand
        side surprises callers and corrupts individuals that are still
        referenced elsewhere. The result starts with fitness 0 because the
        scaled genome has not been evaluated.
        """
        return type(self)(self.arr * other)

    def __rmul__(self, other):
        """Mirror of ``__mul__`` for ``other * individual``."""
        return self.__mul__(other)

In [204]:
def create_population(population_size, individual_size):
    """Build a random initial population.

    Returns a list of ``population_size`` individuals, each wrapping
    ``individual_size`` feature indices drawn with ``np.random.randint`` from
    ``[0, num_features)`` (repeats within one individual are possible).
    """
    return [
        ind(np.random.randint(num_features, size = individual_size))
        for _ in range(population_size)
    ]

In [205]:
def select_k_best(population, k):
    """Return the ``k`` individuals with the highest ``fitness``, best first.

    Ties keep their relative order from ``population``. The input is not
    modified. Raises ``AssertionError`` when ``population`` has fewer than
    ``k`` members.
    """
    assert len(population) >= k
    ranked = list(population)
    ranked.sort(key = lambda member: member.fitness, reverse = True)
    return ranked[:k]

In [206]:
def evaluate_pop(pop, eval_func):
    """Store ``eval_func(member.arr)`` in ``member.fitness`` for every member of ``pop``.

    Works in place, in iteration order, and returns ``None``.
    """
    for member in pop:
        score = eval_func(member.arr)
        member.fitness = score

In [207]:
def mate_pop(pop):
    """Breed one batch of offspring from ``pop``.

    The list is shuffled, then the i-th individual from the front is mated
    with the i-th from the back, so every individual takes part in exactly one
    crossover. With an odd number of individuals the middle one has no partner
    and is carried over as-is (the same object, not a copy).

    Args:
        pop: list of individuals. Caution: it is shuffled IN PLACE.

    Returns:
        A new list with exactly ``len(pop)`` individuals (empty for an empty
        input).
    """
    # Caution: this shuffles the caller's list.
    np.random.shuffle(pop)
    size = len(pop)
    new_pop = []
    for i in range(size // 2):
        # Mirror pairing (front i with back i) instead of always reusing the
        # last element as the second parent.
        n1, n2 = mate(pop[i], pop[size - 1 - i])
        new_pop.append(n1)
        new_pop.append(n2)

    if size % 2 == 1:
        # Only an odd-sized population leaves someone unpaired: the middle one.
        new_pop.append(pop[size // 2])

    return new_pop
        

In [208]:
def mutate_pop(pop, proba):
    """Replace randomly chosen individuals of ``pop`` by mutated copies, in place.

    Each position is selected independently when its uniform draw is at or
    below ``proba``; a selected slot is overwritten with ``mutate(pop[i])``
    (using mutate's default per-gene probability). Returns ``None``.
    """
    chosen = np.random.random(size = len(pop)) <= proba
    # flatnonzero always gives a 1-D index array, so the loop also works when
    # exactly one (or no) individual is selected.
    for i in np.flatnonzero(chosen).tolist():
        pop[i] = mutate(pop[i])

In [215]:
# Genetic search over feature subsets: 1000 individuals of 70 feature indices
# each, evolved for 100 generations; the whole run is timed.
# NOTE(review): no random seed is set in the visible cells, so reruns will differ.
beg = time.time()
p = create_population(1000, 70)

for i in range(100):
    # Fitness is eval_feature_set, i.e. accuracy on the validation labels.
    evaluate_pop(p, eval_feature_set)
    # Keep the better half and append the offspring bred from it.
    best_half = select_k_best(p, len(p)//2)
    p = best_half + mate_pop(best_half)
    # Applied to the whole list, so survivors can be replaced by mutants too.
    mutate_pop(p, 0.15)
    
# Score the final population so every individual carries a current fitness.
evaluate_pop(p, eval_feature_set)
time.time() - beg

781.3497290611267

In [216]:
# Re-score the population. NOTE(review): the GA cell already ends with this
# same call, so this looks redundant unless `p` was changed in between.
evaluate_pop(p, eval_feature_set)

In [217]:
# Fitness values of the 100 best individuals, best first.
[i.fitness for i in select_k_best(p, 100)]

[0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96,
 0.96]

In [218]:
# The ten fittest individuals of the final population.
best10 = select_k_best(p, 10)

In [220]:
# Feature indices that the two top-ranked individuals have in common.
set(best10[0].arr) & set(best10[1].arr)

{23, 57, 156, 376, 471, 570, 753, 844, 875, 1179, 1277, 1279, 1536, 1731, 2098}

In [222]:
def get_importances(pop, n_features = None):
    """Accumulate a per-feature importance score from a population.

    Every individual adds its ``fitness`` to each feature index present in its
    ``arr``. An index that occurs several times inside one individual is
    credited only once for that individual, because NumPy's ``a[idx] += v``
    does not accumulate over repeated indices.

    Args:
        pop: iterable of individuals exposing ``arr`` (integer feature
            indices) and ``fitness``.
        n_features: length of the result. Defaults to the notebook-level
            ``num_features`` so existing one-argument calls keep working.

    Returns:
        1-D float array of length ``n_features`` with the summed fitness per
        feature index.
    """
    if n_features is None:
        # Fall back to the notebook-level feature count.
        n_features = num_features

    importances = np.zeros(n_features)

    for member in pop:
        importances[member.arr] += member.fitness

    return importances

In [235]:
# Rank all feature indices by accumulated importance, highest first
# (ties keep ascending index order because sorted() is stable).
importances = get_importances(p)
sorted_indeces = sorted(list(range(len(importances))),key = lambda x: importances[x], reverse = True)

In [255]:
# Validation accuracy when using only the k top-ranked features, for k = 1..30.
for i in range(30):
    print(i+1, eval_feature_set(sorted_indeces[:i+1]))

1 0.9485714285714286
2 0.9485714285714286
3 0.9514285714285714
4 0.9542857142857143
5 0.9542857142857143
6 0.9542857142857143
7 0.9571428571428572
8 0.9571428571428572
9 0.9571428571428572
10 0.96
11 0.96
12 0.96
13 0.96
14 0.96
15 0.96
16 0.96
17 0.96
18 0.96
19 0.96
20 0.96
21 0.96
22 0.96
23 0.96
24 0.96
25 0.96
26 0.96
27 0.96
28 0.96
29 0.96
30 0.96


In [257]:
# Validation accuracy of the 10 top-ranked features on their own.
eval_feature_set(sorted_indeces[:10])

0.96

In [237]:
from sklearn.metrics import confusion_matrix

In [284]:
# Refit a default logistic regression on the 10 top-ranked features.
# NOTE(review): get_slice is called without sparse_output=True here, unlike in
# eval_feature_set — confirm the default output is what the model should get.
clf = LogisticRegression()
clf.fit(feature_provider.get_slice(sorted_indeces[:10]), y_train)

# The trailing backslash joins both expressions into one tuple:
# (training accuracy, validation accuracy).
# NOTE(review): the features were ranked using validation-accuracy fitness, so
# the validation figure is likely optimistic — a held-out test split would be needed.
accuracy_score(y_true=y_train, y_pred=clf.predict(feature_provider.get_slice(sorted_indeces[:10]))),\
accuracy_score(y_true=y_val, y_pred=clf.predict(valid_feature_provider.get_slice(sorted_indeces[:10])))

(0.93625, 0.96)