# Imports

In [1]:
import pandas as pd
import random
import itertools
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Genetic Class

In [7]:

class Genetic():
    """Genetic algorithm that searches for a clustering of dataframe columns.

    A chromosome is a list with one gene per column of ``dataframe``. Gene
    ``i`` holds the cluster label (an integer in ``1..num_features``) given to
    the ``i``-th column. A chromosome is fitter when the columns that share a
    label lose less information by being merged (see ``_fitness``).
    """

    def __init__(self, dataframe, population_size, crossover_prob, mutation_prob):
        """
        Store the data and the hyper-parameters of the search.

        dataframe       -- numeric frame whose columns are the features to cluster
                           (presumably row-normalised frequencies; confirm with caller)
        population_size -- number of chromosomes kept in every generation
        crossover_prob  -- probability that a pair of parents is recombined
        mutation_prob   -- probability that a child has one gene reassigned
        """
        self.dataframe = dataframe
        self.num_features = dataframe.shape[1]
        self.population_size = population_size
        self.crossover_prob = crossover_prob
        self.mutation_prob = mutation_prob

    def _initialize(self):
        """
        Creating the initial population: ``population_size`` chromosomes, each
        with one random cluster label per feature.
        """
        # The comprehension guarantees that every generated chromosome is kept.
        self.population = [
            [random.randint(1, self.num_features) for _ in range(self.num_features)]
            for _ in range(self.population_size)
        ]

    @staticmethod
    def _pair_info_loss(first, second, pc_x):
        """
        Information loss of merging two feature columns (1-D float arrays).

        Both features carry the same prior ``pc_x``, so the merged column is the
        plain average of the two.
        """
        p_bar = 0.5 * (first + second)
        # 0 * log2(0 / x) evaluates to NaN; such entries contribute nothing,
        # which nansum implements by treating NaN as zero.
        with np.errstate(divide='ignore', invalid='ignore'):
            loss_first = first * np.log2(first / p_bar)
            loss_second = second * np.log2(second / p_bar)
        return pc_x * (np.nansum(loss_first) + np.nansum(loss_second))

    def _fitness(self):
        """
        Calculating the fitness value of each chromosome in the current population.
        The fitness function calculates the information loss using information theory.

        Returns a list with one value per chromosome: the inverse of the mean
        information loss over all pairs of features that share a cluster label.
        A chromosome with no such pair, or with zero total loss, gets ``inf``.
        Columns are addressed by position, so gene ``i`` always refers to the
        ``i``-th column whatever its name.
        """
        # Convert once; the per-pair work then runs on plain NumPy columns.
        values = np.asarray(self.dataframe, dtype=float)
        pc_x = 1 / values.shape[1]

        population_fitness = []
        for chromosome in self.population:
            # Group column positions by cluster label.
            clusters = {}
            for position, label in enumerate(chromosome):
                clusters.setdefault(label, []).append(position)

            total_info_loss = 0.0
            num_pairs = 0
            for label in sorted(clusters):
                for first, second in itertools.combinations(clusters[label], 2):
                    total_info_loss += self._pair_info_loss(
                        values[:, first], values[:, second], pc_x)
                    num_pairs += 1

            if num_pairs == 0 or total_info_loss <= 0:
                # Nothing was merged, or merging cost nothing: no loss at all.
                population_fitness.append(float('inf'))
            else:
                population_fitness.append(1 / (total_info_loss / num_pairs))

        return population_fitness

    @staticmethod
    def _selection_probs(population_fitness):
        """
        Turn fitness values into parent-selection probabilities.

        Probabilities are proportional to fitness, so fitter chromosomes are
        picked more often. Chromosomes with infinite fitness share the whole
        probability mass; if no usable weight remains, selection is uniform.
        """
        fitness = np.asarray(population_fitness, dtype=float)
        if np.isinf(fitness).any():
            weights = (fitness == np.inf).astype(float)
        else:
            # NaN and (rounding-induced) negative values get no weight.
            weights = np.clip(np.nan_to_num(fitness), 0, None)

        total = weights.sum()
        if total <= 0:
            return np.full(len(fitness), 1.0 / len(fitness))
        return weights / total

    def _mutate(self, chromosome):
        """
        With probability self.mutation_prob, give one randomly chosen gene a new
        random cluster label. The input list is never modified in place.
        """
        if np.random.random() < self.mutation_prob:
            mutation_point = np.random.randint(0, len(chromosome))
            chromosome = chromosome[:mutation_point] + [random.randint(1, self.num_features)] + chromosome[mutation_point+1:]
        return chromosome

    def _crossover(self, parent1, parent2):
        """
        Create children from parents by single-point crossover.
        With probability 1 - self.crossover_prob the parents are returned unchanged.
        """
        if np.random.random() < self.crossover_prob:
            crossover_point = np.random.randint(0, len(parent1))
            child1 = parent1[:crossover_point] + parent2[crossover_point:]
            child2 = parent2[:crossover_point] + parent1[crossover_point:]
        else:
            child1 = parent1
            child2 = parent2
        return child1, child2

    def run(self):
        """
        Run function, the main loop of the Genetic Algorithm.

        Runs int(num_features / 50) generations. Returns a DataFrame with the
        columns 'best_fitness' and 'fittest_chromosome', indexed
        0..int(num_features / 3); rows beyond the last generation are NaN.
        """
        self._initialize()

        best_fitness_log = []
        fittest_log = []
        for iteration in range(int(self.num_features/50)):
            population_fitness = self._fitness()

            best_index = int(np.argmax(population_fitness))
            fittest_chromosome = self.population[best_index]
            best_fitness = population_fitness[best_index]

            best_fitness_log.append(best_fitness)
            fittest_log.append(fittest_chromosome)

            parent_probs = self._selection_probs(population_fitness)

            # Elitism: the fittest chromosome survives unchanged.
            new_population = [fittest_chromosome]
            while len(new_population) < self.population_size:
                idx_parent1, idx_parent2 = np.random.choice(len(self.population), size=2, p=parent_probs, replace=True)
                child1, child2 = self._crossover(self.population[idx_parent1], self.population[idx_parent2])
                new_population += [self._mutate(child1), self._mutate(child2)]

            print ("Iteration: {}, Best Fitness: {}".format(iteration, best_fitness))
            # Children arrive in pairs, so trim the possible one-item overshoot.
            self.population = new_population[:self.population_size]

        # Build the history in one go instead of writing cell by cell.
        df_of_best_fitness = pd.DataFrame({'best_fitness': best_fitness_log,
                                           'fittest_chromosome': fittest_log})
        return df_of_best_fitness.reindex(range(0, int(self.num_features/3)+1))


# Reading the data and concatenating it (from each dataset, keeping the first 50 rows and the first 100 columns)

In [8]:
# Number of rows to keep from each category file.
num_earn_text = 50
num_crude_text = 50 
num_money_fx_text = 50
num_grain_text = 50
num_interest_text = 50
num_trade_text = 50
num_ship_text = 50
num_wheat_text = 50
num_corn_text = 50

# Each file is comma separated; keep the first num_*_text rows and the first 100 columns.
dataframe_earn = pd.read_table("data/train-earn.dat", sep= ',').iloc[:num_earn_text,:100]
dataframe_crude = pd.read_table("data/train-crude.dat", sep= ',').iloc[:num_crude_text,:100]
dataframe_money_fx = pd.read_table("data/train-money-fx.dat", sep= ',').iloc[:num_money_fx_text,:100]
dataframe_grain = pd.read_table("data/train-grain.dat", sep= ',').iloc[:num_grain_text,:100]
dataframe_interest = pd.read_table("data/train-interest.dat", sep= ',').iloc[:num_interest_text,:100]
dataframe_trade = pd.read_table("data/train-trade.dat", sep= ',').iloc[:num_trade_text,:100]
dataframe_ship = pd.read_table("data/train-ship.dat", sep= ',').iloc[:num_ship_text,:100]
dataframe_wheat = pd.read_table("data/train-wheat.dat", sep= ',').iloc[:num_wheat_text,:100]
dataframe_corn = pd.read_table("data/train-corn.dat", sep= ',').iloc[:num_corn_text,:100]

# Stack the categories row-wise. pd.concat is used because DataFrame.append
# was removed in pandas 2.0; columns missing from a category become NaN.
dataframe = pd.concat([dataframe_earn,dataframe_crude,dataframe_money_fx,dataframe_grain,dataframe_interest,
                       dataframe_trade,dataframe_ship,dataframe_wheat,dataframe_corn],ignore_index=True, sort=False)
# A column absent from a category file counts as zero for those rows.
dataframe = dataframe.fillna(0)
# Keep an untouched copy with the original column names for later cells.
early_dataframe = dataframe.copy()
dataframe

Unnamed: 0,cts,shr,net,qtr,revs,note,loss,profit,div,dividend,...,cordoba,thous,chicago,acre,marketing,shipments,spain,producers,september,duty
0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
448,0,1,1,0,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Moving Numbers to probability Space

In [9]:
# Give the columns positional labels x0, x1, ... (the names the Genetic class looks up).
dataframe.columns = ['x{}'.format(i) for i in range(dataframe.shape[1])]
# Divide every row by its total so that each row sums to one.
row_totals = dataframe.sum(axis=1)
dataframe = dataframe.div(row_totals, axis=0)
# Rows whose total is zero produce NaN (0 / 0); treat them as all-zero rows.
dataframe = dataframe.fillna(0)
dataframe

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x550,x551,x552,x553,x554,x555,x556,x557,x558,x559
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
448,0.0,0.2,0.2,0.0,0.2,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Main

In [10]:
population_size = 20
crossover_prob = 0.7
mutation_prob = 0.2

genetic = Genetic(dataframe, population_size, crossover_prob, mutation_prob) # Creating an object from Genetic Class

df_of_best_fitness = genetic.run() # Calling run function for the object

###################################################################

# Rows that no iteration filled in are NaN; drop them before plotting.
df_of_best_fitness = df_of_best_fitness.dropna(axis=0)

Iteration = np.arange(df_of_best_fitness.shape[0]) 
Best_Fitness = df_of_best_fitness.best_fitness 

fig, ax = plt.subplots() 
ax.plot(Iteration, Best_Fitness) # Ploting a chart based on best fitness

ax.set(xlabel= 'Iteration', ylabel='Best Fitness')
ax.grid()

plt.xticks(Iteration)
fig.savefig("Chart.png") 
plt.show()

#################################################################

# Without at least one recorded generation there is nothing to report.
if df_of_best_fitness.empty:
    raise ValueError("The genetic algorithm recorded no iterations, so there is no clustering to report.")

# Take the last recorded generation by position, independent of the index labels.
final_generation = df_of_best_fitness.iloc[-1]
best_chromo = final_generation['fittest_chromosome']

# Map every cluster label to the 'x<i>' names of the columns assigned to it
# (single pass over the chromosome; labels keep their ascending order).
clusters_dict = {j: [] for j in range(1, dataframe.shape[1]+1)}
for i, label in enumerate(best_chromo):
    if label in clusters_dict:
        clusters_dict[label].append('x{}'.format(i))

# Translate the positional names back to the original column names and
# keep only the clusters that actually contain features.
best_clustering = []
for c in clusters_dict.values():
    cls = [early_dataframe.columns.values[int(p.split('x')[1])] for p in c]
    if cls:
        best_clustering.append(cls)


print('Best clustering is : {}'.format(best_clustering))
print('_____________________________________________________________')
print('Fitness of the best clustering is : {}'.format(final_generation['best_fitness']))
print('_____________________________________________________________')
print('Number of clusters is : {}'.format(len(best_clustering)))


  total_infoLoss = 1/(total_infoLoss + infoLoss)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Iteration: 0, Best Fitness: 602.7161389645715
Iteration: 1, Best Fitness: 602.7161389645715
Iteration: 2, Best Fitness: 602.7161389645715
Iteration: 3, Best Fitness: 950.2444780458196


KeyboardInterrupt: 

# Selecting features from each cluster randomly

In [None]:
num_rand_feature_in_cluster = 1 # number of features you want to choose from each cluster 
rand_feature = []

for clus in best_clustering: 
    # random.sample raises ValueError when asked for more items than the
    # cluster holds, so small clusters contribute all of their features.
    sample_size = min(num_rand_feature_in_cluster, len(clus))
    rand_feature.append(random.sample(clus, sample_size))

# Flatten the per-cluster samples into one list of column names.
list_of_selected_feature = [f for j in rand_feature for f in j]
        
print("Number of featurs reduced from ", dataframe.shape[1], " to " ,len(list_of_selected_feature))

# Dropping the features that were not selected

In [None]:
# Keep only the selected columns of the original (un-normalised) data,
# as an independent copy so later edits do not touch early_dataframe.
final_dataframe = early_dataframe[list_of_selected_feature].copy()
final_dataframe # Final dataframe after optimization

# Adding class names to final dataframe

In [None]:
# Rows were concatenated category by category in this order, so the label
# column is the category name repeated once per row of that category.
# The real row count of each frame is used because a file may hold fewer
# rows than the requested num_*_text.
class_frames = [
    ("earn", dataframe_earn),
    ("crude", dataframe_crude),
    ("money_fx", dataframe_money_fx),
    ("grain", dataframe_grain),
    ("interest", dataframe_interest),
    ("trade", dataframe_trade),
    ("ship", dataframe_ship),
    ("wheat", dataframe_wheat),
    ("corn", dataframe_corn),
]

class_labels = []
for class_name, class_frame in class_frames:
    class_labels.extend([class_name] * len(class_frame))

# Assign the whole column at once; writing through final_dataframe.iloc[...]["class"]
# would modify a temporary object and could leave the labels empty.
final_dataframe["class"] = class_labels

# final_dataframe


# Splitting the data into train and test sets (20% of the data is held out for testing)

In [None]:
# Every column except the last is a feature; the last column is the target.
X = final_dataframe.iloc[:, :-1]
y = final_dataframe.iloc[:, -1]
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Shape of final data

In [None]:
# Count how many samples of each class ended up in each split.
number_of_train = y_train.value_counts()
number_of_test = y_test.value_counts()

# One print call with a newline separator emits the same five lines.
print(
    "number of train data for each class is: ",
    number_of_train,
    "________________________________________________",
    "number of test data for each class is: ",
    number_of_test,
    sep="\n",
)

# KNN classifier

In [None]:
n_neighbors = 5 # Number of neighbors

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
knn_predicted = knn.predict(X_test)

# Per-class precision, recall and F-score on the held-out rows.
report = classification_report(y_test, knn_predicted)
print(report)


# F micro and F macro

In [None]:
# Macro average: metrics are computed per class and then averaged.
(p_macro, r_macro, f_macro, support_macro) = precision_recall_fscore_support(
    y_test, knn_predicted, average='macro')

# Micro average: metrics are computed from the pooled counts of all classes.
(p_micro, r_micro, f_micro, support_micro) = precision_recall_fscore_support(
    y_test, knn_predicted, average='micro')

def f(p, r):
    """Return the harmonic mean of precision ``p`` and recall ``r``.

    When both values are zero the harmonic mean is undefined; 0.0 is
    returned instead of dividing by zero.
    """
    if p + r == 0:
        return 0.0
    return 2*p*r/(p+r)

# F-scores derived from the already averaged precision and recall; these can
# differ from the f_macro / f_micro values returned by scikit-learn.
my_f_macro = f(p_macro, r_macro)
my_f_micro = f(p_micro, r_micro)

print('my f macro {}'.format(my_f_macro))
print('my f micro {}'.format(my_f_micro))
