In [676]:
"""Module Import"""

import numpy as np
import pandas as pd
import random
import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [677]:
"""실습하기 위한 데이터셋 로드 및 전처리"""

df = pd.read_csv('ToyotaCorolla.csv')
df.drop(['Id', 'Model'], axis = 1, inplace = True)
dummy_frame = pd.get_dummies(df['Fuel_Type'])

df = pd.concat([df, dummy_frame], axis = 1)
df.drop(['Fuel_Type'], axis = 1, inplace = True)
features = df.columns
df_features = features.drop(['Price']) 

In [678]:
df

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,...,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar,CNG,Diesel,Petrol
0,13500,23,10,2002,46986,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
1,13750,23,10,2002,72937,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
2,13950,24,9,2002,41711,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
3,14950,26,7,2002,48000,90,0,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
4,13750,30,3,2002,38500,90,0,0,2000,3,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,12,1998,20544,86,1,0,1300,3,...,0,1,1,1,0,0,0,0,0,1
1432,10845,72,9,1998,19000,86,0,0,1300,3,...,0,0,1,1,0,0,0,0,0,1
1433,8500,71,10,1998,17016,86,0,0,1300,3,...,0,0,0,1,0,0,0,0,0,1
1434,7250,70,11,1998,16916,86,1,0,1300,3,...,0,0,0,1,0,0,0,0,0,1


In [679]:
"""Hyper parameter"""

params = { 'Population':50 ,
          'Mutation_Rate':0.01,
          'cut_off':0.5,
          'parents_num':4,
          'Generation':200
          }

**Step 1: Initialize Chromosomes**

- **Encoding chromosomes**

이 방법론들을 실행하기 위한 초기화 단계. 전체 변수들의 집합을 **chromosomes**이라고 하고, 하나의 열을 **Gene**이라고 한다. binary encoding 사용. 1이면 사용하고 0이면 사용하지 않는다.즉, 어떤 변수를 쓸 것인가에 대한 정보가 Chromosomes내의 gene에 저장되어 있는 것. 보통 chromosome은 50-100개 정도 만든다.

- **Parameter**
    
    - The number of chromosome(population)
    
    - Fitness function (현재 코드에서는 $R^2$로 고정해서 사용)
    
    - Crossover mechanism
    
    - Rate of mutation
    
    - Stopping criteria (현재 코드에서는 시행 횟수로 사용)

In [680]:
"""Step 1: Initialize Chromosomes"""

population_table = [[random.random() for i in range(len(df_features))] for j in range(params['Population'])]

for i in range(np.array(population_table).shape[0]):
    for j in range(np.array(population_table).shape[1]):
        if population_table[i][j] > params['cut_off']:
            population_table[i][j] = 1
        else:
            population_table[i][j] = 0
            

np.array(population_table).shape

(50, 36)

**Step 2: Model training based on chromosomes & Step 3: Fitness Evaluation**

- step 1에서 만들어진 population을 기반으로 모델 학습 및 평가 진행. 여기서는 $R^2$지표 사용

In [682]:
"""Step 2: Model training based on chromosomes & Step 3: Fitness Evaluation"""

# Rsquare 이용하여 성능평가

R_score = []

for chromosome in population_table:
    selected_features = df_features[np.where(chromosome)]
    x = df[selected_features].values
    y = np.expand_dims(df['Price'].values, axis = 0).T
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,random_state=44,shuffle=True)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)
    score = linear_model.score(x_test, y_test)
    R_score.append(score)

In [683]:
R_score

[0.8314347660018006,
 0.6250709280954301,
 0.740492319542118,
 0.26502488946618263,
 0.49679375480479937,
 0.8718697678654038,
 0.8469470508656161,
 0.8467358986894672,
 0.8363396899639475,
 0.5266406777753478,
 0.8295852151177171,
 0.6963433938263347,
 0.8413090914397382,
 0.7174463879716413,
 -0.15998097903216535,
 0.30004905573947405,
 0.7248461852684667,
 0.7845558665611141,
 0.7550493746389932,
 0.8329510361175885,
 0.7515414719849289,
 0.34276432865267104,
 0.6107548925990139,
 0.6709084568829615,
 0.8561637284017818,
 0.766795784831355,
 0.6134860311981118,
 0.7798787774665463,
 -0.10521999794236647,
 0.8357497020018693,
 0.8254069286026563,
 0.5954626308882773,
 0.6651928083678934,
 0.7756835456291071,
 0.7407522594590871,
 0.6184179330881163,
 0.8397761565565682,
 0.8721872507440683,
 0.670724996526536,
 0.7436255544967062,
 0.7280556036813288,
 0.6806026074125922,
 0.8558361872094676,
 0.6832323423274271,
 0.7388393707551133,
 0.6885261927055846,
 0.8584541324557866,
 0.82609

**Step 4: Selection good chromosomes**

Population에서 한 50개의 chromosomes를 잡았을 때, 이들 모두를 통해 다음 세대를 생성하는 것이 아니라 일부 우수한 chromosomes만을 추출한다. 이때, Selection에는 두 가지의 접근법이 있다.

- **Deterministic selection**

상위 N%에 속하는 Chromosomes 모두 선택하고 나머지는 모두 선택하지 않는다. 코드에서는 몇 개를 선택할지 정하는 것으로 하고 있음.

- **Probabilistic selection**

각 chromosomes이 가지는 fitness에 따라 가중치를 나눈 후, 이 가중치에 따라서 선택될 확률을 부여한 후 추출. 열등한 chromosome이라도 기회를 부여하는 것.

In [684]:
"""Step 4: Selection good chromosomes"""

sum = 0
for i in R_score:
    sum += i
    
R_score_ratio = []
for i in R_score:
    R_score_ratio.append(i/sum)

population_table = np.concatenate((population_table, np.expand_dims(R_score_ratio,axis = 1)), axis = 1)
population_table = population_table[population_table[:,-1].argsort()[::-1],: ]


def Deterministic_selection(population_table, parents_num = params['parents_num']):
    selected_population = []
    for i in range(parents_num):
        selected_population.append(population_table[i])
    return selected_population

# 행마다 마지막 원소에 weight 포함하는 수 같이 나온다.
def Probabilistic_selection(population_table, parents_num = params['parents_num']):
    ran_num = [random.random() for i in range(parents_num)]
    ran_num = sorted(ran_num)

    selected_population = []

    sum = 0
    for i in range(len(population_table)):
        if sum < ran_num[0] < sum + population_table[:,-1][i]:
            selected_population.append(population_table[:][i])
            del ran_num[0]
        sum += population_table[:,-1][i]
        if len(ran_num) == 0:
            break
    return selected_population

In [685]:
len(population_table)

50

In [None]:
# Deterministic_selection(population_table)
# Probabilistic_selection(population_table)

**Step 5: Crossover & Mutation**

- **Crossover(reproduction)**

두 개의 부모 chromosome으로부터 두 개의 자식 chromosome을 만든다. 이때 **crossover를 통하여 부모 chromosome의 gene들을 랜덤하게 잘라서 이를 섞는다.**

- **Mutation**

앞에서 만들어진 일부 Child chromosome들에 대해서 매우 낮은 확률로 값을 반대 값으로 바꿔준다. 이를 통해서 local optimum에 빠지는 것을 방지하는 것.

In [688]:

"""Step 5: Crossover & Mutation"""

# 1. Crossover

def crossover(selected_population):
    ran_list = np.random.random_sample(len(selected_population[0]))
    sel_list = np.round(ran_list)

    for i in range(len(selected_population[0])-1):
        child1, child2 = random.sample(selected_population, 2)
        if sel_list[i] == 1:
            child1[i], child2[i] = child2[i], child1[i]
        else:
            continue
        
    return child1, child2
        
# 2. Mutation

def mutation(child1, child2, Mutation_rate = params['Mutation_Rate']):
    ran_list1 = np.random.random_sample(len(selected_population[0]))
    ran_list2 = np.random.random_sample(len(selected_population[0]))
    for i in range(len(child1)-1):
        if ran_list1[i] < Mutation_rate:
            if child1[i] == 1:
                child1[i] = 0
            else:
                child1[i] == 1
        else:
            continue
        
        if ran_list2[i] < Mutation_rate:
            if child2[i] == 1:
                child2[i] = 0
            else:
                child2[i] == 1
        else:
            continue

    return child1, child2

In [689]:
crossover(selected_population)

(array([0.        , 1.        , 1.        , 1.        , 0.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        0.        , 1.        , 0.        , 1.        , 0.        ,
        1.        , 1.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 1.        , 0.        ,
        1.        , 0.02124888]),
 array([1.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 1.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.        , 1.        ,
        1.        , 1.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.    

In [690]:
mutation(child1, child2)

(array([0.        , 1.        , 1.        , 1.        , 0.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        0.        , 1.        , 0.        , 1.        , 0.        ,
        1.        , 1.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 1.        , 0.        ,
        1.        , 0.02124888]),
 array([1.        , 1.        , 1.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 1.        , 1.        , 1.        , 0.        ,
        1.        , 1.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        1.    

**Step 6: Find the best solution**

Step 2-5 과정을 Iteration만큼 반복하여 highest fitness 값을 가질 때 그만둔다.

In [701]:
"""Step 6: Find the best solution"""

best_gen = []

for generation in range(params['Generation']):
    population_table = [[random.random() for i in range(len(df_features))] for j in range(params['Population'])]
    for i in range(np.array(population_table).shape[0]):
        for j in range(np.array(population_table).shape[1]):
            if population_table[i][j] > params['cut_off']:
                population_table[i][j] = 1
            else:
                population_table[i][j] = 0
    
    R_score = []
    for chromosome in population_table:
        selected_features = df_features[np.where(chromosome)]
        x = df[selected_features].values
        y = np.expand_dims(df['Price'].values, axis = 0).T
        x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,random_state=44,shuffle=True)
        linear_model = LinearRegression()
        linear_model.fit(x_train, y_train)
        score = linear_model.score(x_test, y_test)
        R_score.append(score)
        
    sum = 0
    for i in R_score:
        sum += i
        
    R_score_ratio = []
    for i in R_score:
        R_score_ratio.append(i/sum)

    population_table = np.concatenate((population_table, np.expand_dims(R_score_ratio,axis = 1)), axis = 1)
    population_table = population_table[population_table[:,-1].argsort()[::-1],: ]
    Deterministic_selection(population_table)
    crossover(selected_population)
    mutation(child1, child2)

In [703]:
best_score = max(R_score)

print(best_score)

0.872063203768752
