# 2020 Classification Data Challenge
<a href="https://www.kaggle.com/c/classification-data-challenge/overview" target="_blank">Ссылка на kaggle</a> <br>
This data challenge involves classification task. The target variable is whether a customer will click 'Buy' on a website. Buy = 0 and Buy = 1 indicate 'no' and 'yes' respectively. Independent variables capture customer background and customer website navigation behavior information. First variable in the train.csv data file identifies each customer uniquely. <br>

The challenge is to correctly predict if a customer will click 'buy' on a website.

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


USE_GA = False # использовать генетический алгоритм

# Осмотр данных

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.describe()

Unnamed: 0,Id,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V78,V79,V80,V81,V82,V83,V84,V85,V86,Buy
count,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,...,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0,4107.0
mean,2905.181885,24.117604,1.113221,2.690528,2.989043,5.743852,0.690285,4.617726,1.061359,3.276844,...,0.00487,0.006818,0.004626,0.579255,0.00073,0.005357,0.029218,0.009252,0.014609,0.060385
std,1683.768747,12.814958,0.412409,0.784603,0.807374,2.849382,1.000736,1.742832,1.024129,1.603867,...,0.069622,0.082297,0.0747,0.563752,0.02702,0.076265,0.198934,0.098266,0.12201,0.238227
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1450.5,10.0,1.0,2.0,2.0,3.0,0.0,4.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2904.0,30.0,1.0,3.0,3.0,7.0,0.0,5.0,1.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4370.5,35.0,1.0,3.0,3.0,8.0,1.0,6.0,2.0,4.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5822.0,41.0,10.0,5.0,6.0,10.0,9.0,9.0,5.0,9.0,...,1.0,1.0,2.0,7.0,1.0,2.0,2.0,2.0,2.0,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4107 entries, 0 to 4106
Data columns (total 87 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Id      4107 non-null   int64
 1   V2      4107 non-null   int64
 2   V3      4107 non-null   int64
 3   V4      4107 non-null   int64
 4   V5      4107 non-null   int64
 5   V6      4107 non-null   int64
 6   V7      4107 non-null   int64
 7   V8      4107 non-null   int64
 8   V9      4107 non-null   int64
 9   V10     4107 non-null   int64
 10  V11     4107 non-null   int64
 11  V12     4107 non-null   int64
 12  V13     4107 non-null   int64
 13  V14     4107 non-null   int64
 14  V15     4107 non-null   int64
 15  V16     4107 non-null   int64
 16  V17     4107 non-null   int64
 17  V18     4107 non-null   int64
 18  V19     4107 non-null   int64
 19  V20     4107 non-null   int64
 20  V21     4107 non-null   int64
 21  V22     4107 non-null   int64
 22  V23     4107 non-null   int64
 23  V24     4107 

О том, что эти данные значат, 

# Подготовка данных
## Размножение признаков
Перемножим и просуммируем их друг с другом. И просуммируем все значения 

In [5]:
def iterateColumns(df, func, numericColumns):
    """
    Для каждого признака делаем обход по остальным признакам
    и применяем к ним функцию (суммирования или умножения)
    """
    df_new = df.copy()
    df_cols = numericColumns.copy()
    for col in df_cols:
        for col2 in df_cols:
            if col2 == col: continue
            df_new[col+col2+'_'+func.__name__] = func(df_new[col], df_new[col2])
        df_cols.remove(col)
    return df_new

def SumEach(df_col1, df_col12):
    return df_col1+df_col12

def SumAll(df, numericColumns):
    return df[numericColumns].sum(axis=1)

def MultiEach(df_col1, df_col12):
    return df_col1*df_col12

# В данном датасете - все признаки числовые, кроме Id и Buy
numericColumns = [col for col in data.columns if col != 'Id' and col != 'Buy']

In [6]:
new_data = iterateColumns(data, MultiEach, numericColumns)
new_data = iterateColumns(new_data, SumEach, numericColumns)
new_data['SumAll'] = SumAll(new_data, numericColumns)
new_data.head()

Unnamed: 0,Id,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V86V69_SumEach,V86V71_SumEach,V86V73_SumEach,V86V75_SumEach,V86V77_SumEach,V86V79_SumEach,V86V81_SumEach,V86V83_SumEach,V86V85_SumEach,SumAll
0,1,33,1,3,2,8,0,5,1,3,...,1,0,0,0,0,0,1,0,0,162
1,2,37,1,2,2,8,1,4,1,4,...,0,0,0,0,0,0,1,0,0,160
2,3,37,1,2,2,8,0,4,2,4,...,1,0,0,0,0,0,1,0,0,162
3,4,9,1,3,3,3,2,3,2,4,...,1,0,0,0,0,0,1,0,0,131
4,6,23,1,2,1,5,0,5,0,5,...,1,0,0,0,0,0,0,0,0,144


## Отбор признаков
С помощью генетического алгоритма отберем признаки, с которым модель даёт лучший результат

In [7]:
class GeneticAlgorithm:
    def __init__(self, df, target, model, num_chromes=50, itarations=100):
        """
        df - датасет
        target - целевой признак
        model - модель
        num_chromes - количество хромосом
        itarations - количество итераций выполнения цикла генетического алгоритма
        """
        if num_chromes%2 != 0:
            raise Exception("Количество хромосом должно быть четным!")
        self.num_chromes = num_chromes
        self.itarations = itarations
        self.df = df
        self.model = model
        self.target = target
        columns = df.drop([target], axis=1).columns # признаки для обучения
        self.num_columns = len(columns) # количество признаков
        # словарь: номер признака => название признака
        self.label_columns = {i:columns[i] for i in range(self.num_columns)}

    def initGeneration(self):
        """
        Инцииализация первого поколения хромосом.
        Хромосома - набор "включенных" и "выключенных" признаков.
        Ген - признак, значение "1" (включен) или "0" (выключен).
        Только "включенные" признаки будут учавствовать в обучении модели.
        """
        self.generation = [[random.getrandbits(1) for i in range(self.num_columns)] for _ in range(self.num_chromes)]

    def getColByChrome(self, chrome):
        """
        Отбор включенных генов/признаков
        """
        return [self.label_columns[i] for i in range(len(chrome)) if chrome[i] == 1]
    
    def crossover(self, c1, c2):
        """
        Скрещивание.
        c1, c2 - хромосомы
        Возвращются 2 новые хромосомы
        """
        start_pos = random.randint(0, self.num_columns-2)
        end_pos = random.randint(start_pos, self.num_columns-1)
        pairs = []
        pairs.append(c2[0:start_pos]+c1[start_pos:end_pos]+c2[end_pos:])
        pairs.append(c1[0:start_pos]+c2[start_pos:end_pos]+c1[end_pos:])
        return pairs
    
    def crossoverStep(self):
        """
        Обход хромосом для скрещивания.
        """
        new_chromes = []
        for i in range(0, len(self.generation), 2):
            c1 = self.generation[i]
            c2 = self.generation[i+1]
            new_chromes = new_chromes + self.crossover(c1, c2)

        self.generation = self.generation + new_chromes
    
    def mutation(self):
        """
        Каждая хромосома с 5% вероятностью может мутировать,
        т.е. переключить один свой ген/признак
        """
        for chrome in self.generation:
            if random.randint(1, 100) <= 5:
                index_mutation = random.randint(0, self.num_columns-1)
                chrome[index_mutation] = abs(chrome[index_mutation] - 1) # switch
                
    def fitChrome(self, chrome):
        """
        Обучение модели на хромосоме
        """
        cols = self.getColByChrome(chrome)
        X, y = self.df[cols], self.df[self.target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
        self.model.fit(X_train, y_train)
        pred_rf = self.model.predict(X_test)
        return f1_score(y_test, pred_rf)
    
    def mainLoop(self):
        self.initGeneration()
        for _ in range(self.itarations):
            
            self.crossoverStep()
            
            self.mutation()
            
            fitted_list = []
            for i in range(len(self.generation)):
                f1_score_chrome = self.fitChrome(self.generation[i])
                fitted_list.append([i, f1_score_chrome])

            # сортировка хромосом по точности модели
            fitted_list = sorted(fitted_list , key=lambda row: row[1], reverse=True)
            self.generation = [self.generation[fitted_list[i][0]] for i in range(self.num_chromes)]
        
        return self.getColByChrome(self.generation[0])


In [8]:
RF = RandomForestClassifier(
    n_estimators=350, 
    max_depth=10, 
    random_state=42, 
    min_samples_split=20,
    min_samples_leaf=80,
    max_features='sqrt',
    bootstrap=False,
    class_weight='balanced',
    n_jobs=4
)


In [13]:
%%time
if USE_GA: #Время выполнения: 1h 52min 55s
    ga_item = GeneticAlgorithm(new_data.drop(['Id'], axis=1), 'Buy', RF, 50, 50)
    cols = ga_item.mainLoop()
    with open('columns.txt', 'w') as f:
        for col in cols:
            f.write("%s\n" % col)
else:
    cols = []
    with open('columns.txt', 'r') as f:
        for col in f:
            cols.append(col.rstrip('\n'))

Wall time: 2 ms


Количество признаков

In [14]:
len(cols)

2732

# Обучение модели
## Поиск парамтеров модели решетчатым поисков

In [16]:
X, y = new_data[cols], new_data['Buy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

RF
pred_rf: 0.7338208256727394
pred_rf_f1: 0.26625386996904027


In [13]:
%%time
RF = RandomForestClassifier()
parameters = {
    'n_estimators':[350, 400, 450],
    'max_depth':[5, 10, 15],
    'random_state':[42],
    'min_samples_split':[20],
    'min_samples_leaf':[80],
    'max_features':['sqrt'],
    'class_weight':['balanced'],
    'bootstrap':[False],
    'n_jobs':[4],
}
clf = GridSearchCV(RF, parameters)
clf.fit(new_data[cols], new_data['Buy'])

Wall time: 1min 27s


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [False], 'class_weight': ['balanced'],
                         'max_depth': [5, 10, 15], 'max_features': ['sqrt'],
                         'min_samples_leaf': [80], 'min_samples_split': [20],
                         'n_estimators': [350, 400, 450], 'n_jobs': [4],
                         'random_state': [42]})

In [14]:
clf.best_params_

{'bootstrap': False,
 'class_weight': 'balanced',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 80,
 'min_samples_split': 20,
 'n_estimators': 400,
 'n_jobs': 4,
 'random_state': 42}

## Тестовые данные

In [17]:
df_test = pd.read_csv('test.csv')
# размножение данных
new_data_test = iterateColumns(df_test, MultiEach, numericColumns)
new_data_test = iterateColumns(new_data_test, SumEach, numericColumns)
new_data_test['SumAll'] = SumAll(new_data_test, numericColumns)
print(new_data_test.shape)
new_data_test.head()

(1715, 5505)


Unnamed: 0,Id,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V86V69_SumEach,V86V71_SumEach,V86V73_SumEach,V86V75_SumEach,V86V77_SumEach,V86V79_SumEach,V86V81_SumEach,V86V83_SumEach,V86V85_SumEach,SumAll
0,5,40,1,4,2,10,1,4,1,4,...,0,0,0,0,0,0,1,0,0,166
1,14,41,1,3,3,10,0,5,0,4,...,0,0,0,0,0,0,0,0,0,162
2,16,33,1,2,3,8,0,7,0,2,...,0,0,0,0,0,0,1,0,0,158
3,26,33,1,3,3,8,0,6,1,2,...,0,0,0,0,0,0,1,0,0,160
4,28,40,1,3,3,10,0,3,0,6,...,0,0,0,0,0,0,1,0,0,163


Обучение модели на всех данных и предсказание на тестовых

In [18]:
RF = RandomForestClassifier(
    n_estimators=400, 
    max_depth=10, 
    random_state=42, 
    min_samples_split=20,
    min_samples_leaf=80,
    max_features='sqrt',
    bootstrap=False,
    class_weight='balanced',
    n_jobs=4
)
RF.fit(new_data[cols], new_data['Buy'])
ypred = pd.DataFrame({'Predicted': pd.Series(RF.predict(new_data_test[cols]))})
df_res = pd.concat([new_data_test['Id'], ypred], axis=1)
ypred.head()

Unnamed: 0,Predicted
0,0
1,0
2,0
3,0
4,0


In [19]:
df_res.to_csv('sub1.csv', index=False)