# 0. Определяю какие-то функции

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re


%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

In [2]:
def batch_generator(X, y, shuffle=True, batch_size=1):
    """
    Гератор новых батчей для обучения
    X          - матрица объекты-признаки
    y_batch    - вектор ответов
    shuffle    - нужно ли случайно перемешивать выборку
    batch_size - размер батча ( 1 это SGD, > 1 mini-batch GD)
    Генерирует подвыборку для итерации спуска (X_batch, y_batch)
    """
    if batch_size > X.shape[0]:
        batch_size = X.shape[0]
    if shuffle:
        new_ids = np.random.permutation(X.shape[0])
    else:
        new_ids = np.arange(X.shape[0])
    number_batches = X.shape[0] // batch_size
    for i in range(number_batches):
        indices = range(batch_size*i, batch_size*(i+1))
        X_batch = X[new_ids[indices]]
        y_batch = y[new_ids[indices]]
        yield (X_batch, y_batch)
    if len(X) % batch_size != 0:
        indices = batch_size*(i+1)
        X_batch = X[new_ids[indices:]]
        y_batch = y[new_ids[indices:]]
        yield (X_batch, y_batch)

In [3]:
def sigmoid(x):
    """
    Вычисляем значение сигмоида.
    X - выход линейной модели
    """
    sigm_value_x = 1/(1 + np.exp(-(x)))
    return sigm_value_x


class MySGDClassifier(BaseEstimator, ClassifierMixin):   
    def __init__(self, batch_generator, batch_size=50, \
                 C=1, alpha=0.01, max_epoch=10, model_type='logreg', th=0.5):
        """
        batch_generator -- функция генератор, которой будем создавать батчи
        C - коэф. регуляризации
        alpha - скорость спуска
        max_epoch - максимальное количество эпох
        model_type - тим модели, lin_reg или log_reg
        """
        
        self.th = th
        self.C = C
        self.alpha = alpha
        self.max_epoch = max_epoch
        self.batch_generator = batch_generator
        self.errors_log = {'iter' : [], 'loss' : []}  
        self.model_type = model_type
        self.weights = []
        self.batch_size = batch_size
        
    def calc_loss(self, X_batch, y_batch):
        """
        Считаем функцию потерь по батчу 
        X_batch - матрица объекты-признаки по батчу
        y_batch - вектор ответов по батчу
        Не забудте тип модели (линейная или логистическая регрессия)!
        """
        
        loss = 0.
        if self.model_type == 'linreg':
            for x, y in zip(X_batch, y_batch):
                a = np.dot(x, self.weights)
                loss += (a-y)**2
            loss /= len(y_batch)
            loss += np.dot(self.weights[1:], self.weights[1:]) / self.C
        elif self.model_type == 'logreg':
            for x, y in zip(X_batch, y_batch):
                a = sigmoid(np.dot(x, self.weights))
                temp = a**y * (1-a)**(1-y)
                if temp < 10**(-301):                       # наугад
                    loss -= -1000                          # наугад
                    continue
                loss -= np.log2(temp)
#                 loss -= y * np.log2(a) + (1-y) * np.log2(1-a)
            loss /= len(y_batch)
            loss += np.dot(self.weights[1:], self.weights[1:]) / self.C
        return loss
    
    def calc_loss_grad(self, X_batch, y_batch):
        """
        Считаем  градиент функции потерь по батчу (то что Вы вывели в задании 1)
        X_batch - матрица объекты-признаки по батчу
        y_batch - вектор ответов по батчу
        Не забудте тип модели (линейная или логистическая регрессия)!
        """      
        loss_grad = 0.
        if self.model_type == 'linreg':
            for x, y in zip(X_batch, y_batch):
                a = np.dot(x, self.weights)
                loss_grad += (a-y)*x
            loss_grad /= len(y_batch)
            R = self.weights / self.C
            R[0] = 0
            loss_grad += R
        elif self.model_type == 'logreg':
            for x, y in zip(X_batch, y_batch):
                dot = np.dot(x, self.weights)
                a = sigmoid(dot)
                loss_grad += (a-y)*x
            loss_grad /= len(y_batch)
            R = self.weights / self.C
            R[0] = 0
            loss_grad += R
        return loss_grad
    
    def update_weights(self, new_grad):
        """
        Обновляем вектор весов
        new_grad - градиент по батчу
        """    
        alpha_k = self.alpha / self.curr_epoch**(0.005)
        self.weights = self.weights - alpha_k * new_grad
    
    def fit(self, X, y):
        '''
        Обучение модели
        X - матрица объекты-признаки
        y - вектор ответов
        '''    
        if self.model_type == 'linreg':
            y = y - np.mean(y)
        X = np.hstack((np.ones((X.shape[0],1)), X))
        self.weights = X[np.random.randint(0, X.shape[0]-1)]
        self.curr_epoch = 0
        for n in range(0, self.max_epoch):
            self.curr_epoch += 1
            new_epoch_generator = self.batch_generator(X, y, shuffle=True, batch_size=self.batch_size)
            for batch_num, new_batch in enumerate(new_epoch_generator):
                X_batch = new_batch[0]
                y_batch = new_batch[1]
                batch_grad = self.calc_loss_grad(X_batch, y_batch)
                self.update_weights(batch_grad)
                batch_loss = self.calc_loss(X_batch, y_batch)
                self.errors_log['iter'].append(batch_num)
                self.errors_log['loss'].append(batch_loss)  
        return self
     
    def predict_proba(self, X):
        '''
        Предсказание класса
        X - матрица объекты-признаки
        Не забудте тип модели (линейная или логистическая регрессия)!
        '''        
        X = np.hstack((np.ones((X.shape[0],1)), X))
        y_hat = np.array([])        
        if self.model_type == 'linreg':
            y_hat = np.dot(X, self.weights) / np.sum(X)
        elif self.model_type == 'logreg':
            dot_func = lambda x: sigmoid(np.dot(x, self.weights))
            y_hat = np.apply_along_axis(dot_func, 1, X)
        y_hat = np.vstack((1-y_hat, y_hat)).T
        return y_hat
    
    def predict(self, X):
        '''
        Предсказание класса
        X - матрица объекты-признаки
        Не забудте тип модели (линейная или логистическая регрессия)!
        '''        
        y_hat = self.predict_proba(X)
        if self.model_type == 'logreg':
            y_hat = y_hat - self.th > 0
        return y_hat

In [4]:
class rforest_plus_logreg(BaseEstimator, ClassifierMixin):
    def __init__(self, SGD_mod, RForest_mod, th=0.5, balance_ratio=0.5):
        self.SGD_mod = SGD_mod
        self.RForest_mod = RForest_mod
        self.th = th
        self.balance_ratio = balance_ratio
    def fit(self, X, y):
        self.SGD_mod.fit(X,y)
        self.RForest_mod.fit(X,y)
        return self
    def predict_proba(self, X):
        y_pred = self.SGD_mod.predict_proba(X)[:,1]
        y_pred = np.vstack((y_pred, self.RForest_mod.predict_proba(X)[:,1]))
        y_pred = (1 - self.balance_ratio) * y_pred[0] + self.balance_ratio * y_pred[1]
#         y_pred = np.mean(y_pred, axis=0)   
        y_pred = np.vstack((1-y_pred, y_pred)).T
        return y_pred
    def predict(self, X):
        y_pred = self.predict_proba(X)[:,1]
        return (y_pred - self.th > 0).astype(int)

In [107]:
def kfold_generator(groups_train, n_splits=10):
    all_groups = np.unique(groups_train)
    fold_size = len(all_groups) // n_splits
    all_groups = np.random.permutation(all_groups)
    fold_groups = np.zeros((n_splits,fold_size), dtype=int)
    for i, group in enumerate(all_groups):
        fold = i // fold_size
        if fold == n_splits:
            break
        group_i = i % fold_size
        fold_groups[fold,group_i] = group
    fold_indices = {}
    for fold in range(n_splits):
        indices = np.array([], dtype = int)
        for group in fold_groups[fold]:
            indices = np.append(indices, np.argwhere(groups_train == group))
        fold_indices[fold] = indices

    for i in fold_indices:
        kf_test = fold_indices[i]
        kf_train = np.array([],dtype=int)
        for j in fold_indices:
            if i == j:
                continue
            kf_train = np.append(kf_train, fold_indices[j])
        kf_tuple = [kf_train, kf_test]
        yield (kf_train, kf_test)
        
def cross_validation(model, groups_train, kfold_generator, X, y, \
                     folds=10, th=0.5, verbose=False):    
    total_score = 0.
    total_ac_score = 0.
    for i, tuple_indices in enumerate(kfold_generator(groups_train, n_splits=folds)):
        train_index, test_index = tuple_indices
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:,1]  
            
        score = f1_score(y_test, (y_pred - th > 0))
        ac_score = accuracy_score(y_test, (y_pred - th > 0))
        total_score += score
        total_ac_score += ac_score
        if verbose:
            print(i, "score:", score)
    mean_score = total_score / folds
    mean_ac_score = total_ac_score / folds
    if verbose:
        print("MEAN_SCORE:", mean_score)
    return mean_score, mean_ac_score

def grid_cv(alpha_list, C_list, max_epoch_list, th_list, max_features_list, \
            X, y, groups_train, kfold_generator, batch_generator, \
            model_type='logreg', folds=10, repeats=1, verbose=True):
    sample_scores = np.array([])
    sample_ac_scores = np.array([])
    sample_params = []
    for alpha in alpha_list:
        for C in C_list:
            for max_epoch in max_epoch_list:
                for th in th_list: 
                    for max_features in max_features_list: 
                        X_curr = X[:,:max_features]
                        curr_mean_score_list = np.array([])
                        curr_mean_ac_score_list = np.array([])
                        for r in range(repeats):
                            if model_type == 'rforest_logreg' or model_type == 'logreg_rforest':
                                RForest_mod = RandomForestClassifier(max_depth=8, min_samples_split=10, \
                                                                     n_estimators=20, min_samples_leaf=5, \
                                                                     max_features=7, criterion='entropy')
                                SGD_mod = MySGDClassifier(batch_generator=batch_generator, \
                                                          model_type='logreg', \
                                                          alpha=alpha, C=C, max_epoch=max_epoch)
                                model = rforest_plus_logreg(SGD_mod, RForest_mod, th=th)
                            elif model_type == 'logreg' or model_type == 'linreg':
                                model = MySGDClassifier(batch_generator=batch_generator, \
                                                        model_type=model_type, \
                                                        alpha=alpha, C=C, max_epoch=max_epoch, th=th)
                            elif model_type == 'rforest':
                                model = RandomForestClassifier(max_depth=8, min_samples_split=10, \
                                                                     n_estimators=20, min_samples_leaf=5, \
                                                                     max_features=7, criterion='entropy')

                            curr_score, curr_ac_score = cross_validation(model, groups_train, \
                                                                         kfold_generator, X_curr, y, \
                                                                         folds=folds, th=th)
                            curr_mean_score_list = np.append(curr_mean_score_list, curr_score)
                            curr_mean_ac_score_list = np.append(curr_mean_ac_score_list, curr_ac_score)
                        curr_mean_score = curr_mean_score_list.mean()
                        curr_mean_ac_score = curr_mean_ac_score_list.mean()
                        sample_scores = np.append(sample_scores, curr_mean_score)
                        sample_ac_scores = np.append(sample_ac_scores, curr_mean_ac_score)
                        sample_tuple = (alpha, C, th, max_epoch, max_features)
                        sample_params.append(sample_tuple)
                        if verbose:
                            print("SCORE: %.5f" % curr_mean_score, end='\t')
                            print("ACC: %.3f" % curr_mean_ac_score, end='\t')
                            print("(alpha = %s; C = %s; max_epoch = %s; th = %s; max_features = %s)" \
                                  % (alpha, C, max_epoch, th, max_features))
    best_score_index = np.argmax(sample_scores)
    best_score = sample_scores[best_score_index]
    best_params = sample_params[best_score_index]
    if verbose:
        print("\nBEST SCORE:\t", best_score)
        print("BEST PARAMS:\t", best_params)
    return best_score, best_params, sample_scores, sample_params

In [241]:
def preprocessing_1(useful_words_tsv, min_length=0):
    doc_to_title = {}
    with open(useful_words_tsv) as f:
        for num_line, line in enumerate(f):
            if num_line == 0:
                continue
            data = line.strip().split('\t', 1)
            doc_id = int(data[0])
            if len(data) == 1:
                title = ''
            else:
                title = data[1]
#           магические 5 строчек!---------
            cur = re.split(r' ',title)
            title = ''
            for i in cur:
                if len(i) >= min_length:
                    title += i + ' '
#           ------------------------------          
            doc_to_title[doc_id] = title
    return doc_to_title

def preprocessing_2(train_or_test_groups_csv, doc_to_title, train=True):
    train_data = pd.read_csv(train_or_test_groups_csv)
    traingroups_titledata = {}
    for i in range(len(train_data)):
        new_doc = train_data.iloc[i]
        doc_group = new_doc['group_id']
        doc_id = new_doc['doc_id']
        title = doc_to_title[doc_id]
        if doc_group not in traingroups_titledata:
            traingroups_titledata[doc_group] = []
        if train:
            target = new_doc['target']
            traingroups_titledata[doc_group].append((doc_id, title, target))
        else:
            traingroups_titledata[doc_group].append((doc_id, title))
    return traingroups_titledata

def preprocessing_3_old(traingroups_titledata, num_features=15, train=True):
    y_train = []
    X_train = []
    groups_train = []
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group] 
        for k, tup in enumerate(docs):
            if train:
                doc_id, title, target_id = tup
                y_train.append(target_id)
            else:
                doc_id, title = tup
            groups_train.append(new_group)
            all_dist = []
            words = set(title.strip().split())
            for j in range(0, len(docs)):
                if k == j:
                    continue
                if train:
                    doc_id_j, title_j, target_j = docs[j]
                else:
                    doc_id_j, title_j = docs[j]
                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            X_train.append(sorted(all_dist, reverse=True)[0:num_features])
    if train:
        return np.array(X_train), np.array(y_train), np.array(groups_train)
    else:
        return np.array(X_train), np.array([]), np.array(groups_train)
    
def preprocessing_3(traingroups_titledata, num_features=15, num_tfidf_features=30, train=True):
    y = []
    X = []
    groups = []
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group] 
        list_data = []
        for k, tup in enumerate(docs):
            if train:
                doc_id, title, target_id = tup
                y.append(target_id)
            else:
                doc_id, title = tup
            list_data.append(title)  
            groups.append(new_group)

        vectorizer = TfidfVectorizer(max_features=num_tfidf_features)
        group_voc = vectorizer.fit_transform(list_data)
        dist = cosine_similarity(group_voc, group_voc)
        X_curr  = np.sort(dist, axis=1)[:,-(num_features+1):-1][:,::-1]
        X.append(X_curr)
    X = np.vstack(X)
    if train:
        return np.array(X), np.array(y), np.array(groups)
    else:
        return np.array(X), np.array(groups)
    
def preprocessing(useful_words_tsv, train_or_test_groups_csv, min_length, num_features, num_tfidf_features, 
                  train=True):
    doc_to_title = preprocessing_1(useful_words_tsv, min_length=min_length)
    traingroups_titledata = preprocessing_2(train_or_test_groups_csv, doc_to_title, train=train)
    tup = preprocessing_3(traingroups_titledata, num_features=num_features, \
                                    num_tfidf_features=num_tfidf_features, train=train)
    return tup

In [274]:
def total_cross_validation_old(alpha, C, max_epoch, th, \
                           model_list, X_list, y, \
                           groups_train, kfold_generator, folds=10, verbose=False):    
    total_score = 0.
    total_ac_score = 0.
    for i, tuple_indices in enumerate(kfold_generator(groups_train, n_splits=folds)):
        train_index, test_index = tuple_indices
        first_train_index, first_test_index, second_train_index, second_test_index = \
            train_index[0::2], test_index[0::2], \
            train_index[1::2], test_index[1::2]
        
        y_train, y_test = y[first_train_index], y[first_test_index]
        y_pred_model = []
        for model, X in zip(model_list, X_list):
            X_train, X_test = X[first_train_index], X[first_test_index]
            model.fit(X_train, y_train)
            y_pred_model.append(model.predict_proba(X_test)[:,1])
        
        X_train = np.vstack(y_pred_model).T       
        total_model = MySGDClassifier(batch_generator=batch_generator, model_type='logreg', \
                          alpha=alpha, C=C, max_epoch=max_epoch, th=th)
        total_model.fit(X_train, y_test)
        
        y_train, y_test = y[second_train_index], y[second_test_index]
        y_pred_model = []
        for model, X in zip(model_list, X_list):
            X_train, X_test = X[second_train_index], X[second_test_index]
            model.fit(X_train, y_train)
            y_pred_model.append(model.predict_proba(X_test)[:,1])

        X_test = np.vstack(y_pred_model).T       
        y_pred = total_model.predict(X_test)[:,1]
        
        score = f1_score(y_test, y_pred)
        ac_score = accuracy_score(y_test, y_pred)
        total_score += score
        total_ac_score += ac_score
        if verbose:
            print(i, "score:", score)
    mean_score = total_score / folds
    mean_ac_score = total_ac_score / folds
    if verbose:
        print("MEAN_SCORE:", mean_score)
    return mean_score, mean_ac_score

def total_grid_cv(alpha_list, C_list, max_epoch_list, th_list, \
                  model_list, X_list, y, \
                  groups_train, kfold_generator, batch_generator, \
                  folds=10, repeats=1, verbose=True):
    sample_scores = np.array([])
    sample_ac_scores = np.array([])
    sample_params = []
    for alpha in alpha_list:
        for C in C_list:
            for max_epoch in max_epoch_list:
                for th in th_list: 
                    curr_mean_score_list = np.array([])
                    curr_mean_ac_score_list = np.array([])
                    for r in range(repeats):
                        curr_score, curr_ac_score = total_cross_validation(alpha, C, max_epoch, th, \
                                                      model_list, X_list, y, \
                                                      groups_train, kfold_generator, folds=folds)
                        curr_mean_score_list = np.append(curr_mean_score_list, curr_score)
                        curr_mean_ac_score_list = np.append(curr_mean_ac_score_list, curr_ac_score)
                    curr_mean_score = curr_mean_score_list.mean()
                    curr_mean_ac_score = curr_mean_ac_score_list.mean()
                    sample_scores = np.append(sample_scores, curr_mean_score)
                    sample_ac_scores = np.append(sample_ac_scores, curr_mean_ac_score)
                    sample_tuple = (alpha, C, max_epoch, th)
                    sample_params.append(sample_tuple)
                    if verbose:
                        print("SCORE: %.5f" % curr_mean_score, end='\t')
                        print("ACC: %.3f" % curr_mean_ac_score, end='\t')
                        print("(alpha = %s; C = %s; max_epoch = %s; th = %s)" % (alpha, C, max_epoch, th))
    best_score_index = np.argmax(sample_scores)
    best_score = sample_scores[best_score_index]
    best_ac_score = sample_ac_scores[best_score_index]
    best_params = sample_params[best_score_index]
    if verbose:
        print("\nBEST SCORE:\t", best_score)
        print("BEST PARAMS:\t", best_params)
    return best_score, best_params, sample_scores, sample_params

In [345]:
class total_model_class(BaseEstimator, ClassifierMixin):
    def __init__(self, model_list, alpha=0.1, C=100, max_epoch=10, th=0.5):
        self.model_list = model_list
        self.th = th
        self.weights = []
        self.total_model = MySGDClassifier(batch_generator=batch_generator, model_type='logreg', \
                                                         alpha=alpha, C=C, max_epoch=max_epoch)
    def fit(self, X_first, y_first):
        X_train_list, X_test_list = X_first
        y_train, y_test = y_first
        y_pred_model = []
        for model, X_train, X_test in zip(self.model_list, X_train_list, X_test_list):
            model.fit(X_train, y_train)
            y_pred_model.append(model.predict_proba(X_test)[:,1])   
        X_train_total = np.vstack(y_pred_model).T 
    
        self.total_model.fit(X_train_total, y_test)
        self.weights = self.total_model.weights
        return self
    
    def predict_proba(self, X_second, y_second):
        X_train_list, X_test_list = X_second
        y_train = y_second
        y_pred_model = []
        for model, X_train, X_test in zip(self.model_list, X_train_list, X_test_list):
            model.fit(X_train, y_train)
            y_pred_model.append(model.predict_proba(X_test)[:,1])
        X_test_total = np.vstack(y_pred_model).T  

        y_pred = self.total_model.predict_proba(X_test_total)
        return y_pred
    
    def predict(self, X_second, y_second):
        y_pred = self.predict_proba(X_second, y_second)[:,1]
        return (y_pred - self.th > 0).astype(int)

def total_cross_validation(alpha, C, max_epoch, th, \
                           model_list, X_list, y, \
                           groups_train, kfold_generator, folds=10, verbose=False):    
    total_score = 0.
    total_ac_score = 0.
    for i, tuple_indices in enumerate(kfold_generator(groups_train, n_splits=folds)):
        train_index, test_index = tuple_indices
        first_train_index, first_test_index, second_train_index, second_test_index = \
            train_index[0::2], test_index[0::2], \
            train_index[1::2], test_index[1::2]
        
        X_first_tr_list = []
        X_first_tst_list = []
        for X in X_list:
            X_first_tr_list.append(X[first_train_index])
            X_first_tst_list.append(X[first_test_index])
        y_first = y[first_train_index], y[first_test_index]
        X_first = X_first_tr_list, X_first_tst_list
            
        X_second_tr_list = []
        X_second_tst_list = []
        for X in X_list:
            X_second_tr_list.append(X[second_train_index])
            X_second_tst_list.append(X[second_test_index])
        y_second = y[second_train_index]
        X_second = X_second_tr_list, X_second_tst_list


        total_model = total_model_class(model_list, alpha=alpha, C=C, max_epoch=max_epoch, th=th)
        total_model.fit(X_first, y_first)
        y_pred = total_model.predict(X_second, y_second)
        
        score = f1_score(y[second_test_index], y_pred)
        ac_score = accuracy_score(y[second_test_index], y_pred)
        total_score += score
        total_ac_score += ac_score
        if verbose:
            print(i, "score:", score)
    mean_score = total_score / folds
    mean_ac_score = total_ac_score / folds
    if verbose:
        print("MEAN_SCORE:", mean_score)
    return mean_score, mean_ac_score

# 1. Fit мелких моделей

## a) title_model

#### подбор параметров

In [221]:
useful_words_tsv = 'upload/title_output_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000

X_tr_title, y_tr_title, groups_train = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features) 

scaler = StandardScaler()
scaler.fit(X_tr_title)
X_tr_title_sc = scaler.transform(X_tr_title)
print(X_tr_title_sc.shape)

(11690, 25)


In [222]:
alpha_list = [0.1]
C_list = [500]
max_epoch_list = [8]
th_list = [0.27]
max_features_list = [25, 20, 15]

best_score, _, sample_scores, _ = grid_cv(alpha_list, C_list, max_epoch_list, th_list, max_features_list, \
                                    X_tr_title_sc, y_tr_title, groups_train, kfold_generator, batch_generator, \
                                    model_type='logreg', folds=6, repeats=6, verbose=True)

SCORE: 0.69173	ACC: 0.806	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 25)
SCORE: 0.68302	ACC: 0.806	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 20)
SCORE: 0.68349	ACC: 0.801	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 15)

BEST SCORE:	 0.6917299594885967
BEST PARAMS:	 (0.1, 500, 0.27, 8, 25)


#### создание модели

In [226]:
useful_words_tsv = 'upload/title_output_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000


X_tr_title, y_tr_title, groups_train = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features) 
scaler = StandardScaler()
scaler.fit(X_tr_title)
X_tr_title_sc = scaler.transform(X_tr_title)
print(X_tr_title_sc.shape)

(11690, 25)


In [227]:
alpha = 0.1
C = 500
max_epoch = 8

title_model = MySGDClassifier(batch_generator=batch_generator, model_type='logreg', \
                            alpha=alpha, C=C, max_epoch=max_epoch) 
title_model.fit(X_tr_title_sc, y_tr_title)

MySGDClassifier(C=500, alpha=0.1,
                batch_generator=<function batch_generator at 0x7fd89abb59d8>,
                batch_size=50, max_epoch=8, model_type='logreg', th=0.27)

In [414]:
y_pred_list = []
y_pred = h1_model.predict_proba(X_tr_title_sc)[:,1]
y_pred_list.append(y_pred)

## b) h1_model

#### подбор параметров

In [228]:
useful_words_tsv = 'upload/h1_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000

X_tr_h1, y_tr_h1, groups_train = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features) 
scaler = StandardScaler()
scaler.fit(X_tr_h1)
X_tr_h1_sc = scaler.transform(X_tr_h1)
print(X_tr_h1_sc.shape)

(11690, 25)


In [229]:
alpha_list = [0.1]
C_list = [500]
max_epoch_list = [8]
th_list = [0.27]
max_features_list = [25,20,15,10,5]

best_score, _, sample_scores, _ = grid_cv(alpha_list, C_list, max_epoch_list, th_list, max_features_list, \
                                    X_tr_h1_sc, y_tr_h1, groups_train, kfold_generator, batch_generator, \
                                    model_type='logreg', folds=6, repeats=4, verbose=True)

SCORE: 0.68499	ACC: 0.807	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 25)
SCORE: 0.67993	ACC: 0.808	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 20)
SCORE: 0.68027	ACC: 0.805	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 15)
SCORE: 0.67516	ACC: 0.801	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 10)
SCORE: 0.67074	ACC: 0.795	(alpha = 0.1; C = 500; max_epoch = 8; th = 0.27; max_features = 5)

BEST SCORE:	 0.6849914256458396
BEST PARAMS:	 (0.1, 500, 0.27, 8, 25)


#### создание модели

In [231]:
useful_words_tsv = 'upload/h1_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000

X_tr_h1, y_tr_h1, groups_train = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features = num_tfidf_features) 
scaler = StandardScaler()
scaler.fit(X_tr_h1)
X_tr_h1_sc = scaler.transform(X_tr_h1)
print(X_tr_h1_sc.shape)

(11690, 25)


In [232]:
alpha = 0.1
C = 500
max_epoch = 8

h1_model = MySGDClassifier(batch_generator=batch_generator, model_type='logreg', \
                            alpha=alpha, C=C, max_epoch=max_epoch) 
h1_model.fit(X_tr_h1_sc, y_tr_h1)

MySGDClassifier(C=500, alpha=0.1,
                batch_generator=<function batch_generator at 0x7fd89abb59d8>,
                batch_size=50, max_epoch=8, model_type='logreg', th=0.5)

In [415]:
y_pred = h1_model.predict_proba(X_tr_h1_sc)[:,1]
y_pred_list.append(y_pred)

## с) model

# 2. Fit общей модели

In [431]:
model_list = [title_model, h1_model]
X_list = [X_tr_h1_sc, X_tr_title_sc]
y_train = y_tr_title

#### подбор параметров

In [432]:
alpha_list = [0.15]
C_list = [500]
max_epoch_list = [10, 30]
th_list = [0.27, 0.28, 0.29]

best_score, _, sample_scores, _ = total_grid_cv(alpha_list, C_list, max_epoch_list, th_list, \
                                    model_list, X_list, y_train, \
                                    groups_train, kfold_generator, batch_generator, \
                                    folds=3, repeats=2, verbose=True)

SCORE: 0.70591	ACC: 0.813	(alpha = 0.15; C = 500; max_epoch = 10; th = 0.27)
SCORE: 0.70710	ACC: 0.818	(alpha = 0.15; C = 500; max_epoch = 10; th = 0.28)
SCORE: 0.68521	ACC: 0.817	(alpha = 0.15; C = 500; max_epoch = 10; th = 0.29)
SCORE: 0.70672	ACC: 0.824	(alpha = 0.15; C = 500; max_epoch = 30; th = 0.27)
SCORE: 0.70680	ACC: 0.823	(alpha = 0.15; C = 500; max_epoch = 30; th = 0.28)
SCORE: 0.69715	ACC: 0.821	(alpha = 0.15; C = 500; max_epoch = 30; th = 0.29)

BEST SCORE:	 0.7070969040158275
BEST PARAMS:	 (0.15, 500, 10, 0.28)


#### создание модели

In [449]:
alpha = 0.15
C = 500
max_epoch = 30
th = 0.27

total_model = total_model_class(model_list, alpha=alpha, C=C, max_epoch=max_epoch, th=th)

In [462]:
repeats = 10
folds = 3
weights_list = []

for rep in range(repeats):
    for i, tuple_indices in enumerate(kfold_generator(groups_train, n_splits=folds)):
        train_index, test_index = tuple_indices
        X_first_tr_list = []
        X_first_tst_list = []
        for X in X_list:
            X_first_tr_list.append(X[train_index])
            X_first_tst_list.append(X[test_index])
        y_first = y[train_index], y[test_index]
        X_first = X_first_tr_list, X_first_tst_list

        total_model = total_model_class(model_list, alpha=alpha, C=C, max_epoch=max_epoch, th=th)
        total_model.fit(X_first, y_first)

        weights_list.append(total_model.weights)
weights = np.vstack(weights_list).mean(axis=0)
weights

array([-2.69508415,  2.54027195,  3.01626598])

In [465]:
pr = do_prediction(weights, y_pred_list, th=0.27).astype(int)
f1_score(pr, y_train), \
accuracy_score(pr, y_train)

(0.6994535519125683, 0.8306244653550042)

In [466]:
weights

array([-2.69508415,  2.54027195,  3.01626598])

In [459]:
weights

array([-1.43584684,  0.6779059 ,  0.78940243])

# 3. Test predict

In [319]:
def write_to_submission_file(predicted_labels, test_groups_csv, out_file, target='target', index_label="pair_id"):
    indices = np.asarray(pd.read_csv(test_groups_csv)[index_label])
    predicted_df = pd.DataFrame(predicted_labels, index = indices, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [408]:
def do_prediction(weights, y_pred_list, th=0.5):
    X = np.vstack(y_pred_list)
    X = np.vstack((np.ones(X.shape[1]), X)).T
    
    dot_func = lambda x: sigmoid(np.dot(x, weights))
    y_pred = np.apply_along_axis(dot_func, 1, X)
    return (y_pred - th > 0)

## Подготовка тестовых фичей для total_model

In [375]:
y_pred_list = []

### a) X_tst_title_sc

In [376]:
useful_words_tsv = 'upload/title_output_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000

X_tst_title, groups_test = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features, train=False) 

scaler = StandardScaler()
scaler.fit(X_tst_title)
X_tst_title_sc = scaler.transform(X_tst_title)
y_pred = title_model.predict_proba(X_tst_title_sc)[:,1]
y_pred_list.append(y_pred)

print(X_tst_title_sc.shape, len(y_pred_list))

(16627, 25) 1


### b) X_tst_h1_sc

In [377]:
useful_words_tsv = 'upload/h1_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 25
num_tfidf_features = 100000

X_tst_h1, groups_test = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features, train=False) 
scaler = StandardScaler()
scaler.fit(X_tst_h1)
X_tst_h1_sc = scaler.transform(X_tst_h1)
y_pred = h1_model.predict_proba(X_tst_h1_sc)[:,1]
y_pred_list.append(y_pred)

print(X_tst_h1_sc.shape, len(y_pred_list))

(16627, 25) 2


## prediction

In [413]:
y_pred = do_prediction(weights, y_pred_list, th=0.28)
write_to_submission_file(y_pred, 'test_groups.csv', "y_pred.csv")