# Определение каких-то функций

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
def kfold_generator(groups_train, n_splits=10):
    all_groups = np.unique(groups_train)
    fold_size = len(all_groups) // n_splits
    all_groups = np.random.permutation(all_groups)
    fold_groups = np.zeros((n_splits,fold_size), dtype=int)
    for i, group in enumerate(all_groups):
        fold = i // fold_size
        if fold == n_splits:
            break
        group_i = i % fold_size
        fold_groups[fold,group_i] = group
    fold_indices = {}
    for fold in range(n_splits):
        indices = np.array([], dtype = int)
        for group in fold_groups[fold]:
            indices = np.append(indices, np.argwhere(groups_train == group))
        fold_indices[fold] = indices

    for i in fold_indices:
        kf_test = fold_indices[i]
        kf_train = np.array([],dtype=int)
        for j in fold_indices:
            if i == j:
                continue
            kf_train = np.append(kf_train, fold_indices[j])
        kf_tuple = [kf_train, kf_test]
        yield (kf_train, kf_test)
        
def cross_validation(model, groups_train, kfold_generator, X, y, \
                     folds=10, th=0.5, verbose=False):    
    total_score = 0
    for i, tuple_indices in enumerate(kfold_generator(groups_train, n_splits=folds)):
        train_index, test_index = tuple_indices
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:,1]  
            
        score = f1_score(y_test, (y_pred - th > 0))
        total_score += score
        if verbose:
            print(i, "score:", score)
    mean_score = total_score / folds
    if verbose:
        print("MEAN_SCORE:", mean_score)
    return mean_score

def grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
            min_samples_leaf_list, n_estimators_list, \
            min_samples_split_list, max_depth_list, th_list, \
            X, y, groups_train, kfold_generator, \
            folds=10, repeats=1, verbose=True):
    sample_scores = np.array([])
    sample_params = []
    for criterion in criterion_list:
        for min_impurity_decrease in min_impurity_decrease_list:
            for max_features in max_features_list:
                for min_samples_leaf in min_samples_leaf_list:
                    for n_estimators in n_estimators_list:
                        for min_samples_split in min_samples_split_list:
                            for max_depth in max_depth_list:
                                for th in th_list:
                                    curr_mean_score_list = np.array([])
                                    for r in range(repeats):
                                        model = RandomForestClassifier(random_state=0, \
                                                                     max_depth=max_depth, \
                                                                     criterion=criterion, \
                                                                     min_samples_split=min_samples_split, \
                                                                     n_estimators=n_estimators, \
                                                                     min_samples_leaf=min_samples_leaf, \
                                                                     max_features=max_features)
                                        curr_score = cross_validation(model, groups_train, kfold_generator, \
                                                                      X, y, folds=folds, th=th)
                                        curr_mean_score_list = np.append(curr_mean_score_list, curr_score)
                                    curr_mean_score = curr_mean_score_list.mean()
                                    sample_scores = np.append(sample_scores, curr_mean_score)
                                    sample_tuple = (criterion, min_impurity_decrease, max_features, \
                                                    min_samples_leaf, n_estimators, \
                                                    min_samples_split, max_depth, th)
                                    sample_params.append(sample_tuple)
                                    if verbose:
                                        print("SCORE: ", curr_mean_score, end='\t')
#                                         print("(%s; minID = %s; maxF = %s; minSL = %s; nE = %s; minSS= %s; maxD = %s; th = %s)" \
                                        print('(%s; %s; %s; %s; %s; %s; %s; %s)' \
                                              % (criterion, min_impurity_decrease, max_features, min_samples_leaf, n_estimators, min_samples_split, max_depth, th))
    best_score_index = np.argmax(sample_scores)
    best_score = sample_scores[best_score_index]
    best_params = sample_params[best_score_index]
    if verbose:
        print("\nBEST SCORE:\t", best_score)
        print("BEST PARAMS:\t", best_params)
    return best_score, best_params, sample_scores, sample_params

In [3]:
def preprocessing_1(useful_words_tsv, min_length=0):
    doc_to_title = {}
    with open(useful_words_tsv) as f:
        for num_line, line in enumerate(f):
            if num_line == 0:
                continue
            data = line.strip().split('\t', 1)
            doc_id = int(data[0])
            if len(data) == 1:
                title = ''
            else:
                title = data[1]
#           магические 5 строчек!---------
            cur = re.split(r' ',title)
            title = ''
            for i in cur:
                if len(i) >= min_length:
                    title += i + ' '
#           ------------------------------          
            doc_to_title[doc_id] = title
    return doc_to_title

def preprocessing_2(train_or_test_groups_csv, doc_to_title, train=True):
    train_data = pd.read_csv(train_or_test_groups_csv)
    traingroups_titledata = {}
    for i in range(len(train_data)):
        new_doc = train_data.iloc[i]
        doc_group = new_doc['group_id']
        doc_id = new_doc['doc_id']
        title = doc_to_title[doc_id]
        if doc_group not in traingroups_titledata:
            traingroups_titledata[doc_group] = []
        if train:
            target = new_doc['target']
            traingroups_titledata[doc_group].append((doc_id, title, target))
        else:
            traingroups_titledata[doc_group].append((doc_id, title))
    return traingroups_titledata

def preprocessing_3_old(traingroups_titledata, num_features=15, train=True):
    y_train = []
    X_train = []
    groups_train = []
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group] 
        for k, tup in enumerate(docs):
            if train:
                doc_id, title, target_id = tup
                y_train.append(target_id)
            else:
                doc_id, title = tup
            groups_train.append(new_group)
            all_dist = []
            words = set(title.strip().split())
            for j in range(0, len(docs)):
                if k == j:
                    continue
                if train:
                    doc_id_j, title_j, target_j = docs[j]
                else:
                    doc_id_j, title_j = docs[j]
                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            X_train.append(sorted(all_dist, reverse=True)[0:num_features])
    if train:
        return np.array(X_train), np.array(y_train), np.array(groups_train)
    else:
        return np.array(X_train), np.array([]), np.array(groups_train)
    
def preprocessing_3(traingroups_titledata, num_features=15, num_tfidf_features=30, train=True):
    y = []
    X = []
    groups = []
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group] 
        list_data = []
        for k, tup in enumerate(docs):
            if train:
                doc_id, title, target_id = tup
                y.append(target_id)
            else:
                doc_id, title = tup
            list_data.append(title)  
            groups.append(new_group)

        vectorizer = TfidfVectorizer(max_features=num_tfidf_features)
        group_voc = vectorizer.fit_transform(list_data)
        dist = cosine_similarity(group_voc, group_voc)
        X_curr  = np.sort(dist, axis=1)[:,-(num_features+1):-1][:,::-1]
        X.append(X_curr)
    X = np.vstack(X)
    if train:
        return np.array(X), np.array(y), np.array(groups)
    else:
        return np.array(X), np.array(groups)
    
def preprocessing(useful_words_tsv, train_or_test_groups_csv, min_length, num_features, num_tfidf_features, 
                  train=True):
    doc_to_title = preprocessing_1(useful_words_tsv, min_length=min_length)
    traingroups_titledata = preprocessing_2(train_or_test_groups_csv, doc_to_title, train=train)
    tup = preprocessing_3(traingroups_titledata, num_features=num_features, \
                                    num_tfidf_features=num_tfidf_features, train=train)
    return tup

# Предобработка

### Title

In [530]:
useful_words_tsv = 'upload/title_output_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 10
num_tfidf_features = 1000000

X_train, y_train, groups_train = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features, \
                                              num_tfidf_features=num_tfidf_features) 
print(X_train.shape, y_train.shape, groups_train.shape)

(11690, 10) (11690,) (11690,)


### h1

In [531]:
useful_words_tsv = 'upload/h1_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 10
num_tfidf_features = 1000000

X_train_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                       min_length=min_length, num_features=num_features, \
                                       num_tfidf_features=num_tfidf_features) 
X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 20)

### h2

In [445]:
useful_words_tsv = 'upload/h2_mystem.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 3
num_features = 0
num_tfidf_features = 1000000

X_train_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                       min_length=min_length, num_features=num_features, \
                                       num_tfidf_features=num_tfidf_features) 
 
X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 1)

### h3

In [446]:
# useful_words_tsv = 'upload/h3_mystem.txt'
# train_or_test_groups_csv = 'train_groups.csv'
# min_length = 3
# num_features = 10

# X_train_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
#                                               min_length=min_length, num_features=num_features) 
# X_train = np.hstack((X_train, X_train_addition))
# X_train.shape

### URL

In [532]:
useful_words_tsv = 'upload/useful_names.txt'
train_or_test_groups_csv = 'train_groups.csv'
min_length = 0
num_features = 3
num_tfidf_features = 1000000


X_train_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                       min_length=min_length, num_features=num_features, \
                                       num_tfidf_features = num_tfidf_features) 
X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 23)

### pics

In [533]:
def do_features(name, input_file, tr_or_tst_groups):
    feature = pd.read_csv(input_file, sep='\t', encoding='utf-8', lineterminator='\n')
    group_df = pd.read_csv(tr_or_tst_groups, sep=',', encoding='utf-8', lineterminator='\n')
    feature = pd.merge(feature, group_df, on='doc_id', how='right', sort=True).dropna()
    feature['mean_'+name] = feature.groupby('group_id')[name].transform('mean')
    feature = feature[['mean_'+name, 'pair_id',name]]
    feature['pair_id'] = feature['pair_id'].astype(int)    
    feature = pd.merge(group_df, feature, on='pair_id', how='left', sort=True)
    feature['diff_'+name] = np.abs(feature['mean_'+name] - feature[name])

    return np.asmatrix(feature['diff_'+name].to_numpy()).T

In [534]:
name = 'num_images'
input_file = 'upload/pics_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 24)

### digits

In [535]:
log = ''

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = log+'number_cntr'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = log+'year_psbl'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = log+'near_year_psbl'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = 'useful_num_ratio'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = log+'long_numbers'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = 'long_num_ratio'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))

input_file = 'upload/digits_output_statistics.txt'
tr_or_tst_groups = 'train_groups.csv'
name = log+'short_numbers'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)
X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 31)

### ?!...

In [536]:
name = 'excl_mrk_num'
input_file = 'upload/punctuation_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 32)

In [537]:
name = 'ques_mrk_num'
input_file = 'upload/punctuation_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 33)

In [538]:
name = 'poin_mrk_num'
input_file = 'upload/punctuation_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 34)

In [539]:
name = 'ellp_mrk_num'
input_file = 'upload/punctuation_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 35)

### size

In [540]:
name = 'file_size'
input_file = 'upload/size_output.txt'
tr_or_tst_groups = 'train_groups.csv'
X_train_addition = do_features(name, input_file, tr_or_tst_groups)

X_train = np.hstack((X_train, X_train_addition))
X_train.shape

(11690, 36)

### len

In [543]:
input_file = 'upload/lengths_output.txt'
df_temp = pd.read_csv(input_file, sep='\t', encoding='utf-8', lineterminator='\n')

In [490]:
df_temp.to_csv(input_file, sep='\t', index=False)

In [529]:
len(df_temp.doc_id.unique())

17800

In [526]:
def do_features(name, input_file, tr_or_tst_groups):
    feature = pd.read_csv(input_file, sep='\t', encoding='utf-8', lineterminator='\n')
    group_df = pd.read_csv(tr_or_tst_groups, sep=',', encoding='utf-8', lineterminator='\n')
    feature = pd.merge(feature, group_df, on='doc_id', how='right', sort=True).dropna()
    print(feature.shape)
    feature['mean_'+name] = feature.groupby('group_id')[name].transform('mean')
    feature = feature[['mean_'+name, 'pair_id',name]]
    feature['pair_id'] = feature['pair_id'].astype(int)    
    feature = pd.merge(group_df, feature, on='pair_id', how='left', sort=True)
    print(feature.shape)
    feature['diff_'+name] = np.abs(feature['mean_'+name] - feature[name])
#     print(feature['mean_'+name])
    return np.asmatrix(feature['diff_'+name].to_numpy()).T

In [544]:
input_file = 'upload/lengths_output.txt'
tr_or_tst_groups = 'train_groups.csv'
df_temp = pd.read_csv(input_file, sep='\t', encoding='utf-8', lineterminator='\n')
for i, n in enumerate(df_temp.columns):
    print(i, n)
    if i == 0 :
        continue
    X_train_addition = do_features(str(n), input_file, tr_or_tst_groups)
#     print(X_train.shape)
    X_train = np.hstack((X_train, X_train_addition))

0 doc_id
1 addres


ValueError: all the input array dimensions except for the concatenation axis must match exactly

### ngrams

In [395]:
# useful_words_tsv = 'ngrams.txt'
# train_or_test_groups_csv = 'train_groups.csv'
# min_length = 0
# num_features = 0

# X_train_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
#                                               min_length=min_length, num_features=num_features) 
# X_train = np.hstack((X_train, X_train_addition))
# X_train.shape

### Масштабирование

In [541]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)

# Подбор параметров

In [542]:
criterion_list = ['gini']
min_impurity_decrease_list = [0.5]
max_features_list = [5, 15]
min_samples_leaf_list = [4]
n_estimators_list = [100, 200, 300]
min_samples_split_list = [9]
max_depth_list = [10]
th_list = [0.3]

best_score, _, sample_scores, _ = grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
                                    min_samples_leaf_list, n_estimators_list, \
                                    min_samples_split_list, max_depth_list, th_list, \
                                    X_train_scale, y_train, groups_train, kfold_generator, \
                                    folds=3, repeats=2, verbose=True)

SCORE:  0.71241625360844	(gini; 0.5; 5; 4; 100; 9; 10; 0.3)
SCORE:  0.7153694009476739	(gini; 0.5; 5; 4; 200; 9; 10; 0.3)
SCORE:  0.7129454348696673	(gini; 0.5; 5; 4; 300; 9; 10; 0.3)
SCORE:  0.7149752279162866	(gini; 0.5; 15; 4; 100; 9; 10; 0.3)
SCORE:  0.7153119540768076	(gini; 0.5; 15; 4; 200; 9; 10; 0.3)
SCORE:  0.7136465882400134	(gini; 0.5; 15; 4; 300; 9; 10; 0.3)

BEST SCORE:	 0.7153694009476739
BEST PARAMS:	 ('gini', 0.5, 5, 4, 200, 9, 10, 0.3)


In [164]:
criterion_list = ['gini']
min_impurity_decrease_list = [0.5]
max_features_list = [5, 7, 9]
min_samples_leaf_list = [4]
n_estimators_list = [90, 120]
min_samples_split_list = [9]
max_depth_list = [10]
th_list = [0.3]

best_score, _, sample_scores, _ = grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
                                    min_samples_leaf_list, n_estimators_list, \
                                    min_samples_split_list, max_depth_list, th_list, \
                                    X_train_scale, y_train, groups_train, kfold_generator, \
                                    folds=3, repeats=2, verbose=True)

SCORE:  0.6485250133136395	(gini; 0.5; 5; 4; 90; 9; 10; 0.3)
SCORE:  0.6510352034170339	(gini; 0.5; 5; 4; 120; 9; 10; 0.3)
SCORE:  0.6424916967400158	(gini; 0.5; 7; 4; 90; 9; 10; 0.3)
SCORE:  0.6483689066974057	(gini; 0.5; 7; 4; 120; 9; 10; 0.3)
SCORE:  0.6515282152311481	(gini; 0.5; 9; 4; 90; 9; 10; 0.3)
SCORE:  0.6534668821352425	(gini; 0.5; 9; 4; 120; 9; 10; 0.3)

BEST SCORE:	 0.6534668821352425
BEST PARAMS:	 ('gini', 0.5, 9, 4, 120, 9, 10, 0.3)


In [115]:
criterion_list = ['gini']
min_impurity_decrease_list = [0.5]
max_features_list = [7, 14]
min_samples_leaf_list = [4]
n_estimators_list = [120, 320]
min_samples_split_list = [9]
max_depth_list = [10]
th_list = [0.3]

best_score, _, sample_scores, _ = grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
                                    min_samples_leaf_list, n_estimators_list, \
                                    min_samples_split_list, max_depth_list, th_list, \
                                    X_train_scale, y_train, groups_train, kfold_generator, \
                                    folds=3, repeats=2, verbose=True)

SCORE:  0.6960519380004604	(gini; 0.5; 7; 4; 120; 9; 10; 0.3)
SCORE:  0.6938376377591305	(gini; 0.5; 7; 4; 320; 9; 10; 0.3)
SCORE:  0.6959058306501704	(gini; 0.5; 14; 4; 120; 9; 10; 0.3)
SCORE:  0.6926607575373621	(gini; 0.5; 14; 4; 320; 9; 10; 0.3)

BEST SCORE:	 0.6960519380004604
BEST PARAMS:	 ('gini', 0.5, 7, 4, 120, 9, 10, 0.3)


In [29]:
criterion_list = ['gini']
min_impurity_decrease_list = [0.5]
max_features_list = [14]
min_samples_leaf_list = [4]
n_estimators_list = [50, 160, 320]
min_samples_split_list = [9]
max_depth_list = [10]
th_list = [0.3]

best_score, _, sample_scores, _ = grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
                                    min_samples_leaf_list, n_estimators_list, \
                                    min_samples_split_list, max_depth_list, th_list, \
                                    X_train_scale, y_train, groups_train, kfold_generator, \
                                    folds=3, repeats=8, verbose=True)

SCORE:  0.7076414291227606	(gini; 0.5; 14; 4; 50; 9; 10; 0.3)
SCORE:  0.7109649291094572	(gini; 0.5; 14; 4; 160; 9; 10; 0.3)
SCORE:  0.7139249060904231	(gini; 0.5; 14; 4; 320; 9; 10; 0.3)

BEST SCORE:	 0.7139249060904231
BEST PARAMS:	 ('gini', 0.5, 14, 4, 320, 9, 10, 0.3)


In [165]:
model = RandomForestClassifier(max_depth=10, min_samples_split=9, n_estimators=120,\
                   min_samples_leaf=4, max_features=9, criterion='gini')
model.fit(X_train_scale, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=120,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [166]:
imp = model.feature_importances_
f_num = np.array(range(1,len(imp)+1))
tab = np.vstack((f_num, imp)).T
df = pd.DataFrame(tab, columns=['id', 'power'])
df = df.sort_values(by=['power'], ascending=False)
df = df.reset_index()
df = df.drop(labels=['index'], axis=1)
df

Unnamed: 0,id,power
0,27.0,0.167166
1,28.0,0.104157
2,26.0,0.068981
3,20.0,0.058993
4,1.0,0.05684
5,25.0,0.055876
6,3.0,0.034748
7,17.0,0.033981
8,2.0,0.031484
9,29.0,0.030563


In [15]:
criterion_list = ['gini']
min_impurity_decrease_list = [0.5]
max_features_list = [14]
min_samples_leaf_list = [4]
n_estimators_list = [160]
min_samples_split_list = [9]
max_depth_list = [10]
th_list = [0.3]

best_score, _, sample_scores, _ = grid_cv(criterion_list, min_impurity_decrease_list, max_features_list, \
                                    min_samples_leaf_list, n_estimators_list, \
                                    min_samples_split_list, max_depth_list, th_list, \
                                    X_train_scale, y_train, groups_train, kfold_generator, \
                                    folds=3, repeats=32, verbose=True)
    
#     alpha_list, C_list, max_epoch_list, th_list, balance_ratio_list, \
#                                     X_train_scale, y_train, groups_train, kfold_generator, batch_generator, \
#                                     model_type='rforest_logreg', folds=3, repeats=32, verbose=True)

SCORE:  0.6961779561284294	(gini; 0.5; 14; 4; 160; 9; 10; 0.3)

BEST SCORE:	 0.6961779561284294
BEST PARAMS:	 ('gini', 0.5, 14, 4, 160, 9, 10, 0.3)


# Predict

In [118]:
def write_to_submission_file(predicted_labels, test_groups_csv, out_file, target='target', index_label="pair_id"):
    indices = np.asarray(pd.read_csv(test_groups_csv)[index_label])
    predicted_df = pd.DataFrame(predicted_labels, index = indices, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [119]:
useful_words_tsv = 'upload/title_output_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 20

X_test, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                          min_length=min_length, num_features=num_features, train=False) 
print(X_test.shape)

(16627, 20)


In [120]:
useful_words_tsv = 'upload/useful_names.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 5

X_test_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                          min_length=min_length, num_features=num_features, train=False) 
X_test = np.hstack((X_test, X_test_addition))
X_test.shape

(16627, 5)


In [124]:
X_test_addition = pd.read_csv('upload/diff_features.csv', encoding='utf-8', lineterminator='\n')['diff']
X_test_addition = np.array(X_test_addition.tolist(), ndmin=2).T

X_test = np.hstack((X_test, X_test_addition))
X_test.shape

(16627, 27)

In [None]:
useful_words_tsv = 'upload/h1_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 10

X_test_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features) 
X_test = np.hstack((X_test, X_test_addition))
X_test.shape

In [None]:
useful_words_tsv = 'upload/h2_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 10

X_test_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features) 
X_test = np.hstack((X_test, X_test_addition))
X_test.shape

In [None]:
useful_words_tsv = 'upload/h3_mystem.txt'
train_or_test_groups_csv = 'test_groups.csv'
min_length = 3
num_features = 10

X_test_addition, _, _ = preprocessing(useful_words_tsv, train_or_test_groups_csv, \
                                              min_length=min_length, num_features=num_features) 
X_test = np.hstack((X_test, X_test_addition))
X_test.shape

In [125]:
scaler = StandardScaler()
scaler.fit(X_test)
X_test_scale = scaler.transform(X_test)

In [14]:
model = RandomForestClassifier(max_depth=8, min_samples_split=10, n_estimators=20,\
                   min_samples_leaf=5, max_features=7, criterion='entropy')

NameError: name 'X_test_scale' is not defined

In [None]:
model.fit(X_train_scale, y_train)
y_pred = model.predict_proba(X_test_scale)
write_to_submission_file(y_pred, 'test_groups.csv', "mod_rforest_pred_prob.csv")