In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize, StandardScaler
import joblib, pickle, random, csv

In [7]:
# Columns passed to the classifiers as the feature matrix.
feat_cols = [
    'freq',
    'query_results_count_num',
    'len_mt',
    'len_md',
    'len_kw',
    'len_w_mt',
    'len_w_md',
    'len_w_kw',
    'words_count',
    'words_count_sw',
    'spamity',
    'max_spam',
    'water_content',
    'tf_idf',
    'density',
]

# Candidate target columns present in the dataset.
target_cols = [
    'pos',
    'real_pos',
    'page',
    'is_first_page',
]

# Features for which per-query quartile "borders" are exported.
cols_to_analyze = [
    'spamity',
    'water_content',
    'tf_idf',
    'density',
]

# The source dataset is a semicolon-separated CSV.
df = pd.read_csv('df_to_model_source.csv', sep=';')

# Distinct search queries, in order of first appearance; one model pair is built per query.
queries = df['search_query_n'].unique()


In [8]:
def transliterate(name):
    """
    Convert a (mostly Russian) string into an ASCII-friendly identifier.

    Cyrillic letters are replaced by Latin equivalents, spaces become
    underscores and the listed punctuation characters are removed. Characters
    that are not in the table (Latin letters, digits, '_', '.', ...) are left
    untouched.

    Original mapping table by LarsKort (16/07/2011). The table does not claim
    to be complete; extend it with further characters when needed.

    Quirks of the table that are kept as-is:
      * uppercase 'Ы' maps to lowercase 'y';
      * lowercase 'ґ', 'ї', 'є' are removed, whereas uppercase 'Ґ', 'Ї', 'Є'
        map to lowercase 'g', 'i', 'e';
      * several letters share one output (e.g. 'и' and 'й' both give 'i'),
        so the conversion is not reversible.

    :param name: string to convert.
    :return: the converted string.
    """
    # Lowercase Cyrillic letters.
    lower_map = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
        'ё': 'yo', 'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'i', 'к': 'k',
        'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r',
        'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'c',
        'ч': 'ch', 'ш': 'sh', 'щ': 'sch', 'ъ': '', 'ы': 'y', 'ь': '',
        'э': 'e', 'ю': 'u', 'я': 'ya',
    }
    # Uppercase Cyrillic letters.
    upper_map = {
        'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E',
        'Ё': 'YO', 'Ж': 'ZH', 'З': 'Z', 'И': 'I', 'Й': 'I', 'К': 'K',
        'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R',
        'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'H', 'Ц': 'C',
        'Ч': 'CH', 'Ш': 'SH', 'Щ': 'SCH', 'Ъ': '', 'Ы': 'y', 'Ь': '',
        'Э': 'E', 'Ю': 'U', 'Я': 'YA',
    }
    # Punctuation, whitespace and a few Ukrainian letters.
    other_map = {
        ',': '', '?': '', ' ': '_', '~': '', '!': '', '@': '', '#': '',
        '$': '', '%': '', '^': '', '&': '', '*': '', '(': '', ')': '',
        '-': '', '=': '', '+': '', ':': '', ';': '', '<': '', '>': '',
        '\'': '', '"': '', '\\': '', '/': '', '№': '', '[': '', ']': '',
        '{': '', '}': '', 'ґ': '', 'ї': '', 'є': '', 'Ґ': 'g', 'Ї': 'i',
        'Є': 'e', '—': '',
    }
    replacements = {**lower_map, **upper_map, **other_map}

    # Apply every substitution in turn over the whole string.
    result = name
    for source_char, target in replacements.items():
        result = result.replace(source_char, target)
    return result


In [9]:
# Train one RandomForest and one GradientBoosting classifier per search query,
# save both models under ../app/models/ and collect, per query, the model file
# names plus the 25%/75% quartile "borders" of the analysed features computed
# over the rows with page == 0. The resulting dictionary is pickled to
# ../app/query_dict_data.pkl.
#
# Both models are fitted on ALL rows of a query: there is no train/test split
# and therefore no hold-out evaluation in this cell. Earlier revisions called
# RF.predict(X_test) / GBC.predict(X_test) although X_test was never defined
# (the train_test_split call had been disabled), which raised NameError on a
# fresh kernel; those unused predictions have been removed.
models_per_q = dict()
for query in queries:
    df_query = df[df['search_query_n'] == query]

    X_train = df_query[feat_cols]
    y_train = df_query['is_first_page']

    RF = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42) # binary
    RF.fit(X_train, y_train)

    GBC = GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=5,
        n_estimators= 100,
        random_state = 42
    )
    GBC.fit(X_train, y_train)

    models_per_q[query] = dict()

    # NOTE(review): transliterate() maps several characters to the same output
    # (e.g. 'и' and 'й' both become 'i'), so two distinct queries could produce
    # the same file name and overwrite each other's model - confirm the query
    # set cannot collide.
    filename = transliterate(query)+'_rf.sav'
    models_per_q[query]['model_rf'] = filename
    joblib.dump(RF, '../app/models/'+filename)

    filename = transliterate(query)+'_gbc.sav'
    models_per_q[query]['model_gbc'] = filename
    joblib.dump(GBC, '../app/models/'+filename)

    # Filter the page == 0 rows once and describe each column once instead of
    # repeating both operations for every quartile.
    page_zero_rows = df_query[df_query['page'] == 0]
    borders = dict()
    for col in cols_to_analyze:
        col_stats = page_zero_rows[col].describe()
        borders[col] = [col_stats['25%'], col_stats['75%']]
    models_per_q[query]['borders'] = borders

# The context manager closes the file even if pickling fails.
with open("../app/query_dict_data.pkl", "wb") as a_file:
    pickle.dump(models_per_q, a_file)

In [10]:
models_per_q

{'керамзитобетонный блок в название_город': {'model_rf': 'keramzitobetonnyi_blok_v_nazvanie_gorod_rf.sav',
  'model_gbc': 'keramzitobetonnyi_blok_v_nazvanie_gorod_gbc.sav',
  'borders': {'spamity': [0.036960199951196405, 0.06522276392780524],
   'water_content': [0.11359012780261948, 0.16872470536433645],
   'tf_idf': [0.016496229232281424, 0.04689328765901235],
   'density': [0.01346302802209233, 0.03827090645454176]}},
 'керамзитоблок название_город': {'model_rf': 'keramzitoblok_nazvanie_gorod_rf.sav',
  'model_gbc': 'keramzitoblok_nazvanie_gorod_gbc.sav',
  'borders': {'spamity': [0.041488459272202696, 0.06483556355712182],
   'water_content': [0.11536727708237007, 0.15984180729342243],
   'tf_idf': [0.003141070777913622, 0.01827353498082245],
   'density': [0.001990175797117229, 0.011578073089700996]}},
 'фбс название_город': {'model_rf': 'fbs_nazvanie_gorod_rf.sav',
  'model_gbc': 'fbs_nazvanie_gorod_gbc.sav',
  'borders': {'spamity': [0.0380952380952381, 0.06780776826859776],
   