#### Импорт библиотек

In [1]:
import pandas as pd
import re
import numpy as np
import json
import pickle
from urllib.request import unquote
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

#### Чтение данных

In [2]:
# Path to the tab-separated dataset file.
file_path = "gender_age_dataset.txt"

In [3]:
# Load the dataset into a DataFrame; fields are separated by tabs.
df = pd.read_csv(file_path, sep='\t')

#### Сбор уникальных сайтов каждого пользователя в отдельный массив

In [4]:
# Flat list of visited URLs: each user's "visits" are parsed from the
# user_json column and de-duplicated per user before being appended, so a URL
# appears at most once per user but may repeat across users.
urls = []
for user_json in df['user_json']:
    visited = json.loads(user_json)['visits']
    urls.extend(set([entry['url'] for entry in visited]))

#### Объединим целевые переменные в одну

In [5]:
# Combined label "<age>;<gender>", so both targets form a single class value.
df['target_var'] = df['age']+";"+df['gender']

#### Удалим записи без пола или возраста

In [6]:
# Keep only the rows where neither gender nor age equals the '-' placeholder.
x = df[(df.gender != '-')&(df.age != '-')]

#### Вытащим доменное имя из адресов и подсчитаем топ 30000

In [7]:
# Compiled once: one or more "http://" / "https://" prefixes, an optional
# "www.", then the host captured as everything up to the next "/".
# A raw string is used so the regex escapes are not read as (invalid) string
# escapes, which newer Python versions warn about.
_SITE_RE = re.compile(r"(?:https?://)+(?:www\.)?([^/]*)")


def parse_site(url):
    """Extract the domain name from a (possibly percent-encoded) URL.

    The URL is stripped of surrounding whitespace and percent-decoded, then
    searched for the first "http://" or "https://" occurrence.

    Args:
        url: URL string.

    Returns:
        The host part without a leading "www." (this can be an empty string
        when nothing follows the scheme), or None when the text contains no
        "http://" / "https://" prefix.
    """
    # NOTE(review): `unquote` is imported from urllib.request at the top of
    # the file; its documented home is urllib.parse.
    match = _SITE_RE.search(unquote(url.strip()))
    return match.group(1) if match else None

In [8]:
# Domain of every collected URL; URLs for which parse_site returns nothing
# (None or an empty string) are dropped.
sites = [domain for domain in map(parse_site, urls) if domain]

In [9]:
# Occurrence count per domain, plus (domain, count) pairs ordered from the
# most to the least frequent. The sort is stable, so domains with equal
# counts keep the order in which they were first seen.
sites_counted = dict(Counter(sites))
site_count_tuple = sorted(sites_counted.items(), key=lambda pair: pair[1], reverse=True)

In [10]:
# Names of the 30000 most frequent domains as a 1-D string array.
# Only the names of the leading pairs are converted; turning the whole
# (domain, count) list into a 2-D array would coerce every count to a string
# and fail with an IndexError when the list is empty.
top_sites = np.array([domain for domain, _ in site_count_tuple[:30000]], dtype=str)

#### При разных запусках ноутбука топ может немного различаться (порядок сайтов с одинаковой частотой не фиксирован), поэтому лучше подсчитать его один раз и сохранить в файл. К тому же этот файл будет использоваться в скрипте для сдачи.

In [11]:
def save_sites(top_sites, path="top_sites"):
    """Write the site names to a file as one tab-separated UTF-8 record.

    Args:
        top_sites: Iterable of site name strings.
        path: Destination file; defaults to "top_sites" in the working
            directory.
    """
    payload = "\t".join(top_sites).encode("utf-8")
    # The context manager closes the file, so no explicit close() is needed.
    with open(path, 'wb') as file:
        file.write(payload)

In [12]:
def read_sites(path="top_sites"):
    """Read the tab-separated UTF-8 list of site names from a file.

    Args:
        path: Source file; defaults to "top_sites" in the working directory.

    Returns:
        1-D numpy string array of site names, in file order. An empty file
        yields an empty array.
    """
    with open(path, 'rb') as file:
        content = file.read().decode("utf-8")
    # "".split("\t") returns [""], which would add one bogus empty site name.
    names = content.split("\t") if content else []
    return np.array(names, dtype=str)

In [13]:
# Persist the computed top so later runs can reuse exactly the same list.
save_sites(top_sites)

In [14]:
# Reload the top from the file, replacing the in-memory array.
top_sites = read_sites()

#### Подсчитаем матрицу с признаками

In [15]:
# Zero-initialised matrix with one row per row of x and one column per top site.
# NOTE(review): np.zeros defaults to float64, so this dense array needs
# len(x) * len(top_sites) * 8 bytes; consider a smaller dtype or a sparse matrix.
sites_matrix = np.zeros((len(x),len(top_sites)))

In [16]:
# Set of the top site names, for constant-time membership tests.
top_site_set = set(top_sites)

In [17]:
# Fill the indicator matrix: cell [r, c] is set to 1 when the user in row r
# of x visited top site c at least once.
#
# The column of each top site is looked up in a dict built once, instead of
# scanning `top_sites` with np.where for every visited site. setdefault keeps
# the first position should a name ever repeat, matching np.where(...)[0][0].
site_column = {}
for col, name in enumerate(top_sites):
    site_column.setdefault(name, col)

ind = 0
for i, row in x.iterrows():
    if ind % 5000 == 0:
        print (ind)  # progress indicator
    visits = json.loads(row.user_json)['visits']

    # Distinct domains of this user; URLs that parse to None are skipped by
    # the lookup below because None is never a key of site_column.
    for site in {parse_site(visit['url']) for visit in visits}:
        col = site_column.get(site)
        if col is not None:
            sites_matrix[ind, col] = 1
    ind +=1

0
5000
10000
15000
20000
25000
30000
35000


In [18]:
# Wrap the indicator matrix in a DataFrame whose column labels are the site names.
matrix = pd.DataFrame(sites_matrix, columns = top_sites)

#### Построим pipeline и проверим его кроссвалидацией

In [19]:
# Feature selection by chi-squared score (percentile=73) followed by a
# multinomial naive Bayes classifier with smoothing alpha=1.78.
# `percentile` is passed by keyword: current scikit-learn releases declare it
# keyword-only, so the positional form fails there.
pipeline = make_pipeline(SelectPercentile(chi2, percentile=73), MultinomialNB(alpha=1.78))

In [20]:
# Mean of the 10 cross-validation fold scores for the combined age;gender label.
cross_val_score(pipeline, matrix, x.target_var.values, cv=10).mean()

0.31572860472226444

#### Продемонстрируем, что отбор 50% наиболее уверенных предсказаний повышает accuracy

In [21]:
def select_best_predictions(model, x, keep_fraction=0.5):
    """Predict only for the samples the model is most confident about.

    Confidence is the margin between the two largest class probabilities of a
    sample. Samples whose margin is at least the margin found at position
    ``int(n * (1 - keep_fraction))`` of the ascending-sorted margins are kept,
    so ties at the threshold can keep slightly more than the requested share.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``predict``.
        x: Feature rows (anything ``np.asarray`` turns into a 2-D array).
        keep_fraction: Share of the most confident samples to keep, in
            (0, 1]. The default 0.5 keeps the better half.

    Returns:
        Tuple ``(predictions, ind)``: the model's predictions for the kept
        rows, and a boolean numpy mask over the rows of ``x`` marking them.

    Raises:
        ValueError: If ``keep_fraction`` is outside (0, 1].
    """
    if not 0 < keep_fraction <= 1:
        raise ValueError("keep_fraction must be in the interval (0, 1]")

    proba = np.asarray(model.predict_proba(x))
    # Sort every row once; the last two columns are the two best scores.
    ranked = np.sort(proba, axis=1)
    margins = ranked[:, -1] - ranked[:, -2]

    # Clamp so that a tiny keep_fraction still selects the best sample.
    cut = min(int(len(margins) * (1 - keep_fraction)), len(margins) - 1)
    thresh = np.sort(margins)[cut]
    ind = margins >= thresh
    x_best = np.asarray(x)[ind]
    return model.predict(x_best), ind

In [22]:
# Hold out 10% of the rows for validation, stratified by the combined label;
# random_state fixes the split so it is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(matrix, x.target_var.values, test_size=.1,
                                                  stratify=x.target_var.values, random_state=42)

In [23]:
# Fit the pipeline on the training part only.
pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('selectpercentile',
                 SelectPercentile(percentile=73,
                                  score_func=<function chi2 at 0x7efe3f656510>)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.78, class_prior=None, fit_prior=True))],
         verbose=False)

In [24]:
# Predicted label for every validation row.
simple_predictions = pipeline.predict(x_val)

In [25]:
# Accuracy over the whole validation set.
accuracy_score(y_val, simple_predictions)

0.3193137797454344

In [26]:
# Predictions for the validation rows chosen by select_best_predictions, plus
# the boolean mask identifying those rows.
selected_predictions, selected_rows_ind = select_best_predictions(pipeline, x_val)

In [27]:
# Accuracy restricted to the selected validation rows.
accuracy_score(y_val[selected_rows_ind], selected_predictions)

0.37133370226895407

#### Теперь осталось только обучить pipeline на всех данных и сохранить его в файл

In [28]:
# Refit the same pipeline on all rows of x (training and validation together).
pipeline.fit(matrix, x.target_var.values)

Pipeline(memory=None,
         steps=[('selectpercentile',
                 SelectPercentile(percentile=73,
                                  score_func=<function chi2 at 0x7efe3f656510>)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.78, class_prior=None, fit_prior=True))],
         verbose=False)

In [29]:
# Serialize the fitted pipeline to the file "final_pipeline" with pickle.
# NOTE(review): pickle files must only be loaded from trusted sources, and
# unpickling presumably needs a compatible scikit-learn version — confirm
# against the consuming script.
with open("final_pipeline", 'wb') as f:
    pickle.dump(pipeline, f)