In [19]:
image_queries = None
with open("./data/queries/image_queries.txt") as f:
    image_queries = f.read().splitlines()

In [20]:
len(image_queries)

18000

In [21]:
web_queries = None
with open("./data/queries/web_queries.txt") as f:
    web_queries = f.read().splitlines()

In [22]:
len(web_queries)

100000

In [23]:
from collections import Counter

In [24]:
image_counter = Counter(" ".join(image_queries).split())

In [25]:
image_counter.most_common(10)

[('фото', 1904),
 ('для', 1250),
 ('в', 1228),
 ('на', 1225),
 ('картинки', 1075),
 ('с', 790),
 ('и', 586),
 ('.us', 569),
 ('из', 554),
 ('как', 486)]

In [26]:
web_counter = Counter(" ".join(web_queries).split())

In [27]:
web_counter.most_common(10)

[('в', 9577),
 ('на', 6910),
 ('и', 5783),
 ('смотреть', 5060),
 ('онлайн', 4852),
 ('для', 3786),
 ('2016', 3109),
 ('бесплатно', 2932),
 ('2015', 2917),
 ('с', 2739)]

In [41]:
def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

In [51]:
def to_ngrams(query, n=1):
    uni = query.split()
    bi = [' '.join(t) for t in list(zip(uni, uni[1:]))]
    return uni + bi

In [54]:
from itertools import chain

# IMAGES

In [122]:
from stop_words import get_stop_words

In [136]:
def build_counter(queries):
    _chain = list(chain.from_iterable([to_ngrams(q) for q in queries]))

    chain_cleaned = [w for w in _chain if w not in get_stop_words('ru')]

    counter = Counter(chain_cleaned)
    return image_counter

In [70]:
image_chain = list(chain.from_iterable([to_ngrams(q) for q in image_queries]))

image_chain_cleaned = [w for w in image_chain if w not in get_stop_words('ru')]

image_counter = Counter(image_chain_cleaned)

image_counter.most_common(10)[:10]

[('фото', 1904),
 ('картинки', 1075),
 ('.us', 569),
 ('site:', 295),
 ('site: .us', 295),
 ('рисунки', 262),
 ('руками', 255),
 ('своими', 255),
 ('своими руками', 247),
 ('рисунок', 224)]

# WEB

In [82]:
web_chain = list(chain.from_iterable([to_ngrams(q) for q in web_queries]))

In [83]:
web_chain_cleaned = [w for w in web_chain if w not in get_stop_words('ru')]

In [84]:
web_counter = Counter(web_chain_cleaned)

In [85]:
web_counter.most_common(10)[:10]

[('смотреть', 5060),
 ('онлайн', 4852),
 ('2016', 3109),
 ('бесплатно', 2932),
 ('2015', 2917),
 ('сайт', 2586),
 ('цены', 2413),
 ('официальный', 2374),
 ('официальный сайт', 2356),
 ('каталог', 2295)]

# Naive bayes

In [88]:
import numpy as np

In [98]:
image_model = create_model(image_counter)
web_model = create_model(web_counter)

In [100]:
len(list(image_model.keys()))

54366

# Create Data set

In [102]:
import pandas as pd

18000

In [164]:
images_df = pd.DataFrame(list(zip(image_queries, [1]*len(image_queries))),
                         columns=("query","target"))

In [165]:
web_df = pd.DataFrame(list(zip(web_queries, [0]*len(web_queries))),
                         columns=("query","target"))

In [166]:
df = pd.concat([images_df, web_df])

In [167]:
df.shape

(118000, 2)

In [258]:
def to_ngrams(query, n=3):
    uni = query.split()
    bi = [' '.join(t) for t in list(zip(uni, uni[1:]))]
    return uni + bi

In [259]:
def build_counter(queries):
    _chain = list(chain.from_iterable([to_ngrams(q) for q in queries]))

    chain_cleaned = [w for w in _chain if w not in get_stop_words('ru')]

    counter = Counter(chain_cleaned)
    return counter

In [260]:
def create_model(counter):
    words, counts = zip(*counter.most_common())
    counts = np.array(counts)
    prob = counts / sum(counts)
    prob_table = {word:prob for word, prob in zip(words, prob)}
    
    return prob_table
    

In [261]:
from sklearn.cross_validation import train_test_split

In [262]:
X_train, X_test, y_train, y_test = train_test_split(df["query"], df["target"], test_size=0.33)

In [263]:
X_train_pos = X_train[y_train == 1]
X_train_neg = X_train[y_train == 0]

In [264]:
len(X_train.tolist())

79060

In [265]:
pos_model = create_model(build_counter(df[df["target"] == 1]["query"].tolist()))

In [266]:
neg_model = create_model(build_counter(df[df["target"] == 0]["query"].tolist()))

In [267]:
pos_counter = build_counter(df[df["target"] == 1]["query"].tolist())

In [268]:
neg_counter = build_counter(df[df["target"] == 0]["query"].tolist())

In [269]:
# build_counter(X_train_neg.tolist())

In [270]:
neg_counter = build_counter(X_train_neg.tolist())

In [271]:
def predict(query):
    try:
        pos = sum([pos_model[w] for w in list(build_counter([query]).keys())])
    except KeyError:
        pos = 0
    try:
        neg = sum([neg_model[w] for w in list(build_counter([query]).keys())])
    except KeyError:
        neg = 0
    return int(pos > neg)

In [272]:
y_pred = [predict(s) for s in X_test.tolist()]

In [273]:
from sklearn.metrics import f1_score

In [274]:
f1_score(y_test, y_pred)

0.64302884615384615

In [275]:
neg_counter.most_common(10)

[('смотреть', 3343),
 ('онлайн', 3193),
 ('2016', 2047),
 ('бесплатно', 1923),
 ('2015', 1910),
 ('сайт', 1688),
 ('цены', 1577),
 ('официальный', 1557),
 ('официальный сайт', 1545),
 ('скачать', 1516)]

In [276]:
pos_counter.most_common(10)

[('фото', 1904),
 ('картинки', 1075),
 ('.us', 569),
 ('site:', 295),
 ('site: .us', 295),
 ('рисунки', 262),
 ('руками', 255),
 ('своими', 255),
 ('своими руками', 247),
 ('рисунок', 224)]