In [None]:
import numpy as np
import pandas as pd
import scipy
from collections import Counter
import re
import nltk
import math
from tqdm import tqdm
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l[K     |██████                          | 10 kB 23.5 MB/s eta 0:00:01[K     |███████████▉                    | 20 kB 29.7 MB/s eta 0:00:01[K     |█████████████████▊              | 30 kB 13.2 MB/s eta 0:00:01[K     |███████████████████████▋        | 40 kB 9.7 MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████████| 55 kB 2.2 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 10.6 MB/s 
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [None]:
# Load the labelled training set; later cells read its `url`, `title` and `target` columns.
data_train = pd.read_csv("train.csv")


In [None]:
# Row positions of training examples whose labels are treated as wrong;
# `replase_target` is called with this list to invert their `target` values.

dr = [1450, 3746, 4758, 5712, 10496, 12541,15300, 19390, 19801, 21141, 29584, 29763, 30037, 33568, 9628, 4903, 128933, 130223]

In [None]:
def replase_target(df, index):
    """Invert the boolean ``target`` label of the rows at the given positions.

    Args:
        df: DataFrame with a boolean ``target`` column. It is modified in place.
        index: Iterable of integer row positions (``iloc``-style) to flip.
            A position listed twice is flipped twice.

    Returns:
        The same ``df`` object, so the caller can reassign it.
    """
    # Resolve the column position once so every write is a single `.iloc`
    # call on the frame itself. The chained form `df.target.iloc[i] = ...`
    # writes through an intermediate Series, which raises
    # SettingWithCopyWarning and is not guaranteed to reach `df`.
    target_col = df.columns.get_loc("target")
    for ind in index:
        # `not bool(...)` is a logical negation for NumPy and Python bools
        # alike (`~True` on a Python bool evaluates to -2).
        df.iloc[ind, target_col] = not bool(df.iloc[ind, target_col])
    return df

In [None]:
# Regex for one token: a maximal run of Cyrillic (including Ё/ё) or Latin letters.
TOKEN_PATTERN = "[А-ЯЁа-яёA-Za-z]+"

In [None]:
def tokenize(text):
    """Lower-case ``text`` and return all ``TOKEN_PATTERN`` matches, in order."""
    lowered = text.lower()
    matcher = re.compile(TOKEN_PATTERN)
    return matcher.findall(lowered)

In [None]:
# Apply the manual label corrections listed in `dr`.
# NOTE(review): `replase_target` inverts labels in place, so re-running this
# cell flips the same rows back again.
data_train = replase_target(data_train, dr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
# Spot-check the raw title stored at row position 1752.
string = data_train.iloc[1752].title

In [None]:
string

'ほしのあき セクシー(o´・∀・｀o)ﾆｺｯ♪.mp4'

In [None]:
string[1]

'し'

## Обработка title

In [None]:
# Raw training titles as a NumPy array.
corpus = data_train.title.values

In [None]:
# Tokenise every title; `docs` is a list of token lists, one per row.
docs = [tokenize(title) for title in corpus]

In [None]:
# Extra site/boilerplate tokens removed in addition to NLTK's stop-word lists.
sw = {'ru', 'mail', 'mp', 'com', 'страница', 'org', 'gp', 'net'}
# Kept exactly as NLTK returns it (not converted to a set, despite the name).
stopword_set_ru = nltk.corpus.stopwords.words('russian')
stopword_set_eng = set(nltk.corpus.stopwords.words('english'))
# Combined stop-word set: English + Russian + the custom tokens above.
stopwords = stopword_set_eng | set(stopword_set_ru) | sw

In [None]:
# Morphological analyzer that `lemmatize` queries via `word_is_known` and `parse`.
lemmatizer = pymorphy2.MorphAnalyzer()

In [None]:
# Memo of token -> result so each distinct token reaches the analyzer only once.
lemmatizer_cache = {}

def lemmatize(token):
    """Return the normal form of ``token``, or ``token`` itself if it is unknown.

    The first parse returned by ``lemmatizer.parse`` is used for known words.
    Results (including the pass-through for unknown tokens) are memoised in
    ``lemmatizer_cache``, so repeated tokens skip both the dictionary lookup
    and the parse.

    Args:
        token: A single word, as produced by ``tokenize``.

    Returns:
        The lemma as a string, or the unchanged token when the analyzer does
        not know the word.
    """
    cached = lemmatizer_cache.get(token)
    if cached is None:
        if lemmatizer.word_is_known(token):
            cached = lemmatizer.parse(token)[0].normal_form
        else:
            cached = token
        lemmatizer_cache[token] = cached
    return cached

# Lemmatise every token of every title. `tqdm` is the progress-bar callable
# imported in the first cell; `tqdm_notebook` is never imported in this
# notebook, so the previous call raised NameError on a fresh kernel.
lemmatized_docs = [[lemmatize(token) for token in text] for text in tqdm(docs)]

# Stop words are filtered on the lemmatised tokens.
cleared_docs = [[token for token in text if token not in stopwords] for text in lemmatized_docs]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/135309 [00:00<?, ?it/s]

In [None]:
def conect(array):
    """Join the items of ``array`` into one space-separated string.

    Each item is converted with ``str``. Trailing whitespace is stripped from
    the result, so an empty input (or one ending in blank items) yields no
    dangling spaces.

    Args:
        array: Iterable of tokens (any objects convertible with ``str``).

    Returns:
        The joined string; ``""`` for an empty input.
    """
    # `str.join` builds the result in one pass instead of repeated `+=`.
    return ' '.join(str(item) for item in array).rstrip()

In [None]:
# One-column frame holding the cleaned token list of each training title.
df = pd.DataFrame({"title": cleared_docs})

In [None]:
# Join each token list back into a single string; `df` is rebound to a Series here.
df = df["title"].apply(conect)

In [None]:
# Overwrite the raw titles with the cleaned strings (pandas aligns the assignment on the index).
data_train.title = df

In [None]:
data_train.iloc[1752]

id               1752
url       videogg.com
title                
target          False
Name: 1752, dtype: object

In [None]:
# Count adjacent token pairs (bigrams) over all tokenised titles in `docs`.
cooccurence = Counter(
    pair
    for tokens in docs
    for pair in zip(tokens, tokens[1:])
)
cooccurence.most_common(20)

[(('mail', 'ru'), 6701),
 (('смотреть', 'онлайн'), 3820),
 (('поиск', 'mail'), 3502),
 (('результатов', 'поиск'), 2622),
 (('купить', 'в'), 2107),
 (('скачать', 'бесплатно'), 2012),
 (('онлайн', 'бесплатно'), 1957),
 (('читать', 'онлайн'), 1808),
 (('в', 'хорошем'), 1629),
 (('тыс', 'результатов'), 1611),
 (('порно', 'видео'), 1529),
 (('в', 'интернет'), 1529),
 (('в', 'москве'), 1489),
 (('интернет', 'магазине'), 1419),
 (('интернет', 'магазин'), 1377),
 (('хорошем', 'качестве'), 1361),
 (('бесплатно', 'в'), 1315),
 (('на', 'olx'), 1289),
 (('мой', 'мир'), 1123),
 (('мир', 'mail'), 1054)]

## Обработка URL

In [None]:
# Domain fragments stripped from URLs by `remove_stopwords`.
mystopwords = ['ru', 'com', 'org', 'net', 'www', 'ua', 'kz']


def remove_stopwords(text, mystopwords=mystopwords):
    """Delete every occurrence of each stop word from ``text``.

    Matching is literal and substring-based: a stop word is removed wherever
    it appears, including inside longer words. Stop words are applied in list
    order, each one to the result of the previous removal.

    The earlier implementation rebuilt the string from only the first two
    pieces of ``text.split(stopword)``, which silently dropped everything
    after a second occurrence, and treated each stop word as a regex.

    Args:
        text: The string to clean (for example a host name).
        mystopwords: Sequence of substrings to remove.

    Returns:
        ``text`` with all stop-word occurrences removed.
    """
    for stopword in mystopwords:
        text = text.replace(stopword, "")
    return text

def preprocessor(text):
    """Lower-case ``text`` and keep only the ASCII letters ``a``-``z``.

    Every other character (dots, digits, dashes, non-Latin letters, spaces)
    is removed and the remaining letters are concatenated, e.g.
    ``"www.Kommersant.ru"`` becomes ``"wwwkommersantru"``.

    Args:
        text: Input string, typically a host name.

    Returns:
        The lower-cased string with all non ``a``-``z`` characters removed.
    """
    # Single pass: the former "replace with a space, then delete all spaces"
    # sequence produced exactly this result.
    return re.sub("[^a-z]+", "", text.lower())

def drt(text):
    """Return ``preprocessor(text)``; this is the callable applied to the ``url`` column."""
    return preprocessor(text)


In [None]:
# Normalise every training URL with `drt` (overwrites the `url` column), then display the frame.
data_train.url = data_train.url.apply(drt)
data_train

Unnamed: 0,id,url,title,target
0,0,mkpmd,экс министр экономика молдова глава мидэи цель...,False
1,1,wwwkpby,песня стать известный многий телезритель благо...,False
2,2,fanserialstv,банши сезон серия бремя красота смотреть онлайн,False
3,3,colorboxspbru,бесить картинка,False
4,4,tulasportru,новомосковск сыграть следж хоккеист алексински...,False
...,...,...,...,...
135304,135304,mailru,пора тюльпан турецкий сериал русский язык резу...,False
135305,135305,wwwntvru,остросюжетный сериал шеф игра повышение серия,False
135306,135306,topclassiccarsforsalecom,plymouth special deluxe hot rod automatic smal...,False
135307,135307,wowcreamru,купить skin сыворотка питательный power formul...,False


## Обучение

In [None]:
# Keep an unsplit copy of the preprocessed frame; the stacking and bagging cells use it.
DF2 = data_train.copy()

# `data_train` is rebound to the 80% training part; `df_val` is the 20% hold-out.
# NOTE(review): re-running this cell splits the already reduced `data_train` again.
data_train, df_val = train_test_split(data_train, test_size=0.2, random_state=42)

In [None]:
# Labels of the full (unsplit) frame.
y = DF2["target"].astype(int).values
# Take the split labels from the split frames themselves, so they cannot drift
# from `data_train` / `df_val` (previously they came from a second
# `train_test_split` call that had to stay in lock-step with the first).
y_train = data_train["target"].astype(int).values
y_val = df_val["target"].astype(int).values

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Pipeline step that selects one column of a DataFrame as a NumPy array.

    ``BaseEstimator`` supplies ``get_params``/``set_params`` (derived from the
    ``__init__`` signature), so the step can be cloned, tuned and printed like
    any other scikit-learn estimator.

    Args:
        column_name: Name of the column to extract in ``transform``.
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the step is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column_name].values``."""
        return X[self.column_name].values

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Two feature branches (word TF-IDF on titles, character n-gram TF-IDF on URLs)
# feeding a soft-voting ensemble of a random forest and a logistic regression.
pipeline = Pipeline([
    (
        'features',
        FeatureUnion([
            (
                'title',
                Pipeline([
                    ('extractor', ColumnExtractor('title')),
                    (
                        'vectorizer',
                        # Raw string: '\w' in a plain literal is an invalid escape sequence.
                        # `stop_words` is passed as a list because recent scikit-learn
                        # releases validate this parameter and reject a set.
                        TfidfVectorizer(lowercase=True, ngram_range=(1, 1), token_pattern=r'[\w]+',
                            stop_words=sorted(stopwords), min_df=5, max_df=0.9)
                    )
                ])
            ),
            (
                'url',
                Pipeline([
                    ('extractor', ColumnExtractor('url')),
                    (
                        'vectorizer',
                        # No `stop_words` here: the option only applies when analyzer == 'word'.
                        TfidfVectorizer(lowercase=True, ngram_range=(2, 6),
                            min_df=5, max_df=0.9, analyzer='char')
                    )
                ])
            )
        ])
    ),
    ('clf', VotingClassifier(estimators = [('forest', RandomForestClassifier(n_estimators=100,
                                                                         min_samples_leaf=1,
                                                                         random_state=42,
                                                                         max_features='sqrt',
                                                                         n_jobs=-1)),
                                       ('logreg', LogisticRegression(random_state=42,
                                                                     solver="liblinear",
                                                                     class_weight='balanced',
                                                                     max_iter=1000, C=5.0)),
                                      ],
                         # Soft voting averages the predicted class probabilities.
                         voting='soft',
                         n_jobs=-1))
])

In [None]:
# Labels of the training split.
y = data_train["target"].astype(int).values
# The former bare `data_train.drop(...)` discarded its result. Bind it, so the
# label column really is excluded from what the pipeline is fitted on, while
# `data_train` itself stays intact for the cells that follow.
X_train = data_train.drop(columns=["target"])
pipeline.fit(X_train, y)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('title',
                                                 Pipeline(memory=None,
                                                          steps=[('extractor',
                                                                  <__main__.ColumnExtractor object at 0x7f58cd74b0d0>),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  enc

In [None]:
# F1 of the pipeline's predictions for `data_train` against `y` (in-sample score).
f1_score(
    y,
    pipeline.predict(data_train)
)

0.9989545216936749

In [None]:
# F1 of the pipeline's predictions for the hold-out frame `df_val` against `y_val`.
f1_score(
    y_val,
    pipeline.predict(df_val)
)

0.9846524225097804

#### Stacking

In [None]:
# Feature matrix for the full frame `DF2`, produced by the already fitted `features` step.
X = pipeline["features"].transform(DF2)

In [None]:
# Class probabilities from the fitted pipeline for every row of `DF2`.
# NOTE(review): `DF2` includes the rows the pipeline was fitted on, so most of
# these probabilities are in-sample.
proba = pipeline.predict_proba(DF2)

In [None]:
# Stacking input: append the base-model probabilities as extra columns of `X`.
X = scipy.sparse.hstack([X, proba])

In [None]:
# Rebind `y` to the labels of the full frame (it previously held the training-split labels).
y = DF2["target"].astype(int).values

In [None]:
# Refit the voting classifier on the stacked matrix (features + base probabilities).
# NOTE(review): this is the same estimator object that sits inside `pipeline`,
# so after this cell it is fitted on the widened matrix, while later
# `pipeline.predict_proba(...)` calls on raw frames hand it only the output of
# the `features` step — confirm the intended execution order.
pipeline["clf"].fit(X, y)

VotingClassifier(estimators=[('forest',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='sqrt',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
    

In [None]:
# F1 of the refitted classifier on the same stacked matrix it was fitted on (in-sample).
f1_score(
    y,
    pipeline["clf"].predict(X)
)

0.9991332675812189

#### Bagging

In [None]:
# Number of forests in the ensemble below.
n_estimators = 10
# Vote buffer: one row of integer predictions per forest, one column per row of `X`.
answer = np.zeros((n_estimators, X.shape[0]), dtype=int)

In [None]:
# Random-subspace ensemble: every forest is fitted on a random half of the
# columns and its predictions for the same rows are stored as one row of `answer`.
X_csc = X.tocsc()  # convert once; the column slicing below is done on the CSC form
rng = np.random.default_rng(42)  # seeded so the sampled feature subsets are reproducible
for i in tqdm(range(n_estimators)):
    # NOTE(review): every forest uses random_state=42, so the forests differ
    # only through their feature subset.
    model = RandomForestClassifier(n_estimators=100,
                                   min_samples_leaf=1,
                                   random_state=42,
                                   max_features='sqrt',
                                   n_jobs=-1)
    features = rng.choice(X_csc.shape[1], X_csc.shape[1] // 2, replace=False)
    X_subset = X_csc[:, features]
    model.fit(X_subset, y)
    # Predictions are made on the rows the forest was just fitted on (in-sample).
    answer[i] = model.predict(X_subset)

100%|██████████| 10/10 [20:27<00:00, 122.73s/it]


In [None]:
# Majority vote per sample: the most frequent label among the forests
# (`argmax` returns the first maximum, so a tie resolves to the smaller label).
rsm_dt_answer = [np.bincount(votes).argmax() for votes in answer.T]

In [None]:
# F1 of the majority-vote predictions against the full-frame labels `y`.
f1_score(
    y,
    rsm_dt_answer
)

0.9995515024667365

In [None]:
# Store the ensemble's vote next to the true label for error inspection.
DF2['predict'] = rsm_dt_answer

In [None]:
# Rows labelled True that the ensemble predicted as 0 (false negatives).
DF2[(DF2.target == True) & (DF2.predict == 0)]

Unnamed: 0,id,url,title,target,predict
16184,16184,mintimlv,intim lv sludin jumu serviss,True,0
18577,18577,archiveis,callie lembke girlsdopornidreal,True,0
21452,21452,spcsme,jpg,True,0
44664,44664,yandexru,,True,0
45053,45053,mailru,pornobeauty результат поиск,True,0
70135,70135,mailru,порнхуб млн результат поиск,True,0
76526,76526,mailru,nazeli hovanisyani pornon картинка поиск,True,0
79170,79170,barevhayeram,barevhayer,True,0
79572,79572,mintimlv,intim lv sludin jumu serviss,True,0
90671,90671,hypoblogorg,wetred,True,0


## Сабмит


In [None]:
# Load the unlabelled test set and display it.
data_test  = pd.read_csv("test.csv")
data_test

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67
...,...,...,...
165373,300682,etp.armtek.ru,Armtek - запчасти для грузовых и легковых авто...
165374,300683,mail.ru,"Лилия Якупова - Караганда, Карагандинская обла..."
165375,300684,xn----8sbnqchpeeeth.xn--p1ai,Администрация Лесного района Тверской области ...
165376,300685,www-sunhome-ru.cdn.ampproject.org,Сонник Изменение сознания. К чему снится Измен...


In [None]:
# Raw test titles; this rebinds `corpus`, which previously held the training titles.
corpus = data_test.title.values

In [None]:
# Same title preprocessing as for the training set: tokenise, lemmatise, drop stop words.
docs = [tokenize(title) for title in corpus]
# `tqdm` is the progress-bar callable imported in the first cell; `tqdm_notebook`
# is never imported in this notebook, so the previous call raised NameError on
# a fresh kernel.
lemmatized_docs = [[lemmatize(token) for token in text] for text in tqdm(docs)]
cleared_docs = [[token for token in text if token not in stopwords] for text in lemmatized_docs]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/165378 [00:00<?, ?it/s]

In [None]:
def conect(array):
    """Join the items of ``array`` into one space-separated string.

    This redefines the identically named helper from the title-processing
    section with the same behaviour. Each item is converted with ``str``, and
    trailing whitespace is stripped from the result.

    Args:
        array: Iterable of tokens (any objects convertible with ``str``).

    Returns:
        The joined string; ``""`` for an empty input.
    """
    # `str.join` builds the result in one pass instead of repeated `+=`.
    return ' '.join(str(item) for item in array).rstrip()

In [None]:
# Rebuild the cleaned test titles: token lists -> joined strings -> `title` column.
df = pd.DataFrame({"title": cleared_docs})
# `df` is rebound to a Series of joined strings here.
df = df["title"].apply(conect)
data_test.title = df
data_test

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,шестой кассационный суд самара начать работа р...
1,135310,urexpert.online,индексация алименты случай производиться каков...
2,135311,imperimeha.ru,женщина империя мех part
3,135312,national-porn.com,небритый волосатый киска порно весь страна нац...
4,135313,2gis.ru,
...,...,...,...
165373,300682,etp.armtek.ru,armtek запчасть грузовой легковой автомобиль о...
165374,300683,mail.ru,лилия якупова караганда карагандинский область...
165375,300684,xn----8sbnqchpeeeth.xn--p1ai,администрация лесной район тверской область го...
165376,300685,www-sunhome-ru.cdn.ampproject.org,сонник изменение сознание сниться изменение со...


In [None]:
# Normalise the test URLs with the same `drt` function used for the training URLs.
data_test.url = data_test.url.apply(drt)
data_test

Unnamed: 0,id,url,title
0,135309,wwwkommersantru,шестой кассационный суд самара начать работа р...
1,135310,urexpertonline,индексация алименты случай производиться каков...
2,135311,imperimeharu,женщина империя мех part
3,135312,nationalporncom,небритый волосатый киска порно весь страна нац...
4,135313,gisru,
...,...,...,...
165373,300682,etparmtekru,armtek запчасть грузовой легковой автомобиль о...
165374,300683,mailru,лилия якупова караганда карагандинский область...
165375,300684,xnsbnqchpeeethxnpai,администрация лесной район тверской область го...
165376,300685,wwwsunhomerucdnampprojectorg,сонник изменение сознание сниться изменение со...


In [None]:
# Test feature matrix from the already fitted `features` step (no refitting on test data).
X_TEST = pipeline["features"].transform(data_test)

In [None]:
X_TEST

<165378x95176 sparse matrix of type '<class 'numpy.float64'>'
	with 7184628 stored elements in Compressed Sparse Row format>

In [None]:
# Base-model class probabilities for the test rows.
# NOTE(review): the Stacking section refits `pipeline["clf"]` on a matrix that
# has extra probability columns; if that cell has run, the classifier here
# receives fewer columns than it was fitted on — verify this cell on
# Restart & Run All.
test_proba = pipeline.predict_proba(data_test)

In [None]:
# Append the probabilities as extra columns, mirroring the stacked training matrix `X`; stored as CSR.
X_TEST = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_TEST, test_proba]))

In [None]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Number of forests in the test-time ensemble.
n_estimators = 10
# Vote buffer: one row of integer predictions per forest, one column per test row.
answer = np.zeros((n_estimators, X_TEST.shape[0]), dtype=int)

In [None]:
# Random-subspace ensemble for the submission: every forest is fitted on a
# random half of the columns of the stacked training matrix `X` and predicts
# the same columns of `X_TEST`.
X_csc = X.tocsc()            # convert once; column slicing is done on the CSC form
X_TEST_csc = X_TEST.tocsc()
rng = np.random.default_rng(42)  # seeded so the sampled feature subsets are reproducible
for i in tqdm(range(n_estimators)):
    # n_jobs=-1 matches the training-time loop; it does not change the fitted model.
    model = RandomForestClassifier(n_estimators=100, random_state=42, max_features="sqrt", n_jobs=-1)
    features = rng.choice(X_csc.shape[1], X_csc.shape[1] // 2, replace=False)
    model.fit(X_csc[:, features], y)
    answer[i] = model.predict(X_TEST_csc[:, features])

100%|██████████| 10/10 [23:46<00:00, 142.67s/it]


In [None]:
# Majority vote per test row, collected into a NumPy array
# (`argmax` returns the first maximum, so a tie resolves to the smaller label).
rsm_dt_answer = np.array([np.bincount(votes).argmax() for votes in answer.T])

In [None]:
# Empty frame that the next cell fills with the submission columns.
test_df = pd.DataFrame()

In [None]:
# Submission columns: the test ids and the voted label converted to bool.
test_df["id"] = data_test["id"]
test_df["target"] = rsm_dt_answer.astype(bool)


In [None]:
test_df

Unnamed: 0,id,target
0,135309,False
1,135310,False
2,135311,False
3,135312,True
4,135313,False
...,...,...
165373,300682,False
165374,300683,False
165375,300684,False
165376,300685,False


In [None]:
# Write the submission file without the DataFrame index column.
test_df.to_csv("submit_3.csv", index=False)