## Загрузка и предобработка

In [20]:
import gc
from importlib import reload

import numpy as np
import pandas as pd
import scipy as sp
import sklearn
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics as metrics
import sklearn.naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from textblob import TextBlob

import preprocessing

In [7]:
# Load the train/test tables and the sample submission from the dataset folder.
target_folder = 'Dataset'
train_data = pd.read_csv('{}/train.csv'.format(target_folder))
test_data = pd.read_csv('{}/test.csv'.format(target_folder))
submission_example = pd.read_csv('{}/sample_submission.csv'.format(target_folder))
# Bare last expression so the notebook renders the sample submission frame.
submission_example

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
5,0001ea8717f6de06,0.5,0.5,0.5,0.5,0.5,0.5
6,00024115d4cbde0f,0.5,0.5,0.5,0.5,0.5,0.5
7,000247e83dcc1211,0.5,0.5,0.5,0.5,0.5,0.5
8,00025358d4737918,0.5,0.5,0.5,0.5,0.5,0.5
9,00026d1092fe71cc,0.5,0.5,0.5,0.5,0.5,0.5


In [10]:
# Label columns; the validation and prediction loops iterate over this list.
targets = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [8]:
# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at the 10000 most frequent terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000
)
# Character-level TF-IDF over 2- to 6-character n-grams, capped at 50000
# features. `stop_words` is intentionally not passed here: scikit-learn only
# applies it when analyzer == 'word', so it had no effect on char n-grams.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000
)

In [9]:
# Fit both vectorizers on the training comments and stack the two sparse
# matrices column-wise (word features first, then char n-grams).
X_word = word_vectorizer.fit_transform(train_data['comment_text'])
X_char = char_vectorizer.fit_transform(train_data['comment_text'])
X = sp.sparse.hstack([X_word, X_char])
# Drop the references to the per-vectorizer matrices so the collector can
# reclaim them; only the stacked matrix X is used from here on.
X_word = None
X_char = None
gc.collect()

In [14]:
# Label matrix: every row of the training table, restricted to the target columns.
y = train_data.loc[:, targets]

## Валидация

In [15]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [18]:
# Validate one independent binary logistic regression per label on the
# hold-out split and report ROC AUC per label plus the mean over all labels.
validation_auc = {}
for target in targets:
    # random_state pins the data shuffling of the stochastic 'sag' solver so
    # repeated runs produce the same scores.
    clf = sklearn.linear_model.LogisticRegression(
        C=1., verbose=2, solver='sag', n_jobs=-1, random_state=42
    )
    # Alternative model tried earlier, kept for reference:
    #clf = sklearn.naive_bayes.MultinomialNB(alpha=0.98)
    clf.fit(X_train, y_train[target])
    # Second predict_proba column = probability of the positive class
    # (assumes the labels are 0/1 -- TODO confirm against the dataset).
    pred = clf.predict_proba(X_test)[:, 1]
    validation_auc[target] = sklearn.metrics.roc_auc_score(y_test[target], pred)
    # Label each score: the solver's verbose log is interleaved with these lines.
    print('{}: {}'.format(target, validation_auc[target]))
print('mean ROC AUC: {}'.format(np.mean(list(validation_auc.values()))))

convergence after 20 epochs took 78 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.3min finished


0.977475222886
convergence after 22 epochs took 86 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.4min finished


0.987803970818
convergence after 20 epochs took 77 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.3min finished


0.990246057884
convergence after 19 epochs took 74 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.2min finished


0.991503513817
convergence after 24 epochs took 94 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.6min finished


0.982478958984
convergence after 20 epochs took 78 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.3min finished


0.984657278593


In [None]:
# Release the validation split: rebind all four names to None in one
# statement, then ask the garbage collector to reclaim the memory.
X_train = X_test = y_train = y_test = None
gc.collect()

## Предсказание

In [21]:
# Vectorize the test comments with the already-fitted vectorizers (transform
# only, no refit) and stack them in the same column order as the training matrix.
X_word_test, X_char_test = (
    word_vectorizer.transform(test_data['comment_text']),
    char_vectorizer.transform(test_data['comment_text']),
)
X_test = sp.sparse.hstack([X_word_test, X_char_test])

In [22]:
# Refit one classifier per label on the full training matrix and write the
# predicted positive-class probabilities into the submission frame.
for target in targets:
    # random_state pins the data shuffling of the stochastic 'sag' solver so
    # the generated submission is reproducible.
    clf = sklearn.linear_model.LogisticRegression(
        C=1., verbose=2, solver='sag', n_jobs=-1, random_state=42
    )
    clf.fit(X, y[target])
    # Second predict_proba column = probability of the positive class
    # (assumes the labels are 0/1 -- TODO confirm against the dataset).
    pred = clf.predict_proba(X_test)[:, 1]
    # Overwrites the placeholder column of the sample submission in place.
    submission_example[target] = pred

convergence after 22 epochs took 129 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.2min finished


convergence after 23 epochs took 130 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.2min finished


convergence after 21 epochs took 116 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.9min finished


convergence after 19 epochs took 98 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.6min finished


convergence after 21 epochs took 106 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.8min finished


convergence after 21 epochs took 108 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.8min finished


In [25]:
# Display the submission frame after the per-target predictions were written into it.
submission_example

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999904,0.226171,0.999608,0.042632,0.978629,0.350139
1,0000247867823ef7,0.005605,0.002111,0.002864,0.000547,0.004175,0.002275
2,00013b17ad220c46,0.010256,0.002079,0.006295,0.000419,0.003253,0.001172
3,00017563c3f7919a,0.002869,0.001667,0.002227,0.000764,0.002707,0.000457
4,00017695ad8997eb,0.017332,0.001500,0.004869,0.000740,0.007127,0.001213
5,0001ea8717f6de06,0.005131,0.000853,0.002371,0.001016,0.007072,0.001076
6,00024115d4cbde0f,0.002997,0.000717,0.003438,0.000253,0.003335,0.000828
7,000247e83dcc1211,0.463167,0.002193,0.018412,0.001947,0.037457,0.002754
8,00025358d4737918,0.003843,0.001279,0.005830,0.000831,0.003090,0.001250
9,00026d1092fe71cc,0.003672,0.000816,0.002747,0.000522,0.004248,0.000754


In [26]:
# Release the test feature matrices: rebind all three names to None in one
# statement, then ask the garbage collector to reclaim the memory.
X_word_test = X_char_test = X_test = None
gc.collect()

1090

## Постобработка

In [27]:
# Cut-off used by the post-processing cell to select scores below it.
significance_level = 1e-2
# Complement of the cut-off (not referenced by the visible cells).
trust_level = 1.0 - significance_level

In [28]:
# Inspect which predicted scores fall below the significance level. The mask
# is restricted to the label columns: `id` holds strings, and comparing that
# column with a float raises TypeError. Entries at or above the cut-off are
# shown as NaN.
# Disabled thresholding experiment, kept for reference:
#submission_example[submission_example < significance_level] = 0
#submission_example[submission_example > significance_level] = 1
submission_example[targets].where(lambda scores: scores < significance_level)

NameError: name 'submission_example' is not defined

In [30]:
# Write the submission to disk without the positional index column.
submission_example.to_csv(
    path_or_buf='logreg_tfidf_cut_3.csv',
    index=False,
)