In [599]:
from sklearn.decomposition import TruncatedSVD

In [600]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [601]:
import re
import pandas as pd
import gensim
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from pymystem3 import Mystem
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

In [602]:
for_final_table = []

# Описание

## 1 - cat, 0 - molecule

In [603]:
cat = open('Cat.txt').read().split('\n')

In [604]:
molecule = open('Molecule.txt').read().split('\n')

In [605]:
cat_clear = []
molecule_clear = []

In [606]:
for text in cat:
    text_new = re.sub('\[.*?\] *', '', text)
#     text_new = re.sub('(кош(е)?к[а-я]?)', 'word', text_new)
    cat_clear.append(text_new)

for text in molecule:
    text_new = re.sub('\[.*?\] *', '', text)
#     text_new = re.sub('(молекул[а-я]*)', 'word', text_new)
    molecule_clear.append(text_new)

In [607]:
df = pd.DataFrame(cat_clear[1:], columns=['text'])

In [608]:
df['class'] = 1

In [609]:
df_2 = pd.DataFrame(molecule_clear[1:], columns=['text'])

In [610]:
df_2['class'] = 0

In [611]:
df = df.append(df_2, ignore_index=True)

## W2V

In [612]:
m = Mystem()

In [613]:
sents = []
for sent in df.text:
    arr = []
    words = word_tokenize(sent)
    for word in words:
        lemma = m.lemmatize(word)[0]
        if lemma == 'кошка' or lemma == 'молекула':
            arr.append('XXX')
        else:
            arr.append(lemma)
    sents.append(arr)

In [614]:
model = gensim.models.Word2Vec(sents, size=300, window=2, min_count=5, workers=4)

In [615]:
df_vecs = []

In [616]:
for sent in sents:
    vec = []
    for word in sent:
        try:
            vec.append(model[word])
        except:
            pass
    
    
    df_vecs.append(sum(vec)/len(vec))

In [617]:
X_train, X_test, y_train, y_test = train_test_split(df_vecs, df['class'], test_size=0.33, random_state=42)

In [618]:
w2v_gb = GradientBoostingClassifier(n_estimators=250, random_state=2)
w2v_rf = RandomForestClassifier(random_state=2)
w2v_ada = AdaBoostClassifier(random_state=2)

In [619]:
for cl, name in zip([w2v_gb,w2v_rf, w2v_ada], ['w2v_gb','w2v_rf','w2v_ada']):
    cl.fit(X_tr, y_train)
    y_pred = cl.predict(X_ts)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(name+' '+str(score))

0.839662447257
0.731141199226
0.839400428266


### TF_IDF

In [620]:
vectorizer = TfidfVectorizer()

In [621]:
sents_join = [' '.join(sent) for sent in sents]

In [622]:
X_train, X_test, y_train, y_test = train_test_split(sents_join, df['class'], test_size=0.33, random_state=42)

In [623]:
tf_gb = GradientBoostingClassifier(n_estimators=250, random_state=2)
tf_rf = RandomForestClassifier(random_state=2)
tf_ada = AdaBoostClassifier(random_state=2)

In [624]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [648]:
for cl, name in zip([tf_gb, tf_rf, tf_ada], ['tf_gb','tf_rf','tf_ada']):
    cl.fit(X_train, y_train)
    y_pred = cl.predict(X_test)
    score = f1_score(y_test, y_pred)
    print(score)
    print(classification_report(y_test, y_pred))
    for_final_table.append(name+' '+str(score))

0.854251012146
             precision    recall  f1-score   support

          0       0.93      0.75      0.83       238
          1       0.78      0.94      0.85       224

avg / total       0.86      0.84      0.84       462

0.800821355236
             precision    recall  f1-score   support

          0       0.85      0.71      0.78       238
          1       0.74      0.87      0.80       224

avg / total       0.80      0.79      0.79       462

0.800821355236
             precision    recall  f1-score   support

          0       0.85      0.71      0.78       238
          1       0.74      0.87      0.80       224

avg / total       0.80      0.79      0.79       462



In [657]:
# tf_gb.fit(X_train, y_train)

In [654]:
five_examples_cat = ['Моя кошка прибежала ко мне и начала просить с ней поиграть', 'За кошкой нужно хорошо ухаживать', 'Все кошки хорошо видят в темноте','Я вчера подобрал на улице бездомную кошечку', 'Кошки и собаки не дружат']

In [655]:
five_examples_molecule = ['Молекула - это наименьшая частица вещества, определяющая его свойства и способная к самостоятельному существованию.','Вещества состоят из множества молекул', 'В школе мы проходили, как устроены разные молекулы', ' Атомы и молекулы чрезвычайно малы', 'К числу важных классов биологических молекул относятся белки, углеводы, липиды и нуклеиновые кислоты']

In [656]:
for s in five_examples_molecule:
    vec = vectorizer.transform([s])
    print(s)
    print(('prediction:{}').format(tf_gb.predict(vec)))

Молекула - это наименьшая частица вещества, определяющая его свойства и способная к самостоятельному существованию.
prediction:[1]
Вещества состоят из множества молекул
prediction:[1]
В школе мы проходили, как устроены разные молекулы
prediction:[1]
 Атомы и молекулы чрезвычайно малы
prediction:[1]
К числу важных классов биологических молекул относятся белки, углеводы, липиды и нуклеиновые кислоты
prediction:[1]


## Уменьшение размерности tf-idf с PCA

In [587]:
pca = TruncatedSVD(n_components=1000)

In [588]:
X_tr = pca.fit_transform(X_train)

In [589]:
X_ts = pca.transform(X_test)

In [590]:
pca_gb = GradientBoostingClassifier(n_estimators=250, random_state=24)
pca_rf = RandomForestClassifier(random_state=24)
pca_ada = AdaBoostClassifier(random_state=24)

In [591]:
for cl, name in zip([pca_gb,pca_rf, pca_ada], ['pca_tf_gb','pca_tf_rf','pca_tf_ada']):
    cl.fit(X_tr, y_train)
    y_pred = cl.predict(X_ts)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(name+' '+str(score))

0.839662447257
0.790890269151
0.839400428266


## Уменьшение размерности w2v до 150

In [592]:
model_2 = gensim.models.Word2Vec(sents, size=150, window=2, min_count=5, workers=4)

In [593]:
df_vecs_2 = []
for sent in sents:
    vec = []
    for word in sent:
        try:
            vec.append(model_2[word])
        except:
            pass
    
    
    df_vecs_2.append(sum(vec)/len(vec))

X_train, X_test, y_train, y_test = train_test_split(df_vecs_2, df['class'], test_size=0.33, random_state=42)

In [594]:
smallw2v_gb = GradientBoostingClassifier(n_estimators=250, random_state=24)
smallw2v_rf = RandomForestClassifier(random_state=24)
smallw2v_ada = AdaBoostClassifier(random_state=24)

In [595]:
for cl, name in zip([smallw2v_gb,smallw2v_rf, smallw2v_ada], ['smallw2v_gb','smallw2v_rf','smallw2v_ada']):
    cl.fit(X_train, y_train)
    y_pred = cl.predict(X_test)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(name+' '+str(score))

0.714606741573
0.635071090047
0.646924829157


In [596]:
for item in for_final_table:
    print(item)
    print('______________')

w2v_gb 0.839662447257
______________
w2v_rf 0.731141199226
______________
w2v_ada 0.839400428266
______________
tf_gb 0.854251012146
______________
tf_rf 0.800821355236
______________
tf_ada 0.800821355236
______________
pca_tf_gb 0.839662447257
______________
pca_tf_rf 0.790890269151
______________
pca_tf_ada 0.839400428266
______________
smallw2v_gb 0.714606741573
______________
smallw2v_rf 0.635071090047
______________
smallw2v_ada 0.646924829157
______________
