In [1]:
from sklearn.decomposition import TruncatedSVD

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import re

import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# import xgboost as xgb

In [4]:
from stop_words import get_stop_words
# Russian stop words used to filter lemmas before vectorization.
# Stored as a frozenset because the notebook only ever tests membership
# (`lemma not in stop_words_421`) inside per-token loops; a hash lookup is O(1)
# whereas the list returned by get_stop_words is scanned linearly each time.
stop_words_421 = frozenset(get_stop_words('russian'))

In [96]:
# Accumulates "<model name> <F1 score>" strings; each evaluation cell appends to it
# and the last cell prints the collected table.
for_final_table = []

# Описание и результаты

# Код

## 1 - cat, 0 - molecule

In [97]:
# Load the "cat" corpus and split it into one entry per line.
# The context manager guarantees the file handle is closed (the bare open().read()
# left that to the garbage collector).
# NOTE(review): the file is decoded with the platform default encoding — pass
# encoding=... explicitly once the encoding of Cat.txt is confirmed.
with open('Cat.txt') as cat_file:
    cat = cat_file.read().split('\n')

In [98]:
# Load the "molecule" corpus and split it into one entry per line.
# The context manager guarantees the file handle is closed (the bare open().read()
# left that to the garbage collector).
# NOTE(review): the file is decoded with the platform default encoding — pass
# encoding=... explicitly once the encoding of Molecule.txt is confirmed.
with open('Molecule.txt') as molecule_file:
    molecule = molecule_file.read().split('\n')

In [99]:
# Two independent, initially empty containers for the cleaned lines of each corpus.
cat_clear, molecule_clear = [], []

In [100]:
def _strip_markup(lines):
    """Return cleaned copies of ``lines``, dropping entries that contain no letters.

    For every entry, each ``[...]`` span (matched non-greedily, together with any
    spaces that follow it) is removed and newline characters are deleted.
    Entries that end up without a single alphabetic character are skipped.

    Parameters
    ----------
    lines : iterable of str
        Raw corpus lines.

    Returns
    -------
    list of str
        Cleaned, non-empty lines in their original order.
    """
    # Raw string: '\[' and '\]' are regex escapes, not (invalid) Python string escapes.
    bracket_span = re.compile(r'\[.*?\] *')
    cleaned = []
    for text in lines:
        text_new = bracket_span.sub('', text).replace('\n', '')
        if any(ch.isalpha() for ch in text_new):
            cleaned.append(text_new)
    return cleaned


# The target words themselves are not masked here; that happens later, at the
# lemmatization stage. Rebinding the result lists (instead of appending to
# existing ones) keeps this cell idempotent when it is re-run.
cat_clear = _strip_markup(cat)
molecule_clear = _strip_markup(molecule)

In [101]:
# Build the "cat" frame with a single 'text' column; the first cleaned entry is skipped
# (presumably a title/header line — TODO confirm against Cat.txt).
df = pd.DataFrame({'text': cat_clear[1:]})

In [102]:
# Label every row of the cat frame with class 1 (1 = cat, 0 = molecule).
df['class'] = 1

In [103]:
# Build the "molecule" frame with a single 'text' column; the first cleaned entry is skipped
# (presumably a title/header line — TODO confirm against Molecule.txt).
df_2 = pd.DataFrame({'text': molecule_clear[1:]})

In [104]:
# Label every row of the molecule frame with class 0 (1 = cat, 0 = molecule).
df_2['class'] = 0

In [105]:
# Stack the cat rows and the molecule rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# NOTE: this rebinds `df`, so re-running only this cell appends df_2 a second time;
# re-run from the cell that creates `df` instead.
df = pd.concat([df, df_2], ignore_index=True)

## W2V

In [106]:
# pymystem3 lemmatizer instance, reused for the training tokens and for the demo sentences.
m = Mystem()

In [107]:
# Tokenize and lemmatize every text in df, producing one token list per row.
#  * the lemmas 'кошка' / 'молекула' are replaced by the placeholder 'XXX'
#    so the target word itself cannot be used as a feature;
#  * lemmas that contain a newline, are stop words, or have no alphabetic
#    character are dropped.
sents = []
kept_rows = []  # positional indices of df rows that produced at least one token
for row_idx, sent in enumerate(df.text):
    arr = []
    for word in word_tokenize(sent):
        lemma = m.lemmatize(word)[0]
        if lemma in ('кошка', 'молекула'):
            arr.append('XXX')
        elif ('\n' not in lemma
              and lemma not in stop_words_421
              and any(ch.isalpha() for ch in lemma)):
            arr.append(lemma)
    if arr:
        sents.append(arr)
        kept_rows.append(row_idx)
    else:
        # Report texts that yielded no usable token.
        print(":{}:".format(sent))

# Rows without tokens are absent from `sents`; drop them from df as well so that
# df['class'] stays aligned one-to-one with `sents` in the later train/test splits.
# When nothing was dropped this leaves df unchanged.
df = df.iloc[kept_rows].reset_index(drop=True)

In [108]:
# sents[0]

In [109]:
# Train 300-dimensional word2vec embeddings on the lemmatized sentences;
# words seen fewer than 5 times are left out of the vocabulary (min_count=5).
# NOTE(review): gensim >= 4 reportedly renames `size` to `vector_size` — confirm
# against the installed version before upgrading.
model = gensim.models.Word2Vec(
    sents,
    size=300,
    window=5,
    min_count=5,
    workers=4,
)

In [110]:
# Will hold one averaged word2vec vector per sentence (filled by the next cell).
df_vecs = []

In [111]:
# Represent every sentence as the mean of the word2vec vectors of its in-vocabulary words.
# Starting from a fresh list keeps the cell idempotent when it is re-run.
df_vecs = []
for sent in sents:
    # Explicit vocabulary check instead of a bare `except:` around the lookup;
    # `model.wv[...]` is the supported accessor (indexing the model directly is deprecated).
    word_vecs = [model.wv[word] for word in sent if word in model.wv]
    if word_vecs:
        df_vecs.append(sum(word_vecs) / len(word_vecs))
    else:
        # No word of this sentence reached min_count. Report it, but still emit a
        # (zero) vector: skipping the sentence would shift every later feature row
        # relative to df['class'].
        print(sent)
        df_vecs.append(np.zeros(model.wv.vector_size, dtype=np.float32))

In [112]:
# Hold out 33% of the averaged word2vec vectors for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    df_vecs,
    df['class'],
    test_size=0.33,
    random_state=42,
)

In [113]:
# Three ensemble classifiers for the word2vec features, all seeded with random_state=2.
w2v_gb, w2v_rf, w2v_ada = (
    GradientBoostingClassifier(n_estimators=250, random_state=2),
    RandomForestClassifier(random_state=2),
    AdaBoostClassifier(random_state=2),
)

In [114]:
# Fit each word2vec classifier, print its F1 on the held-out split and record it for the final table.
for name, cl in [('w2v_gb', w2v_gb), ('w2v_rf', w2v_rf), ('w2v_ada', w2v_ada)]:
    y_pred = cl.fit(X_train, y_train).predict(X_test)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(' '.join((name, str(score))))

0.632352941176
0.55525606469
0.642685851319


## TF-IDF

In [115]:
# TF-IDF vectorizer with scikit-learn's default settings; fitted on the training texts below.
vectorizer = TfidfVectorizer()

In [116]:
# Turn each token list back into a space-separated string for the TF-IDF vectorizer,
# leaving out the 'XXX' placeholder and any stop word.
sents_join = []
for sent in sents:
    kept_tokens = [x for x in sent if not (x == 'XXX' or x in stop_words_421)]
    sents_join.append(' '.join(kept_tokens))

In [118]:
# sents_join[0]

In [119]:
# Same 67/33 split and seed as for the word2vec features, applied to the joined texts.
# At this point X_train and X_test_raw still hold raw strings.
X_train, X_test_raw, y_train, y_test = train_test_split(
    sents_join,
    df['class'],
    test_size=0.33,
    random_state=42,
)

In [120]:
# Three ensemble classifiers for the TF-IDF features, all seeded with random_state=2.
tf_gb, tf_rf, tf_ada = (
    GradientBoostingClassifier(n_estimators=250, random_state=2),
    RandomForestClassifier(random_state=2),
    AdaBoostClassifier(random_state=2),
)

In [121]:
# Learn the TF-IDF vocabulary/weights on the training texts only, then apply the
# fitted vectorizer to the held-out texts.
# NOTE: X_train is rebound from raw strings to the TF-IDF matrix, so this cell cannot
# be re-run on its own — re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test_raw)

In [122]:
# Fit each TF-IDF classifier, print its F1 on the held-out split and record it for the final table.
for name, cl in [('tf_gb', tf_gb), ('tf_rf', tf_rf), ('tf_ada', tf_ada)]:
    y_pred = cl.fit(X_train, y_train).predict(X_test)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(' '.join((name, str(score))))

0.855421686747
0.886554621849
0.847107438017


In [123]:
# Five hand-written sentences about cats, used for a manual sanity check of the classifier.
five_examples_cat = [
    'Моя кошка прибежала ко мне и начала просить с ней поиграть',
    'За кошкой нужно хорошо ухаживать',
    'Все кошки хорошо видят в темноте',
    'Я вчера подобрал на улице бездомную кошечку',
    'Кошки и собаки не дружат',
]

In [124]:
# Five hand-written sentences about molecules (counterpart of five_examples_cat).
five_examples_molecule = [
    'Молекула - это наименьшая частица вещества, определяющая его свойства и способная к самостоятельному существованию.',
    'Вещества состоят из множества молекул',
    'В школе мы проходили, как устроены разные молекулы',
    ' Атомы и молекулы чрезвычайно малы',
    'К числу важных классов биологических молекул относятся белки, углеводы, липиды и нуклеиновые кислоты',
]

## Уменьшение размерности tf-idf с помощью TruncatedSVD (LSA)

In [125]:
# Reduce the sparse TF-IDF matrix to 1000 components with truncated SVD.
# (The variable is called `pca`, but TruncatedSVD is used.)
# TruncatedSVD's default solver is randomized, so it is seeded like every other
# estimator in this notebook; without random_state the scores below change on
# each run.
pca = TruncatedSVD(n_components=1000, random_state=24)

In [126]:
# Fit the SVD on the training TF-IDF matrix only and project the training rows.
X_tr = pca.fit_transform(X_train)

In [127]:
# Project the held-out TF-IDF rows with the already-fitted SVD (no refit on test data).
X_ts = pca.transform(X_test)

In [128]:
# Three ensemble classifiers for the SVD-reduced TF-IDF features, all seeded with random_state=24.
pca_gb, pca_rf, pca_ada = (
    GradientBoostingClassifier(n_estimators=250, random_state=24),
    RandomForestClassifier(random_state=24),
    AdaBoostClassifier(random_state=24),
)

In [129]:
# Fit each classifier on the reduced TF-IDF features, print its F1 on the held-out split
# and record it for the final table.
for name, cl in [('pca_tf_gb', pca_gb), ('pca_tf_rf', pca_rf), ('pca_tf_ada', pca_ada)]:
    y_pred = cl.fit(X_tr, y_train).predict(X_ts)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(' '.join((name, str(score))))

0.915766738661
0.781038374718
0.91914893617


In [136]:
# my = ['процесс', 'научный изучение', 'концентрация взрыв кристалл превращаться процесс', 'низкий температура упорядоченный процесс приводить к неожиданный научный вывод']
# Manual sanity check: lemmatize each cat example, push it through the fitted
# TF-IDF -> SVD pipeline and print the AdaBoost prediction (1 = cat, 0 = molecule).
for s in five_examples_cat:
    s = ''.join(m.lemmatize(s))
    vec = pca.transform(vectorizer.transform([s]))
    print(s)
    print('prediction:{}'.format(pca_ada.predict(vec)))

мой кошка прибегать ко я и начинать просить с она поиграть

prediction:[1]
за кошка нужно хорошо ухаживать

prediction:[1]
весь кошка хорошо видеть в темнота

prediction:[1]
я вчера подбирать на улица бездомный кошечка

prediction:[1]
кошка и собака не дружить

prediction:[1]


## Уменьшение размерности w2v до 150 (окно контекста уменьшено с 5 до 2)

In [131]:
# Second word2vec model: 150 dimensions and a context window of 2; other settings as before.
# NOTE(review): gensim >= 4 reportedly renames `size` to `vector_size` — confirm
# against the installed version before upgrading.
model_2 = gensim.models.Word2Vec(
    sents,
    size=150,
    window=2,
    min_count=5,
    workers=4,
)

In [132]:
# Represent every sentence as the mean of the 150-dimensional vectors of its in-vocabulary words.
df_vecs_2 = []
for sent in sents:
    # Explicit vocabulary check instead of a bare `except:` around the lookup;
    # `model_2.wv[...]` is the supported accessor (indexing the model directly is deprecated).
    word_vecs = [model_2.wv[word] for word in sent if word in model_2.wv]
    if word_vecs:
        df_vecs_2.append(sum(word_vecs) / len(word_vecs))
    else:
        # No word of this sentence reached min_count: the original sum([])/len([])
        # raised ZeroDivisionError here. A zero vector keeps the row (and therefore
        # the alignment with df['class']) without crashing.
        df_vecs_2.append(np.zeros(model_2.wv.vector_size, dtype=np.float32))

# Same 67/33 split and seed as in the other experiments.
X_train, X_test, y_train, y_test = train_test_split(df_vecs_2, df['class'], test_size=0.33, random_state=42)

In [133]:
# Three ensemble classifiers for the 150-dimensional word2vec features, all seeded with random_state=24.
smallw2v_gb, smallw2v_rf, smallw2v_ada = (
    GradientBoostingClassifier(n_estimators=250, random_state=24),
    RandomForestClassifier(random_state=24),
    AdaBoostClassifier(random_state=24),
)

In [134]:
# Fit each classifier on the 150-dimensional word2vec features, print its F1 on the
# held-out split and record it for the final table.
for name, cl in [('smallw2v_gb', smallw2v_gb), ('smallw2v_rf', smallw2v_rf), ('smallw2v_ada', smallw2v_ada)]:
    y_pred = cl.fit(X_train, y_train).predict(X_test)
    score = f1_score(y_test, y_pred)
    print(score)
    for_final_table.append(' '.join((name, str(score))))

0.675862068966
0.593350383632
0.662222222222


In [135]:
# Print every recorded "<model> <F1>" entry followed by a separator line.
separator_line = '______________'
for item in for_final_table:
    print(item, separator_line, sep='\n')

w2v_gb 0.632352941176
______________
w2v_rf 0.55525606469
______________
w2v_ada 0.642685851319
______________
tf_gb 0.855421686747
______________
tf_rf 0.886554621849
______________
tf_ada 0.847107438017
______________
pca_tf_gb 0.915766738661
______________
pca_tf_rf 0.781038374718
______________
pca_tf_ada 0.91914893617
______________
smallw2v_gb 0.675862068966
______________
smallw2v_rf 0.593350383632
______________
smallw2v_ada 0.662222222222
______________
