In [140]:
import pandas as pd
import xml.etree.ElementTree as et
import numpy as np


def parse_XML(xml_file, df_cols): 
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []

    for node in xroot: 
        res = ["NULL",] * len(df_cols)
        for col in node:
            for idx, el in enumerate(df_cols): 
                if col.attrib.get("name") == el:
                    res[idx] = col.text
        rows.append(res)

    out_df = pd.DataFrame(rows, columns=df_cols)
    nrow, ncol = out_df.shape 
    target = [np.nan]*nrow
    for i in range(nrow):
        if out_df.loc[i,'beeline'] !='NULL':
            target[i] = out_df.loc[i,'beeline']
        elif out_df.loc[i,'mts'] !='NULL':
            target[i] = out_df.loc[i,'mts']
        elif out_df.loc[i,'megafon'] !='NULL':
            target[i] = out_df.loc[i,'megafon']
        elif out_df.loc[i,'tele2'] !='NULL':
            target[i] = out_df.loc[i,'tele2']
        elif out_df.loc[i,'rostelecom'] !='NULL':
            target[i] = out_df.loc[i,'rostelecom']
        elif out_df.loc[i,'komstar'] !='NULL':
            target[i] = out_df.loc[i,'komstar']
        elif out_df.loc[i,'skylink'] !='NULL':
            target[i] = out_df.loc[i,'skylink']
        else:
            target[i] = 'NAN'

    out_df = out_df.drop(columns = ["beeline","mts","megafon",
                        "tele2","rostelecom","komstar","skylink", "id", "twitid", "date"])
    out_df['target'] = target

    out_df = out_df.drop(out_df[out_df['target'].isin(['--','+-','NAN'])].index)
    out_df['target'] = out_df['target'].apply(pd.to_numeric, errors='coerce')
    return out_df


a = parse_XML("C://Users//Frederik//Desktop//train.xml", ["id", "twitid", "date", "text", "beeline", "mts", "megafon", "tele2", "rostelecom", "komstar","skylink"])
b = parse_XML("C://Users//Frederik//Desktop//test_etalon.xml", ["id", "twitid", "date", "text", "beeline", "mts", "megafon", "tele2", "rostelecom", "komstar","skylink"])

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
import string


def senttoterm(sents):
    res = []
    for sent in sents:
        terms = ""
        temp = sent.split()
        for j in temp:
            fl = True
            while fl:
                fl = False
                for p in string.punctuation:
                    if p in j:
                        j = j.replace(p, '')
                        fl = True
            if j != "":
                j = j.lower()
                q = morph.parse(j)[0].normal_form
                terms = terms + q + " " 
        res.append(terms)
    return res

In [None]:
                                                    SVM + frequencies

In [154]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [157]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [158]:
pred = clf.predict(Xtest)

In [166]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7237971391417425

In [165]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.4297352703595443
0.7237971391417426
0.6631867498170302


In [None]:
                                                    SVM + bool

In [169]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [170]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [171]:
pred = clf.predict(Xtest)

In [172]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7347204161248374

In [173]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.44612056379354836
0.7347204161248374
0.6780201481252234


In [None]:
                                                    SVM + tf-idf

In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [176]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [180]:
pred = clf.predict(Xtest)

In [181]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7368010403120936

In [182]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.46371162490258994
0.7368010403120936
0.6874216810497694


In [None]:
                                                    SVM + frequencies + stop-words

In [184]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [185]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [186]:
pred = clf.predict(Xtest)

In [187]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7263979193758128

In [188]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.43077481561825737
0.7263979193758129
0.665135345614521


In [None]:
                                                    SVM + bool + stop-words

In [189]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [190]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [191]:
pred = clf.predict(Xtest)

In [192]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.729518855656697

In [193]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.4353722378747153
0.729518855656697
0.6692032415126267


In [None]:
                                                    SVM + tf-idf + stop-words

In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [195]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [196]:
pred = clf.predict(Xtest)

In [197]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7282184655396619

In [None]:
                                                    SVM + bool

In [169]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [170]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [171]:
pred = clf.predict(Xtest)

In [172]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7347204161248374

In [173]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.44612056379354836
0.7347204161248374
0.6780201481252234


In [None]:
                                                    SVM + tf-idf

In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [176]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [180]:
pred = clf.predict(Xtest)

In [181]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7368010403120936

In [182]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.46371162490258994
0.7368010403120936
0.6874216810497694


In [None]:
                                                    SVM + frequencies + stop-words

In [184]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [185]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [186]:
pred = clf.predict(Xtest)

In [187]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7263979193758128

In [188]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.43077481561825737
0.7263979193758129
0.665135345614521


In [None]:
                                                    SVM + bool + stop-words

In [189]:
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [190]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [191]:
pred = clf.predict(Xtest)

In [192]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.729518855656697

In [193]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.4353722378747153
0.729518855656697
0.6692032415126267


In [None]:
                                                    SVM + tf-idf + stop-words

In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer
a["text"] = senttoterm(a["text"])
b["text"] = senttoterm(b["text"])
collection = list(a["text"]) + list(b["text"])

vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

In [195]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(Xtrain, a["target"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [196]:
pred = clf.predict(Xtest)

In [197]:
from sklearn.metrics import accuracy_score
accuracy_score(b["target"], pred)

0.7282184655396619

In [198]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.4476818555803108
0.7282184655396619
0.6741142455338077


In [198]:
from sklearn.metrics import f1_score
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.4476818555803108
0.7282184655396619
0.6741142455338077


In [None]:
                                                    RidgeClassifier + frequencies

In [200]:
from sklearn.linear_model import RidgeClassifier

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])


pred = clf.predict(Xtest)

print(accuracy_score(b["target"], prved))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.5638268662164495
0.6697009102730819
0.6854478190511815


In [None]:
                                                    RidgeClassifier + bool

In [201]:
vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6736020806241872
0.5651055599953573
0.6736020806241872
0.6887817145504974


In [None]:
                                                    RidgeClassifier + tf-idf

In [202]:
vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7003901170351106
0.5825209081961783
0.7003901170351106
0.7080310337938136


In [None]:
                                                    RidgeClassifier + frequencies + stop-words

In [203]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6660598179453836
0.5601947779400697
0.6660598179453836
0.6816936711188714


In [None]:
                                                    RidgeClassifier + bool + stop-words

In [204]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6676202860858258
0.5616735142347299
0.6676202860858258
0.6829790393569223


In [None]:
                                                   RidgeClassifier + tf-idf + stop-words

In [210]:
vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = RidgeClassifier().fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7003901170351106
0.5815393653880093
0.7003901170351106
0.7079427095366992


In [None]:
                                                     Tree + frequencies

In [207]:
from sklearn import tree

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.5906371911573473
0.4799496948239306
0.5906371911573473
0.6117509264028442


In [None]:
                                                      Tree + bool

In [208]:
vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.5791937581274382
0.46528366225292844
0.5791937581274382
0.6037415140175968


In [None]:
                                                    Tree + Tfidf

In [209]:
vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.565409622886866
0.4559360085401467
0.565409622886866
0.5870514778051633


In [None]:
                                                    Tree + frequencies + stop-words

In [212]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.5992197659297789
0.49336183475132955
0.5992197659297789
0.6210901160643497


In [None]:
                                                        Tree + bool + stop-words

In [213]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.5950585175552666
0.4837167418957206
0.5950585175552666
0.6168429078789522


In [None]:
                                                        Tree + TfIdf + stop-words

In [214]:
vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = tree.DecisionTreeClassifier()
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6007802340702211
0.4865718468696327
0.6007802340702211
0.6210005076325182


In [None]:
                                                        XGBoost + frequencies

In [224]:
from sklearn import ensemble

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6884265279583875
0.3634710121900114
0.6884265279583875
0.6056726235213612


In [None]:
                                                      XGBoost + bool

In [225]:
vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6889466840052015
0.3703134082711124
0.6889466840052015
0.6096847564034826


In [None]:
                                                    XGBoost + Tfidf

In [226]:
vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6918075422626788
0.38032800081563334
0.6918075422626788
0.614498197665143


In [None]:
                                                    XGBoost + frequencies + stop-words

In [227]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6912873862158647
0.3685233582159013
0.6912873862158647
0.6091184756719872


In [None]:
                                                        XGBoost + bool + stop-words

In [228]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.694148244473342
0.37454798917718035
0.694148244473342
0.6132775855846775


In [None]:
                                                        XGBoost + TfIdf + stop-words

In [229]:
vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

reg = ensemble.GradientBoostingRegressor()
reg.fit(Xtrain, a["target"])

pred = list(map(round,reg.predict(Xtest)))

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6918075422626788
0.3724564775541279
0.6918075422626788
0.6114878169798192


In [None]:
                                                      SGD Classifier + frequencies

In [230]:
from sklearn.linear_model import SGDClassifier

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7313394018205461
0.5088761553480652
0.7313394018205462
0.6981637951723694


In [None]:
                                                      SGD Classifier + bool

In [231]:
vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7071521456436931
0.4099224065367462
0.7071521456436931
0.6370721681057432


In [None]:
                                                    SGD Classifier + Tfidf

In [232]:
vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7396618985695709
0.5245265791503186
0.739661898569571
0.7098653550314208


In [None]:
                                                    SGD Classifier + frequencies + stop-words

In [233]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7006501950585176
0.38215057504776556
0.7006501950585176
0.6188659060571302


In [None]:
                                                        SGD Classifier + bool + stop-words

In [234]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.7368010403120936
0.5285048014349959
0.7368010403120936
0.7107080634026807


In [None]:
                                                        SGD Classifier + TfIdf + stop-words

In [235]:
vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(Xtrain, a["target"])

pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6918075422626788
0.3500206022933296
0.6918075422626788
0.5954800117721268


In [None]:
                                                      MLP Classifier + frequencies

In [236]:
from sklearn.neural_network import MLPClassifier

vectorizer = CountVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [237]:
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6291287386215865
0.5378499041397239
0.6291287386215865
0.6567249513129668


In [None]:
                                                      MLP Classifier + bool

In [238]:
vectorizer = CountVectorizer(binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.635110533159948
0.5429267937379866
0.635110533159948
0.6623129023305587


In [None]:
                                                    MLP Classifier + Tfidf

In [239]:
vectorizer = TfidfVectorizer()
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6416124837451236
0.5378496077121965
0.6416124837451236
0.6631861901769334


In [None]:
                                                    MLP Classifier + frequencies + stop-words

In [240]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6338101430429128
0.5399253812786872
0.6338101430429128
0.6608159627347614


In [None]:
                                                        MLP Classifier + bool + stop-words

In [241]:
vectorizer = CountVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"], binary = True)
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6361508452535761
0.5440000499112531
0.6361508452535761
0.6614095567913024


In [None]:
                                                        MLP Classifier + TfIdf + stop-words

In [242]:
vectorizer = TfidfVectorizer(stop_words = ["он", "мы", "его", "вы", "вам", "вас", "ее", "что", "который", "их", "все", "они", "я", "весь", "мне", "меня", "таким", "для", "на", "по", "со", "из", "от", "до", "без", "над", "под", "за", "при", "после", "во", "же", "то", "бы", "всего", "итого", "даже", "да"])
lems = vectorizer.fit_transform(collection).toarray()
Xtrain = lems[0:len(a["text"])]
Xtest = lems[len(a["text"])::]

clf = MLPClassifier().fit(Xtrain, a["target"])
pred = clf.predict(Xtest)

print(accuracy_score(b["target"], pred))
print(f1_score(b["target"], pred, average='macro'))
print(f1_score(b["target"], pred, average='micro'))
print(f1_score(b["target"], pred, average='weighted'))

0.6330299089726918
0.5328338621843701
0.6330299089726918
0.6566076523353429
