# Часть 1. Сентимент-анализ базы твитов RuTweetCorp

## Загрузим и изучим данные 

In [1]:
import pandas as pd
import numpy as np

In [2]:
columns = ['id', 'tdate', 'tmane', 'ttext', 'ttype', 'trep', 'trtw', 'tfav', 'tstcount', 'tfol', 'tfrien', 'listcount']
pos = pd.read_csv('positive.csv', sep=';', names=columns)
print(pos.shape)
neg = pd.read_csv('negative.csv', sep=';', names=columns)
print(neg.shape)


(114911, 12)
(111923, 12)


Размер выборок положительного и отрицательного окраса почти одинаков, нам не нужно будет что-то с этим делать.


Посмотрим, нет ли в выборках дупликатов:

In [3]:
len(pos['ttext'].value_counts()), len(neg['ttext'].value_counts())

(110396, 107044)

Есть несколько тысяч в каждой выборке, избавимся от них

In [4]:
pos = pos.drop_duplicates(subset=['ttext'])
print(pos.shape)
neg = neg.drop_duplicates(subset=['ttext'])
print(neg.shape)

(110396, 12)
(107044, 12)


Построим простейшую модель на связке "мешка слов" и логистической регрессии, которая будет нашим baseline'ом.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import f1_score

import numpy as np
import re

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(pos['ttext'].append(neg['ttext']))
y = pos['ttype'].append(neg['ttype'])
clf=LogisticRegression(solver='lbfgs')
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')

print(np.mean(scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.7398756959058405


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Итак, наш baseline -- 0.74

Посмотрим на данные поближе. 

In [6]:
pos.head()

Unnamed: 0,id,tdate,tmane,ttext,ttype,trep,trtw,tfav,tstcount,tfol,tfrien,listcount
0,408906692374446080,1386325927,pleease_shut_up,"@first_timee хоть я и школота, но поверь, у на...",1,0,0,0,7569,62,61,0
1,408906692693221377,1386325927,alinakirpicheva,"Да, все-таки он немного похож на него. Но мой ...",1,0,0,0,11825,59,31,2
2,408906695083954177,1386325927,EvgeshaRe,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1,0,1,0,1273,26,27,0
3,408906695356973056,1386325927,ikonnikova_21,"RT @digger2912: ""Кто то в углу сидит и погибае...",1,0,1,0,1549,19,17,0
4,408906761416867842,1386325943,JumpyAlex,@irina_dyshkant Вот что значит страшилка :D\nН...,1,0,0,0,597,16,23,1


Надо удалить все тегирования пользователей, хештеги, ретвиты, ссылки и системные символы.
Радостные смайлы я заменю на признак good_flag, а расстроенные на bad_flag. 


In [7]:
def cleaner(documents):
    """
    Replaces urls, hashtags and usernames with spaces.
    Replaces smiles with special tags.
    :param documents: list: of str
    :returns docs: list: of str
    """
    docs = list()

    for doc in documents:
        text = re.sub("(@\w+)|(#\w+)", " ", doc.lower())
    
        text = re.sub("\n", " ", text)
        text = re.sub("(\w+:\/\/\S+)", " ", text)
        text = re.sub("rt ", " ", text)
        text = re.sub(" rt ", " ", text)
        text = re.sub(":\(", " bad_flag ", text)
        text = re.sub("\(\(+", " bad_flag ", text)
        text = re.sub("99+", " bad_flag ", text)
        text = re.sub("0_0", " bad_flag ", text)
        text = re.sub("o_o", " bad_flag ", text)
        text = re.sub("о_о", " bad_flag ", text)
        text = re.sub(":-\(", " bad_flag ", text)
        text = re.sub("=\(", " bad_flag ", text)
        text = re.sub(" \(", " bad_flag ", text)
#         text = re.sub("\(", " bad_flag ", text)
        text = re.sub(";\)", " good_flag ", text)
        text = re.sub(":d+", " good_flag ", text)
        text = re.sub("\=\)+", " good_flag ", text)
        text = re.sub("\)+", " good_flag ", text)
        text = re.sub(":\)", " good_flag ", text)
        
        text = re.sub(r'[^\w\s]','' , text)
        

        text = text.strip()
        docs.append(text)

    return docs


def get_features_by_weights(vectorizer, model, number_of_features=10):
    ## get feature weights
    feature_names = vectorizer.get_feature_names()
    coefs = model.coef_[0]
#     print(np.argsort(coefs))
    top_features_pos = np.argsort(coefs)[-number_of_features:]
    top_features_neg = np.argsort(coefs)[:number_of_features]
    features = [(feature_names[pos_feature_place], feature_names[neg_feature_place])  for pos_feature_place, neg_feature_place in zip(top_features_pos[::-1], top_features_neg)]
    return features



In [8]:
pos['ttext'] = cleaner(pos['ttext'])
neg['ttext'] = cleaner(neg['ttext'])


In [9]:
pos = pos.drop_duplicates(subset=['ttext'])
print(pos.shape)
neg = neg.drop_duplicates(subset=['ttext'])
print(neg.shape)

(107481, 12)
(104008, 12)


In [10]:
X = vectorizer.fit_transform(pos['ttext'].append(neg['ttext']))
y = pos['ttype'].append(neg['ttype'])

clf = LogisticRegression(solver='lbfgs')
scores = cross_val_score(clf, X, y, cv=3, scoring='f1_macro')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
model = clf.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
np.mean(scores)

0.9845993440438895

Результат намного лучше baseline'а. Посмотрим для интереса на feature importance

In [13]:
get_features_by_weights(vectorizer, model)

[('good_flag', 'bad_flag'),
 ('gtd', 'сожалению'),
 ('обожаю', 'боюсь'),
 ('поржали', 'обидно'),
 ('скажу', 'жалко'),
 ('приятно', 'походу'),
 ('рада', 'печально'),
 ('любимая', 'печаль'),
 ('ddd', 'стыдно'),
 ('люблю', 'увы')]

Получается, что смайл является самым важным параметром и в отрицательных и в положительных сообщениях. Изучим это чуть подробнее.

In [14]:
data_train = pd.concat([pos[['ttext', 'ttype']], neg[['ttext', 'ttype']]])
# data_train.head()
data_train.rename(columns={"ttext": "body"}, inplace = True)
data_train.rename(columns={"ttype": "label"}, inplace = True)

In [15]:
print("Количество записей, в которых нет смайла:", data_train.loc[(~data_train['body'].str.contains('good_flag')) & (~data_train['body'].str.contains('bad_flag'))].shape[0] / data_train.shape[0])

Количество записей, в которых нет смайла: 0.17232574743840104


Получается, что смайл присутствует в 83% твитов.  

Посмотрим на корреляцию наличия смайла и метки сентимента

In [16]:
def what_smile(body):
    if ('good_flag' in body) & ('bad_flag' in body):
        return 0
    elif 'good_flag' in body:
        return 1
    elif 'bad_flag' in body:
        return -1
    else:
        return 0


In [17]:
data_train['what_smile'] = data_train['body'].apply(what_smile)

In [18]:
data_train.corr()

Unnamed: 0,label,what_smile
label,1.0,0.919745
what_smile,0.919745,1.0


Корреляция почти 90%, наличие смайла очень сильный признак.

Что ж, самое время пройтись по сетке параметров и найти лучшую модель.

## Поиск лучшей модели

In [19]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer



In [20]:
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier()),
])


param_grid = [    
    {
        'vect': [TfidfVectorizer()],
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__ngram_range': ((1, 1), (1, 2)),
    },
    {
        'vect': [CountVectorizer()],
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'vect__ngram_range': ((1, 1), (1, 2))
#         'regr__alpha': np.logspace(-4, 1, 6),
    },

    {
        'clf': [SGDClassifier()],
        'clf__max_iter': (20, 25),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
    },
    {
        'clf': [LogisticRegression()],
        'clf__C': np.logspace(-3,3, 4),
        'clf__penalty': ('l2', 'l2'),
    },
    {   'clf': [XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)],
        'clf__min_child_weight': [0.5, 1],
        'clf__gamma': [5, 10],
        'clf__subsample': [0.7, 1.0],
        'clf__colsample_bytree': [0.5, 1.0],
        'clf__max_depth': [8, 12]
        }

]



In [21]:
from sklearn.metrics import f1_score, make_scorer

f1 = make_scorer(f1_score , average='macro')

In [22]:
grid_search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=1, scoring=f1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe.steps])
print("parameters:")
print(param_grid)
grid_search.fit(data_train['body'], data_train['label'])

print("Best score: %0.6f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
print(best_parameters)

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
[{'vect': [TfidfVectorizer()], 'vect__max_df': (0.25, 0.5, 0.75), 'vect__ngram_range': ((1, 1), (1, 2))}, {'vect': [CountVectorizer()], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000), 'vect__ngram_range': ((1, 1), (1, 2))}, {'clf': [SGDClassifier()], 'clf__max_iter': (20, 25), 'clf__alpha': (1e-05, 1e-06), 'clf__penalty': ('l2', 'elasticnet')}, {'clf': [LogisticRegression()], 'clf__C': array([1.e-03, 1.e-01, 1.e+01, 1.e+03]), 'clf__penalty': ('l2', 'l2')}, {'clf': [XGBClassifier(alpha=10, colsample_bytree=0.3, max_depth=5, n_estimators=10)], 'clf__min_child_weight': [0.5, 1], 'clf__gamma': [5, 10], 'clf__subsample': [0.7, 1.0], 'clf__colsample_bytree': [0.5, 1.0], 'clf__max_depth': [8, 12]}]
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 51.2min finished


Best score: 0.984594
Best parameters set:
{'memory': None, 'steps': [('vect', CountVectorizer()), ('clf', SGDClassifier(alpha=1e-05, max_iter=25, penalty='elasticnet'))], 'verbose': False, 'vect': CountVectorizer(), 'clf': SGDClassifier(alpha=1e-05, max_iter=25, penalty='elasticnet'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'clf__alpha': 1e-05, 'clf__average': False, 'clf__class_weight': None, 'clf__early_stopping': False, 'clf__epsilon': 0.1, 'clf__eta0': 0.0, 'clf__fit_intercept': True, 'clf__l1_ratio': 0.15, 'clf__learning_rate': 'optimal', 'clf__loss': 'hing

Итак, лучшая текущая модель дала нам f1-score: 0.984594

Попробуем теперь дообучить какую-нибудь нейронную сеть, например fasttext

In [23]:
from fasttext import train_supervised, load_model
from sklearn.model_selection import StratifiedKFold


In [24]:
models = {}
params = dict()
params['epoch'] = [30, 40]
params['lr'] = [1, 0.1]

params['min_count'] = [5, 10]

params['word_ngrams'] = [1, 2]
params['dim'] = [100, 200]
params['loss'] = ['softmax']

model_results = pd.DataFrame(columns=['model', 'type', 'precision', 'recall', 'f1-score'])
types = ['pos', 'neg']


Отформатируем данные для использования моделью FastText и разделим их для дальнейшей кросс-валидации

In [25]:
def fasttext_classification_metrics(target_class, predicted_class):
    result = metrics.classification_report(target_class, predicted_class, target_names=['pos', 'neg'], output_dict=True)
    return result


df = data_train


new_folds = []

skf = StratifiedKFold(n_splits=3, shuffle=True)
folds = skf.get_n_splits(df['body'], df['label'])
for train_index, test_index in skf.split(data_train['body'], data_train['label']):
    new_folds.append([df.iloc[train_index], df.iloc[test_index]])

# print(new_folds)

folds = new_folds


kfold_metrics = []

#paths to write files in fasttext format
train_path = 'data_train'
test_path = 'data_test'


In [26]:
def to_ft_format(documents, labels, file_path):
    """
    Converts and save a dataset to fasttext compliant training format.
    
    :param documents: list: of str
    :param labels: list: of str/int
    :file_path: str
    """
    
    # Add mandatory "__label__" prefix to the labels as required by fasttext
    labels = ["__label__"+str(label) for label in labels]
    
    # clean up documents
    documents = cleaner(documents)
    
    with open(file_path, 'w', encoding="utf-8") as f:
        for doc, label in zip(documents, labels):
            f.write(label + " " + doc + "\n")
            
    print("Output file with %d samples saved at location: %s"%(len(labels), file_path))

In [27]:
to_ft_format([x for x in data_train['body']], [x for x in data_train['label']], 'data_train.txt')


Output file with 211489 samples saved at location: data_train.txt


К сожалению, в conda нет GridSearch для FastText, поэтому просто переберём параметры вручную по нескольким фолдам, подготовленным ранее. 

In [28]:
#model = train_supervised(input='data_train.txt',
#     loss='softmax', minCount=5, dim=100, lr=0.15, ws=5, epoch = 40, wordNgrams=4, thread=4)

model_results = pd.DataFrame()
modelnum = 1

for dim in params['dim']:
    for param_loss in params['loss']:
        for epoch in params['epoch']:
            for lr in params['lr']:
                for min_count in params['min_count']:
                    for word_ngrams in params['word_ngrams']:
                        kfold_metrics = []
                        for fold in folds:
                            train_df = fold[0]
                            test_df = fold[1]
                            to_ft_format(train_df['body'], train_df['label'], train_path)
                            to_ft_format(test_df['body'], test_df['label'], test_path)
                            print(epoch, lr, min_count, word_ngrams)
                            model = train_supervised(input=train_path, loss=param_loss, minCount=min_count,  dim=dim, lr=lr, ws=5, epoch = epoch, wordNgrams=word_ngrams, thread=8)
                            modelnum +=1
                            
                            test_X = cleaner(test_df['body'])
                            predictions = [model.predict(doc)[0][0].replace("__label__", "") for doc in test_X]
                            
                            result = metrics.classification_report(test_df['label'].astype(str), predictions, target_names=('pos', 'neg'), output_dict=True)
                            kfold_metrics.append(result)
                        print(kfold_metrics)
#                         for label in types:
#                         cross_precision = np.mean([elem[label]['precision'] for elem in kfold_metrics])
#                         cross_recall = np.mean([elem[label]['recall'] for elem in kfold_metrics])
                        cross_f1_score = np.mean([elem['macro avg']['f1-score'] for elem in kfold_metrics])
                        current_model = pd.DataFrame([["model_num:" + str(model) + 'epoch:' + str(epoch) + 'lr:' + str(lr)  + ' ,min_count:' + str(min_count) + ' ,word_ngrams:' + str(word_ngrams) + ' ,loss:' + str(param_loss) + ' ,dim:' + str(dim), cross_f1_score]], columns=['model', 'f1-score'])
                        model_results = pd.concat([model_results, current_model])

models

Output file with 140992 samples saved at location: data_train
Output file with 70497 samples saved at location: data_test
30 1 5 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
30 1 5 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
30 1 5 1
[{'pos': {'precision': 0.9808549170667203, 'recall': 0.9841649841361407, 'f1-score': 0.982507162704983, 'support': 34670}, 'neg': {'precision': 0.9846261551386166, 'recall': 0.9814106679320066, 'f1-score': 0.983015782042859, 'support': 35827}, 'accuracy': 0.9827652240520873, 'macro avg': {'precision': 0.9827405361026684, 'recall': 0.9827878260340737, 'f1-score': 0.982761472373921, 'support': 70497}, 'weighted avg': {'precision': 0.9827714829688414, 'recall': 0.9827652240520873, 'f1-score': 0.982765646115881, 'support': 70497}}, {'pos': {'precision': 0.983101105481345, 'recall': 0.9850010095474343, 'f

Output file with 70496 samples saved at location: data_test
30 0.1 5 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
30 0.1 5 1
[{'pos': {'precision': 0.9813750287422396, 'recall': 0.9848283818863571, 'f1-score': 0.9830986726555528, 'support': 34670}, 'neg': {'precision': 0.9852681697241282, 'recall': 0.9819130823122226, 'f1-score': 0.983587764916401, 'support': 35827}, 'accuracy': 0.9833468090840745, 'macro avg': {'precision': 0.9833215992331839, 'recall': 0.9833707320992899, 'f1-score': 0.983343218785977, 'support': 70497}, 'weighted avg': {'precision': 0.983353546437434, 'recall': 0.9833468090840745, 'f1-score': 0.9833472322882948, 'support': 70497}}, {'pos': {'precision': 0.9847967102976949, 'recall': 0.9809051313853875, 'f1-score': 0.9828470686839785, 'support': 34669}, 'neg': {'precision': 0.981592703814926, 'recall': 0.9853462472436989, 'f1-score': 0.9834658940535721, 'support': 35827}, 'accuracy': 0.983

[{'pos': {'precision': 0.9810576618591128, 'recall': 0.9829535621574849, 'f1-score': 0.9820046969325861, 'support': 34670}, 'neg': {'precision': 0.9834731543624161, 'recall': 0.9816339632121026, 'f1-score': 0.9825526981155797, 'support': 35827}, 'accuracy': 0.9822829340255613, 'macro avg': {'precision': 0.9822654081107645, 'recall': 0.9822937626847937, 'f1-score': 0.9822786975240829, 'support': 70497}, 'weighted avg': {'precision': 0.9822852296976854, 'recall': 0.9822829340255613, 'f1-score': 0.982283194434368, 'support': 70497}}, {'pos': {'precision': 0.9840434757472394, 'recall': 0.9819146788196949, 'f1-score': 0.9829779247217129, 'support': 34669}, 'neg': {'precision': 0.9825357918778899, 'recall': 0.9845926256733748, 'f1-score': 0.9835631334606645, 'support': 35827}, 'accuracy': 0.9832756468452111, 'macro avg': {'precision': 0.9832896338125646, 'recall': 0.9832536522465348, 'f1-score': 0.9832705290911887, 'support': 70496}, 'weighted avg': {'precision': 0.9832772508552287, 'recall'

Output file with 140992 samples saved at location: data_train
Output file with 70497 samples saved at location: data_test
40 0.1 5 2
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
40 0.1 5 2
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
40 0.1 5 2
[{'pos': {'precision': 0.979264176098488, 'recall': 0.9957311796942602, 'f1-score': 0.9874290290748394, 'support': 34670}, 'neg': {'precision': 0.9958007036658723, 'recall': 0.9795963937812264, 'f1-score': 0.9876320862236355, 'support': 35827}, 'accuracy': 0.9875313843142262, 'macro avg': {'precision': 0.9875324398821801, 'recall': 0.9876637867377434, 'f1-score': 0.9875305576492375, 'support': 70497}, 'weighted avg': {'precision': 0.9876681390069335, 'recall': 0.9875313843142262, 'f1-score': 0.9875322239408609, 'support': 70497}}, {'pos': {'precision': 0.9810650214653285, 'recall': 0.9953272375

Output file with 70496 samples saved at location: data_test
30 1 5 2
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
30 1 5 2
[{'pos': {'precision': 0.9796434046902505, 'recall': 0.9952408422267089, 'f1-score': 0.9873805299605105, 'support': 34670}, 'neg': {'precision': 0.9953224663359319, 'recall': 0.9799871605213945, 'f1-score': 0.9875952856459733, 'support': 35827}, 'accuracy': 0.9874888293118856, 'macro avg': {'precision': 0.9874829355130912, 'recall': 0.9876140013740518, 'f1-score': 0.9874879078032419, 'support': 70497}, 'weighted avg': {'precision': 0.9876115982528111, 'recall': 0.9874888293118856, 'f1-score': 0.9874896700933257, 'support': 70497}}, {'pos': {'precision': 0.9814081940608718, 'recall': 0.9942600017306528, 'f1-score': 0.9877922971114168, 'support': 34669}, 'neg': {'precision': 0.9943742402397309, 'recall': 0.9817735227621626, 'f1-score': 0.9880337078651686, 'support': 35827}, 'accuracy': 0.987

[{'pos': {'precision': 0.979264176098488, 'recall': 0.9957311796942602, 'f1-score': 0.9874290290748394, 'support': 34670}, 'neg': {'precision': 0.9958007036658723, 'recall': 0.9795963937812264, 'f1-score': 0.9876320862236355, 'support': 35827}, 'accuracy': 0.9875313843142262, 'macro avg': {'precision': 0.9875324398821801, 'recall': 0.9876637867377434, 'f1-score': 0.9875305576492375, 'support': 70497}, 'weighted avg': {'precision': 0.9876681390069335, 'recall': 0.9875313843142262, 'f1-score': 0.9875322239408609, 'support': 70497}}, {'pos': {'precision': 0.9813945549202014, 'recall': 0.9950387954656899, 'f1-score': 0.9881695789172157, 'support': 34669}, 'neg': {'precision': 0.9951336822747207, 'recall': 0.9817456108521506, 'f1-score': 0.9883943123700332, 'support': 35827}, 'accuracy': 0.9882830231502496, 'macro avg': {'precision': 0.988264118597461, 'recall': 0.9883922031589203, 'f1-score': 0.9882819456436245, 'support': 70496}, 'weighted avg': {'precision': 0.9883769612373028, 'recall':

Output file with 140992 samples saved at location: data_train
Output file with 70497 samples saved at location: data_test
40 1 10 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
40 1 10 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
40 1 10 1
[{'pos': {'precision': 0.9813546642113138, 'recall': 0.983732333429478, 'f1-score': 0.9825420603825767, 'support': 34670}, 'neg': {'precision': 0.9842206865679993, 'recall': 0.9819130823122226, 'f1-score': 0.9830655302501049, 'support': 35827}, 'accuracy': 0.9828077790544278, 'macro avg': {'precision': 0.9827876753896565, 'recall': 0.9828227078708502, 'f1-score': 0.9828037953163409, 'support': 70497}, 'weighted avg': {'precision': 0.9828111940348946, 'recall': 0.9828077790544278, 'f1-score': 0.982808090922088, 'support': 70497}}, {'pos': {'precision': 0.9828921978053627, 'recall': 0.98436643687444

Output file with 70496 samples saved at location: data_test
40 0.1 10 1
Output file with 140993 samples saved at location: data_train
Output file with 70496 samples saved at location: data_test
40 0.1 10 1
[{'pos': {'precision': 0.9807951772642457, 'recall': 0.9854629362561292, 'f1-score': 0.9831235162937918, 'support': 34670}, 'neg': {'precision': 0.9858673097414615, 'recall': 0.9813269322019705, 'f1-score': 0.9835918812684469, 'support': 35827}, 'accuracy': 0.9833609940848547, 'macro avg': {'precision': 0.9833312435028536, 'recall': 0.9833949342290499, 'f1-score': 0.9833576987811193, 'support': 70497}, 'weighted avg': {'precision': 0.9833728655383739, 'recall': 0.9833609940848547, 'f1-score': 0.983361542194851, 'support': 70497}}, {'pos': {'precision': 0.9828786832412523, 'recall': 0.9852317632467046, 'f1-score': 0.9840538165684736, 'support': 34669}, 'neg': {'precision': 0.9856759176365264, 'recall': 0.9833924135428588, 'f1-score': 0.984532841514021, 'support': 35827}, 'accuracy': 0

{}

In [31]:
model_results.sort_values(by=['f1-score'], ascending=False)

Unnamed: 0,model,f1-score
0,model_num:<fasttext.FastText._FastText object ...,0.987677
0,model_num:<fasttext.FastText._FastText object ...,0.987616
0,model_num:<fasttext.FastText._FastText object ...,0.987611
0,model_num:<fasttext.FastText._FastText object ...,0.987606
0,model_num:<fasttext.FastText._FastText object ...,0.987601
0,model_num:<fasttext.FastText._FastText object ...,0.987597
0,model_num:<fasttext.FastText._FastText object ...,0.987573
0,model_num:<fasttext.FastText._FastText object ...,0.987511
0,model_num:<fasttext.FastText._FastText object ...,0.987488
0,model_num:<fasttext.FastText._FastText object ...,0.987436


Итого, лучшая FastText модель дала нам f1-score 0.987677, а лучшая классическая модель -- 0.984594


В дальнейшем можно будет попытаться улучшить модель на основе использования мета-параметров:
* Количеста хештегов в сообщении
* Времени постинга твита(утро-день-вечер-ночь)
* Длины твита

итд

Также, можно будет попробовать дообучить BERT