# Tweets

In [110]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostClassifier

import re


## Data preparation

In [61]:
OUTPUT_SEPARATOR = 'ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡'

TYPE_NEGATIVE = -1
TYPE_NEUTRAL = 0
TYPE_POSITIVE = 1

negative_df = pd.DataFrame(pd.read_csv('./data/processedNegative.csv').columns, columns=['text'])
neutral_df = pd.DataFrame(pd.read_csv('./data/processedNeutral.csv').columns, columns=['text'])
positive_df = pd.DataFrame(pd.read_csv('./data/processedPositive.csv').columns, columns=['text'])

print(f'negative_df shape: {negative_df.shape}')
print(f'neutral_df shape: {neutral_df.shape}')
print(f'positive_df shape: {positive_df.shape}')

negative_df['type'] = TYPE_NEGATIVE
neutral_df['type'] = TYPE_NEUTRAL
positive_df['type'] = TYPE_POSITIVE
df = pd.concat([negative_df, positive_df, neutral_df])

print(f'df shape: {df.shape}')
print('count types')
print(df['type'].value_counts())

negative_df shape: (1117, 1)
neutral_df shape: (1570, 1)
positive_df shape: (1186, 1)
df shape: (3873, 2)
count types
 0    1570
 1    1186
-1    1117
Name: type, dtype: int64


### Drop duplicates

In [63]:
def drop_duplications_preparation(tweet):
    tweet_wituot_digits = re.sub(r"\d+", '', tweet.lower())
    tweet_wituot_digits_and_punctuation = re.sub(r'[^\w\s]', '', tweet_wituot_digits)
    tweet_wituot_digits_and_punctuation_and_add_spaces =  re.sub(' +', ' ', tweet_wituot_digits_and_punctuation)
    return tweet_wituot_digits_and_punctuation_and_add_spaces.strip()

df['text'] = df['text'].apply(drop_duplications_preparation)

In [64]:
df = df.drop_duplicates()

print(f'df shape: {df.shape}')
print('count types')
print(df['type'].value_counts())

df shape: (3417, 2)
count types
 0    1470
-1     974
 1     973
Name: type, dtype: int64


In [65]:
train_df, test_df = train_test_split(df, test_size=0.2)

print('train_df count types')
print(train_df.shape)
print(train_df['type'].value_counts())
print('test_df count types')
print(test_df.shape)
print(test_df['type'].value_counts())

train_df count types
(2733, 2)
 0    1184
 1     779
-1     770
Name: type, dtype: int64
test_df count types
(684, 2)
 0    286
-1    204
 1    194
Name: type, dtype: int64


### Crete preprocessing methods

In [66]:
%%time
from tokenizer import remove_stop_words, stem_tokens, lemmatize_tokens, spell_tokens, get_tokenizer


preprocessing_methods = {
    "Just tokenized":                                     [],
    "Stemmed":                                            [stem_tokens],
    "Stemmed (stopword removed)":                         [stem_tokens, remove_stop_words],
    "Lemmatized":                                         [lemmatize_tokens],
    "Lemmatized (stopword removed)":                      [lemmatize_tokens, remove_stop_words],
    "Spelled":                                            [spell_tokens],
    "Spelled (stopword removed)":                         [spell_tokens, remove_stop_words],
    "Spelled and lemmatized":                             [spell_tokens, lemmatize_tokens],
    "Spelled and lemmatized (stopword removed)":          [spell_tokens, lemmatize_tokens, remove_stop_words],
    "Spelled and stemmed":                                [spell_tokens, stem_tokens],
    "Spelled and stemmed (stopword removed)":             [spell_tokens, stem_tokens, remove_stop_words],
    "Spelled, stemmed and lemmatized":                    [spell_tokens, stem_tokens, lemmatize_tokens],
    "Spelled, stemmed and lemmatized (stopword removed)": [spell_tokens, stem_tokens, lemmatize_tokens, remove_stop_words],
}

CPU times: user 18 µs, sys: 1 µs, total: 19 µs
Wall time: 24.1 µs


### Preprocessing example

In [67]:
tweet = df.iloc[1]
print(f'Raw tweet (type {tweet["type"]}): {tweet["text"]}')

for name, methods in preprocessing_methods.items():
    tokenizer = get_tokenizer(methods)
    data = tokenizer(tweet['text'])
    print(f'{name}: {data}')

Raw tweet (type -1): talking to my over driver about where im goinghe said hed love to go to new york too but since trump its probably not
Just tokenized: ['talking', 'to', 'my', 'over', 'driver', 'about', 'where', 'im', 'goinghe', 'said', 'hed', 'love', 'to', 'go', 'to', 'new', 'york', 'too', 'but', 'since', 'trump', 'its', 'probably', 'not']
Stemmed: ['talk', 'to', 'my', 'over', 'driver', 'about', 'where', 'im', 'goingh', 'said', 'hed', 'love', 'to', 'go', 'to', 'new', 'york', 'too', 'but', 'sinc', 'trump', 'it', 'probabl', 'not']
Stemmed (stopword removed): ['talk', 'driver', 'im', 'goingh', 'said', 'hed', 'love', 'go', 'new', 'york', 'sinc', 'trump', 'probabl']
Lemmatized: ['talking', 'to', 'my', 'over', 'driver', 'about', 'where', 'im', 'goinghe', 'said', 'hed', 'love', 'to', 'go', 'to', 'new', 'york', 'too', 'but', 'since', 'trump', 'it', 'probably', 'not']
Lemmatized (stopword removed): ['talking', 'driver', 'im', 'goinghe', 'said', 'hed', 'love', 'go', 'new', 'york', 'since', '

### Data preprocessing ////todo delete

In [None]:
%%time
preprocessed_data = dict()

for name, methods in preprocessing_methods.items():
    tokenizer = get_tokenizer(methods)
    preprocessed_data[name] = [tokenizer(tweet) for tweet in train_df['text']]

## 10 similar pairs of tweets

In [84]:
%%time
prepared_vectorizers_transforms = dict()

for name, methods in preprocessing_methods.items():
    tokenizer = get_tokenizer(methods)
    vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))
    prepared_vectorizers_transforms[name] = vectorizer.fit_transform(df['text'])

CPU times: user 14min 39s, sys: 1.69 s, total: 14min 41s
Wall time: 14min 46s


In [101]:
%%time

def find_top_similarity_tweets(similarity_tweets_matrix):
    N = 10
    i = 0

    similarity_index_array = np.zeros(shape=(len(similarity_tweets_matrix)), dtype=np.int16)
    similarity_value_array = np.zeros(shape=(len(similarity_tweets_matrix)))

    while i < len(similarity_tweets_matrix):
        array = np.array(similarity_tweets_matrix[i])
        array[i] = 0.0
        array[array >= 1.0] = 0.0
        max_index = np.argmax(array)
        similarity_index_array[i] = max_index
        similarity_value_array[i] = array[max_index]
        i += 1

    top_indexes = np.argpartition(similarity_value_array, -N)[-N:]

    for index in top_indexes:
        print(similarity_value_array[index])
        print(df.iloc[index]['text'])
        print(df.iloc[similarity_index_array[index]]['text'])
        print('')


for name, transform in prepared_vectorizers_transforms.items():
    print(f'{name}\n')
    similarity_matrix = cosine_similarity(transform)
    find_top_similarity_tweets(similarity_matrix)
    print(f'{OUTPUT_SEPARATOR}\n')

Just tokenized

0.9549145190462752
share the love thanks for being top new followers this week happy want this
share the love thanks for being top new followers this week happy want it

0.9801880845327963
hey thanks for being top new followers this week much appreciated happy want this
hey thanks for being my top new followers this week much appreciated happy want this

0.9626966357693176
and more also in the epaper
and more also in epaper

0.961753097518891
tamil nadu
in tamil nadu

0.9805362530533835
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for cont

0.9944618246693316
hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for cont
hi we tried to call your number but got no response unhappy please share another s

### Choose best preprocessing method

In [53]:
def choose_best_preprocessing_method(methods_dict):
    best_method_name = ''
    best_vectorizer = ''
    best_accuracy = 0
    for name, methods in methods_dict.items():
        tokenizer = get_tokenizer(methods)

        bin_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x), binary=True)
        count_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x))
        tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))

        model = RandomForestClassifier(
            class_weight='balanced',
            criterion='entropy',
            max_depth=170,
            n_estimators=200,
            random_state=42
        )

        grid_pipline_bin_vec = Pipeline([
            ('vectorizer', bin_vectorizer),
            ('model', model),
        ])

        grid_pipline_count_vec = Pipeline([
            ('vectorizer', count_vectorizer),
            ('model', model),
        ])

        grid_pipline_tfidf_vec = Pipeline([
            ('vectorizer', tfidf_vectorizer),
            ('model', model),
        ])

        grid_pipline_bin_vec.fit(train_df['text'], train_df['type'])
        grid_pipline_count_vec.fit(train_df['text'], train_df['type'])
        grid_pipline_tfidf_vec.fit(train_df['text'], train_df['type'])

        y_pred_bin_vect = grid_pipline_bin_vec.predict(test_df['text'])
        y_pred_count_vect = grid_pipline_count_vec.predict(test_df['text'])
        y_pred_tfidf_vect = grid_pipline_tfidf_vec.predict(test_df['text'])
        y_true = test_df['type']

        accuracy_bin_vec = accuracy_score(y_true, y_pred_bin_vect)
        accuracy_count_vec = accuracy_score(y_true, y_pred_count_vect)
        accuracy_tfidf_vec = accuracy_score(y_true, y_pred_tfidf_vect)

        current_best_accuracy = 0
        current_best_vectorizer = ''
        if accuracy_bin_vec > current_best_accuracy:
            current_best_accuracy = accuracy_bin_vec
            current_best_vectorizer = 'bin vectorizer'
        elif accuracy_count_vec > current_best_accuracy:
            current_best_accuracy = accuracy_count_vec
            current_best_vectorizer = 'count vectorizer'
        elif accuracy_tfidf_vec > current_best_accuracy:
            current_best_accuracy = accuracy_tfidf_vec
            current_best_vectorizer = 'tfidf vectorizer'

        print(f'Preprocessing methods: {name}')
        print(f'accuracy bin vectorizer: {accuracy_bin_vec}')
        print(f'accuracy count vectorizer: {accuracy_count_vec}')
        print(f'accuracy tfidf vectorizer: {accuracy_tfidf_vec}\n')

        if current_best_accuracy > best_accuracy:
            best_accuracy = current_best_accuracy
            best_method_name = name
            best_vectorizer = current_best_vectorizer

    return {'name': best_method_name, 'vectorizer': best_vectorizer, 'accuracy': best_accuracy}

In [54]:
%%time
best_method = choose_best_preprocessing_method(preprocessing_methods)

print(f'Best preprocessing method: {best_method}')
print(OUTPUT_SEPARATOR)

Preprocessing methods: Just tokenized
accuracy bin vectorizer: 0.8851612903225806
accuracy count vectorizer: 0.8851612903225806
accuracy tfidf vectorizer: 0.8941935483870967

Preprocessing methods: Stemmed
accuracy bin vectorizer: 0.8903225806451613
accuracy count vectorizer: 0.8903225806451613
accuracy tfidf vectorizer: 0.895483870967742

Preprocessing methods: Stemmed (stopword removed)
accuracy bin vectorizer: 0.8761290322580645
accuracy count vectorizer: 0.8761290322580645
accuracy tfidf vectorizer: 0.8877419354838709

Preprocessing methods: Lemmatized
accuracy bin vectorizer: 0.8890322580645161
accuracy count vectorizer: 0.8890322580645161
accuracy tfidf vectorizer: 0.8941935483870967

Preprocessing methods: Lemmatized (stopword removed)
accuracy bin vectorizer: 0.8838709677419355
accuracy count vectorizer: 0.8838709677419355
accuracy tfidf vectorizer: 0.8877419354838709

Preprocessing methods: Spelled
accuracy bin vectorizer: 0.8916129032258064
accuracy count vectorizer: 0.891612

## Get best params

In [123]:
%%time

param_grid = {
    'max_depth': [1, 10, 30, 50, 70, 100, 130, 150, 170, 190, 230],
    'n_estimators': (5, 10, 20, 40, 60, 100, 150, 200, 300),
    'criterion': ('entropy', 'gini'),
    'class_weight': ['balanced', None]
}

methods = preprocessing_methods['Spelled and stemmed']
tokenizer = get_tokenizer(methods)

bin_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x), binary=True)
count_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x))
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))

grid_pipline_model = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    cv=4,
    verbose=2,
    scoring='accuracy',
    n_jobs=-1
)

grid_pipline_bin_vectorizer = Pipeline([
    ('vectorizer', bin_vectorizer),
    ('model', grid_pipline_model),
])

grid_pipline_count_vectorizer = Pipeline([
    ('vectorizer', count_vectorizer),
    ('model', grid_pipline_model),
])

grid_pipline_tfidf_vectorizer = Pipeline([
    ('vectorizer', tfidf_vectorizer),
    ('model', grid_pipline_model),
])

CPU times: user 380 µs, sys: 2 µs, total: 382 µs
Wall time: 389 µs


In [124]:
%%time

grid_pipline_bin_vectorizer.fit(train_df['text'], train_df['type'])
grid_pipline_count_vectorizer.fit(train_df['text'], train_df['type'])
grid_pipline_tfidf_vectorizer.fit(train_df['text'], train_df['type'])

best_params_bin_vectorizer = grid_pipline_bin_vectorizer['model'].best_params_
best_params_count_vectorizer = grid_pipline_count_vectorizer['model'].best_params_
best_params_tfidf_vectorizer = grid_pipline_tfidf_vectorizer['model'].best_params_

y_pred_bin_vectorizer = grid_pipline_bin_vectorizer.predict(test_df['text'])
y_pred_count_vectorizer = grid_pipline_count_vectorizer.predict(test_df['text'])
y_pred_tfidf_vectorizer = grid_pipline_tfidf_vectorizer.predict(test_df['text'])
y_true = test_df['type']

accuracy_bin_vectorizer = accuracy_score(y_true, y_pred_bin_vectorizer)
accuracy_count_vectorizer = accuracy_score(y_true, y_pred_count_vectorizer)
accuracy_tfidf_vectorizer = accuracy_score(y_true, y_pred_tfidf_vectorizer)

print('Bin vectorizer:')
print(f'best params: {best_params_bin_vectorizer}')
print(f'accuracy: {accuracy_bin_vectorizer}')
print('Count vectorizer:')
print(f'best params: {best_params_count_vectorizer}')
print(f'accuracy: {accuracy_count_vectorizer}')
print('TFIDF vectorizer:')
print(f'best params: {best_params_tfidf_vectorizer}')
print(f'accuracy: {accuracy_tfidf_vectorizer}')
print(OUTPUT_SEPARATOR)

Fitting 4 folds for each of 396 candidates, totalling 1584 fits
Fitting 4 folds for each of 396 candidates, totalling 1584 fits
Fitting 4 folds for each of 396 candidates, totalling 1584 fits
Bin vectorizer:
best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 190, 'n_estimators': 200}
accuracy: 0.9045161290322581
Count vectorizer:
best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 190, 'n_estimators': 200}
accuracy: 0.9045161290322581
TFIDF vectorizer:
best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 190, 'n_estimators': 200}
accuracy: 0.9006451612903226
ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡
CPU times: user 7min 26s, sys: 4.42 s, total: 7min 31s
Wall time: 33min 40s


In [127]:
%%time

param_grid = {
    'max_depth': list(range(180, 200)),
    'n_estimators': list(range(190, 210)),
}

bin_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x), binary=True)
count_vectorizer = CountVectorizer(tokenizer=lambda x: tokenizer(x))
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))

grid_pipline_model = GridSearchCV(
    RandomForestClassifier(random_state=0, class_weight='balanced'),
    param_grid=param_grid,
    cv=4,
    verbose=2,
    scoring='accuracy',
    n_jobs=-1
)

grid_pipline_bin_vectorizer = Pipeline([
    ('vectorizer', bin_vectorizer),
    ('model', grid_pipline_model),
])

grid_pipline_count_vectorizer = Pipeline([
    ('vectorizer', count_vectorizer),
    ('model', grid_pipline_model),
])

grid_pipline_tfidf_vectorizer = Pipeline([
    ('vectorizer', tfidf_vectorizer),
    ('model', grid_pipline_model),
])

CPU times: user 225 µs, sys: 10 µs, total: 235 µs
Wall time: 241 µs


In [128]:
%%time

grid_pipline_bin_vectorizer.fit(train_df['text'], train_df['type'])
grid_pipline_count_vectorizer.fit(train_df['text'], train_df['type'])
grid_pipline_tfidf_vectorizer.fit(train_df['text'], train_df['type'])

best_params_bin_vectorizer = grid_pipline_bin_vectorizer['model'].best_params_
best_params_count_vectorizer = grid_pipline_count_vectorizer['model'].best_params_
best_params_tfidf_vectorizer = grid_pipline_tfidf_vectorizer['model'].best_params_

y_pred_bin_vectorizer = grid_pipline_bin_vectorizer.predict(test_df['text'])
y_pred_count_vectorizer = grid_pipline_count_vectorizer.predict(test_df['text'])
y_pred_tfidf_vectorizer = grid_pipline_tfidf_vectorizer.predict(test_df['text'])
y_true = test_df['type']

accuracy_bin_vectorizer = accuracy_score(y_true, y_pred_bin_vectorizer)
accuracy_count_vectorizer = accuracy_score(y_true, y_pred_count_vectorizer)
accuracy_tfidf_vectorizer = accuracy_score(y_true, y_pred_tfidf_vectorizer)

print('Bin vectorizer:')
print(f'best params: {best_params_bin_vectorizer}')
print(f'accuracy: {accuracy_bin_vectorizer}')
print('Count vectorizer:')
print(f'best params: {best_params_count_vectorizer}')
print(f'accuracy: {accuracy_count_vectorizer}')
print('TFIDF vectorizer:')
print(f'best params: {best_params_tfidf_vectorizer}')
print(f'accuracy: {accuracy_tfidf_vectorizer}')
print(OUTPUT_SEPARATOR)

Fitting 4 folds for each of 400 candidates, totalling 1600 fits
Fitting 4 folds for each of 400 candidates, totalling 1600 fits
Fitting 4 folds for each of 400 candidates, totalling 1600 fits
Bin vectorizer:
best params: {'max_depth': 189, 'n_estimators': 195}
accuracy: 0.9006451612903226
Count vectorizer:
best params: {'max_depth': 189, 'n_estimators': 195}
accuracy: 0.9006451612903226
TFIDF vectorizer:
best params: {'max_depth': 189, 'n_estimators': 195}
accuracy: 0.9019354838709678
ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡ ¯\_(ツ)_/¯ (─‿‿─) 〈( ^.^)ノ ʕ•́ᴥ•̀ʔっ♡
CPU times: user 6min 58s, sys: 4.34 s, total: 7min 2s
Wall time: 1h 6min 33s


### Different classification approaches

In [102]:
methods = preprocessing_methods['Spelled and stemmed']
tokenizer = get_tokenizer(methods)

#### Random forest with best params

In [103]:
%%time

vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))
model = RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=189, n_estimators=195)

pipline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model),
])

pipline.fit(train_df['text'], train_df['type'])

CPU times: user 2min 50s, sys: 710 ms, total: 2min 51s
Wall time: 2min 52s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x7fe96c9415e0>)),
                ('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=189,
                                        n_estimators=195, random_state=0))])

In [104]:
%%time

y_true = test_df['type']
y_pred = pipline.predict(test_df['text'])
accuracy = accuracy_score(y_true, y_pred)

print(f'accuracy: {accuracy}')

accuracy: 0.8801169590643275
CPU times: user 27.4 s, sys: 46.5 ms, total: 27.4 s
Wall time: 27.5 s


#### CatBoost

In [105]:
%%time

vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenizer(x))
model = CatBoostClassifier()

pipline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model),
])

pipline.fit(train_df['text'], train_df['type'], model__silent=True)

CPU times: user 4min 30s, sys: 5.69 s, total: 4min 35s
Wall time: 1min 37s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x7fe955b604c0>)),
                ('model',
                 <catboost.core.CatBoostClassifier object at 0x7fe978583d30>)])

In [106]:
%%time

y_true = test_df['type']
y_pred = pipline.predict(test_df['text'])
accuracy = accuracy_score(y_true, y_pred)

print(f'accuracy: {accuracy}')

accuracy: 0.8611111111111112
CPU times: user 27.1 s, sys: 43.6 ms, total: 27.1 s
Wall time: 27.1 s


In [None]:
xtr, ytr, xte, yte = train_test_split(w2c)
model = RandomForestClassifier(**best_params)
model.fit(xtr, ytr)

print("accuracy:", accuracy_score(model.predict(xte), yte))

In [None]:
model = CatBoostClassifier()
model.fit(xtr, ytr, silent=True)

print("accuracy:", accuracy_score(model.predict(xte), yte))