
## Prerequisites

gensim==3.6.0

In [1]:
import os

from ast import literal_eval

from string import punctuation

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec, KeyedVectors
from gensim.models import FastText

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split


lemmatizer = WordNetLemmatizer() 
stop_words = set(stopwords.words('english'))

In [2]:
import numexpr as ne

ne.set_num_threads(ne.detect_number_of_cores())

4

In [3]:
df = pd.read_csv("../jigsaw-toxic-comment-classification-challenge/train.csv")

In [4]:
def preprocess_text(tokenizer, lemmatizer, stop_words, punctuation, text): 
    tokens = tokenizer(text.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return [token for token in lemmas if token not in stop_words and token not in punctuation]

In [5]:
bool_load = True

if not bool_load:
    df['cleaned'] = df.comment_text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))

In [6]:
bool_save = False

if bool_save:
    df.to_csv("../jigsaw-toxic-comment-classification-challenge/train.csv")

In [7]:
df_sample = df.sample(100000)

In [8]:
df_sample.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned
139960,139960,ed0b00459b116c89,"""\nMaybe before you start making changes of yo...",0,0,0,0,0,0,"['``', 'maybe', 'start', 'making', 'change', '..."
112026,112026,5766d8a3f2b5bb97,:Rob rachlin\nPlease do not make personal atta...,0,0,0,0,0,0,"['rob', 'rachlin', 'please', 'make', 'personal..."
59726,59726,9feb4d19f9e237fe,Help desk is closed for me. Who made this?,0,0,0,0,0,0,"['help', 'desk', 'closed', 'made']"
83678,83678,dff003d90c36b7e6,"""\nAnd your sources in Serbian which you use t...",0,0,0,0,0,0,"['``', 'source', 'serbian', 'use', 'article', ..."
130464,130464,b9eb04bfc01ac175,"A victory means the enemy was destroyed, in th...",0,0,0,0,0,0,"['victory', 'mean', 'enemy', 'wa', 'destroyed'..."


### Train the model from scratch

Train our first model based on the vocabulary from df_sample: 

In [9]:
# With initialization model trained for 5 epochs 

df_sample_cleaned_list = [literal_eval(s) for s in df_sample.cleaned.tolist()]

model = Word2Vec(sentences=df_sample_cleaned_list, 
         size=100,      # embedding vector size
         min_count=5,   # consider words that occured at least 5 times
         window=5)

In [10]:
# Continue training the model 

model.train(sentences=df_sample_cleaned_list, 
            total_examples=model.corpus_count,
            epochs=30
           )

(99866609, 118158270)

In [11]:
#model.wv.vocab # to look at vocabulary

In [12]:
model.wv.most_similar('people')

[('others', 0.6589839458465576),
 ('thing', 0.5865795016288757),
 ('person', 0.5735608339309692),
 ('editor', 0.5502363443374634),
 ('admins', 0.5481014847755432),
 ('everyone', 0.5176295042037964),
 ('really', 0.515514612197876),
 ('someone', 0.5116820931434631),
 ('guy', 0.5001378655433655),
 ('way', 0.4992937445640564)]

### The next approach is to try to use the already pretrained model, which can be downloaded from here:

https://github.com/RaRe-Technologies/gensim-data

model:   
GoogleNews-vectors-negative300.bin

In [13]:
#os.getcwd()

In [14]:
model = KeyedVectors.load_word2vec_format(
    os.getcwd() + os.sep + "GoogleNews-vectors-negative300.bin", binary=True
)

In [15]:
# You can try to use GloVe model too and experiment with it: <- later
# import gensim.downloader as api
# model = api.load('glove-wiki-gigaword-100')

## Words distance 

# 1 - Cosine similarity

To measure how similar two words are, we need a way to measure the degree of similarity between two embedding vectors for the two words. Given two vectors $u$ and $v$, cosine similarity is defined as follows: 

$$\text{CosineSimilarity(u, v)} = \frac {u . v} {||u||_2 ||v||_2} = cos(\theta) \tag{1}$$

where $u.v$ is the dot product (or inner product) of two vectors, $||u||_2$ is the norm (or length) of the vector $u$, and $\theta$ is the angle between $u$ and $v$. This similarity depends on the angle between $u$ and $v$. If $u$ and $v$ are very similar, their cosine similarity will be close to 1; if they are dissimilar, the cosine similarity will take a smaller value. 

<img src="cosine_sim.png" style="width:800px;height:250px;">
<caption><center> **Figure 1**: The cosine of the angle between two vectors is a measure of how similar they are</center></caption>

**Exercise**: Implement the function `cosine_similarity()` to evaluate similarity between word vectors.

**Reminder**: The norm of $u$ is defined as $ ||u||_2 = \sqrt{\sum_{i=1}^{n} u_i^2}$

In [16]:
def cosine_similarity(w1, w2):
    """
    Cosine similarity between w1 and w2
    
    Arguments:
        w1 : word vector        
        w2 : word vector 
    Returns:
        cosine_similarity 
    """
    if (not np.any(w1) or not np.any(w2)): # check input is not zero-vector
        return 0
    
    # Dot product between w1 and w2
    dot = np.dot(w1, w2)
    # L2 norm of w1
    norm_u = np.linalg.norm(w1) 
    # L2 norm of w2 
    norm_v = np.linalg.norm(w2) 
    # Cosine similarity 
    cosine_similarity = dot / (norm_u * norm_v)
    
    return cosine_similarity

In [17]:
father = model.get_vector("father")
mother = model.get_vector("mother")

ball = model.get_vector("ball")
crocodile = model.get_vector("crocodile")

france = model.get_vector("france")
paris = model.get_vector("paris")
italy = model.get_vector("italy")
rome = model.get_vector("rome")

kiev = model.get_vector("kiev")
ukraine = model.get_vector("ukraine")

In [18]:
fast_print = lambda u, v, tag1, tag2: print(
    "cosine_similarity({t1}, {t2}) = ".format(t1 = tag1, t2 = tag2), cosine_similarity(u, v)
)

fast_print(father, mother, "father", "mother")
fast_print(ball, crocodile, "ball", "crocodile")
fast_print(france - paris, rome - italy, "france - paris", "rome - italy")
fast_print(kiev, ukraine, "kiev", "ukraine")

cosine_similarity(father, mother) =  0.79014826
cosine_similarity(ball, crocodile) =  0.10283584
cosine_similarity(france - paris, rome - italy) =  -0.1988747
cosine_similarity(kiev, ukraine) =  0.3738725


**Approximate expected output**:

<table>
    <tr>
        <td>
            **cosine_similarity(father, mother)** =
        </td>
        <td>
         0.79014826
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(ball, crocodile)** =
        </td>
        <td>
         0.10283585
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(france - paris, rome - italy)** =
        </td>
        <td>
         -0.421037
        </td>
    </tr>
</table>

## 2 - Word analogy task

In the word analogy task, we complete the sentence <font color='brown'>"*a* is to *b* as *c* is to **____**"</font>. An example is <font color='brown'> '*man* is to *woman* as *king* is to *queen*' </font>. In detail, we are trying to find a word *d*, such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity. 

**Exercise**: Complete the code below to be able to perform word analogies!

***Note***: here you will need to complete a function in the sections, which are marked as:

```
# ----- Start ----- #
Your code should be written in-between the lines
# ------ End ------ #
```


In [19]:
def find_word_analogy(word_1, word_2, word_3, model):
    """
    Finds the word to complete analogy (see explanation above): a is to b as c is to ____. 
    
    Arguments:
    word_1 -- a word, string
    word_2 -- a word, string
    word_3 -- a word, string
    model -- word embeddings model 
    
    Returns:
    best_word --  the word such that v_1 - v_2 is close to v_best_word - v_3, as measured by cosine similarity
    """
    # convert words to lower case
    word_1, word_2, word_3 = word_1.lower(), word_2.lower(), word_3.lower()
    
    # ----- Start ----- #
    # Get the word embeddings v_a, v_b and v_c (≈1-3 lines)
    fast_get = lambda word: model.get_vector(word)
    e_1, e_2, e_3 = tuple(map(fast_get, [word_1, word_2, word_3]))
    # ------ End ------ #
    
    words = list(model.vocab.keys())
    max_cosine_sim = -100              # Initialize max_cosine_sim to a large negative number
    best_word = None                   # Initialize best_word with None

    # Loop over the whole word vector set
    for w in words:        
        e_j = fast_get(w)
        # to avoid best_word being one of the input words, skip them and continue iteration.
        if w in [word_1, word_2, word_3]:
            continue
        
        # ----- Start ----- #
        # Compute cosine similarity between the vector (e_2 - e_1) and the vector ((w's vector) - e_3)
        cosine_sim = cosine_similarity(e_2 - e_1, e_j - e_3)
        
        # If the cosine_sim is more than the max_cosine_sim seen so far,
        # do not forget to set new max_cosine_sim to the current value and best_word as well
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
        # ------ End ------ #
        
    return best_word

In [20]:
triads_to_try = [
    ('man', 'woman', 'king'), 
    ('bad', 'good', 'sad'), 
    ('man', 'woman', 'boy'), 
    ('small', 'smaller', 'large')
]

for triad in triads_to_try:
    print('{} -> {} :: {} -> {}'.format(*triad, find_word_analogy(*triad, model)))

man -> woman :: king -> queen
bad -> good :: sad -> wonderful
man -> woman :: boy -> girl
small -> smaller :: large -> larger


**Expected Output**:

<table>
    <tr>
        <td>
            **man -> woman** ::
        </td>
        <td>
         king -> queen
        </td>
    </tr>
        <tr>
        <td>
            **bad -> good** ::
        </td>
        <td>
         sad -> wonderful
        </td>
    </tr>
        <tr>
        <td>
            **man -> woman ** ::
        </td>
        <td>
         boy -> girl
        </td>
    </tr>
        <tr>
        <td>
            **small -> smaller ** ::
        </td>
        <td>
         large -> larger
        </td>
    </tr>
</table>

#### The next part of the task is to:  

1. Train your own W2V model using the proposed method above. Use all of the tokens created after your preprocessing pipeline in the previous tasks. (deleting stop_words, punctuation, lowercasing, etc - play as you want).  
2. Use obtained vectors to obtain text vectors using such pipeline: 
  1. For each word in a preprocessed text, get a word vector from the W2V model. 
  2. Add them together to obtain vectors for texts (sum them together, or get mean vector) 
3. Use obtained text vectors as a text representation to perform a text classification task.  
   Proposed - use binary classification (for example: select only 'obscene' text and clean and try to distinguish them one from another)
4. Calculate the metrics - TP, FP, FN, TN, precision, recall, F1 score, F2 score, accurary. 


In [21]:
from gensim.models.callbacks import CallbackAny2Vec


class callback_custom(CallbackAny2Vec):
    def __init__(self):
         self.epoch = 0

    def on_epoch_end(self, model):
        print("Iteration {:3}".format(self.epoch+1))
        self.epoch += 1

In [22]:
# init w2v model
n_dimensions = 300

model_w2v = Word2Vec(sentences=df_sample_cleaned_list, 
                     size=n_dimensions, min_count=5, window=5,
                     callbacks=[callback_custom()]
                    )

Iteration   1
Iteration   2
Iteration   3
Iteration   4
Iteration   5


In [23]:
# model training
number_of_iterations = 50

model_w2v.train(sentences=df_sample_cleaned_list, 
            total_examples=model_w2v.corpus_count,
            epochs=number_of_iterations
           )

Iteration   6
Iteration   7
Iteration   8
Iteration   9
Iteration  10
Iteration  11
Iteration  12
Iteration  13
Iteration  14
Iteration  15
Iteration  16
Iteration  17
Iteration  18
Iteration  19
Iteration  20
Iteration  21
Iteration  22
Iteration  23
Iteration  24
Iteration  25
Iteration  26
Iteration  27
Iteration  28
Iteration  29
Iteration  30
Iteration  31
Iteration  32
Iteration  33
Iteration  34
Iteration  35
Iteration  36
Iteration  37
Iteration  38
Iteration  39
Iteration  40
Iteration  41
Iteration  42
Iteration  43
Iteration  44
Iteration  45
Iteration  46
Iteration  47
Iteration  48
Iteration  49
Iteration  50
Iteration  51
Iteration  52
Iteration  53
Iteration  54
Iteration  55


(166440075, 196930450)

In [24]:
#model_w2v.wv.vocab

In [25]:
model_w2v.wv.most_similar('people')

[('others', 0.4841853976249695),
 ('thing', 0.4806921184062958),
 ("n't", 0.46461012959480286),
 ('way', 0.46231457591056824),
 ('editor', 0.4622182548046112),
 ('person', 0.45410239696502686),
 ('admins', 0.4445332884788513),
 ('really', 0.4407009482383728),
 ('would', 0.4344203472137451),
 ("'re", 0.4250907301902771)]

In [26]:
model_w2v.wv.most_similar('one')

[('two', 0.5056474208831787),
 ("n't", 0.47986412048339844),
 ('way', 0.47343599796295166),
 ("'s", 0.4682750105857849),
 ('article', 0.4649146795272827),
 ('even', 0.45324623584747314),
 ('many', 0.4527236521244049),
 ('also', 0.4462509751319885),
 ('thing', 0.4392848610877991),
 ('think', 0.43367040157318115)]

In [27]:
bool_save_model = False

if bool_save_model:
    model_w2v.wv.save_word2vec_format('w2v_df_t2_clnd_sample.bin', binary = True)

In [28]:
model_w2v_vectors = model_w2v.wv # getting keyed vectors from trained model

In [60]:
# building text vectors
def form_text_vector(words_from_text, w2v_model_keyed_vectors, num_dim, tfidf_matr = None, idx_doc = None, vocab = None):
    text_vectorized = np.zeros(num_dim)
    for k in range(len(words_from_text)):
        try:
            q = tfidf_matr[
                idx_doc, vocab.index(words_from_text[k])
            ] if tfidf_matr is not None else 1
            v = w2v_model_keyed_vectors[words_from_text[k]]
            #v = w2v_model_keyed_vectors.get_vector(words_from_text[k])
            text_vectorized = ne.evaluate('text_vectorized + q * v') 
        except (KeyError, ValueError):
            continue
    return text_vectorized


def form_corpus_matrix(corpus, w2v_model_keyed_vectors, num_dim, weightened = False):
    '''
    corpus -> list of lists of strings
    w2v_model_keyed_vectors -> word2vec.wv
    num_dim -> dimension of word2vec vectors
    weightened -> use tf-idf weighting on vector components or not
    '''
    fast_vocab = lambda word_model: list(word_model.wv.vocab.keys())
    fast_concat = lambda text_data: [' '.join(e) for e in text_data]
    corpus_len = len(corpus)
    corpus_vectorized = np.empty((corpus_len, num_dim))
    tfidf = None if not weightened else TfidfVectorizer(
        vocabulary = fast_vocab(w2v_model_keyed_vectors)
    ).fit_transform(fast_concat(corpus))
    for j in range(corpus_len):
        corpus_vectorized[j] = form_text_vector(
            corpus[j], w2v_model_keyed_vectors, num_dim, tfidf, j, fast_vocab(w2v_model_keyed_vectors)
        )
    return corpus_vectorized

In [31]:
text_categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'cleaned']

temp = df[[text_categories[4], text_categories[-1]]]

temp_n = temp[~df[text_categories[:-1]].any(axis = 'columns')]
temp_i = temp[df.insult != 0]

insulting_and_neutral = temp_i.append(temp_n).reset_index(drop = True)
insulting_and_neutral.columns = ['label', 'texts']

del temp, temp_n, temp_i

print(
    insulting_and_neutral.head(),
    insulting_and_neutral.tail(),
    sep = '\n\n'
)

   label                                              texts
0      1           ['cocksucker', 'piss', 'around', 'work']
1      1  ['gay', 'antisemmitian', 'archangel', 'white',...
2      1                ['fuck', 'filthy', 'mother', 'dry']
3      1  ['stupid', 'peace', 'shit', 'stop', 'deleting'...
4      1  ['=tony', 'sidaway', 'obviously', 'fistfuckee'...

        label                                              texts
151218      0  ['``', 'second', 'time', 'asking', 'view', 'co...
151219      0  ['ashamed', 'horrible', 'thing', 'put', 'talk'...
151220      0  ['spitzer', 'umm', 'actual', 'article', 'prost...
151221      0  ['look', 'like', 'wa', 'actually', 'put', 'spe...
151222      0  ['``', '...', 'really', "n't", 'think', 'under...


In [32]:
P = 0.25

X_train_t, X_test_t, Y_train, Y_test = train_test_split(
    insulting_and_neutral['texts'], insulting_and_neutral['label'],
    test_size = P,
    random_state = 1
)

X_train_t = [literal_eval(s) for s in X_train_t.reset_index(drop = True)]
X_test_t = [literal_eval(s) for s in X_test_t.reset_index(drop = True)]

Y_train = Y_train.reset_index(drop = True)
Y_test = Y_test.reset_index(drop = True)

In [33]:
#X_train_t[:4]

In [34]:
#X_test_t[:4]

In [49]:
def basic_report(y_test, y_prediction):
    confusion_matr = confusion_matrix(y_test, y_prediction)
    print("CONFUSION MATRIX:\n{matr}".format(matr=confusion_matr))
    accuracy_of_model = accuracy_score(y_test, y_prediction)
    print("ACCURACY:\n{acc}".format(acc = accuracy_of_model))
    sklearn_report = classification_report(y_test, y_prediction)
    print("TABLE:\n{tab}".format(tab=sklearn_report))
    return (confusion_matr, accuracy_of_model, sklearn_report)

# too slow, rewrite later (1)
def quick_init_and_train_word_cls_model_no_args(
    x_train_t, x_test_t, y_train, y_test, word_model, n_dim, cls_model, weightened = False):
    print('DATA PREPAIRING START')
    x_train = form_corpus_matrix(
        x_train_t, w2v_model_keyed_vectors = word_model, num_dim = n_dim, weightened = weightened
    ) # form_corpus_matrix(x_train_t, word_model, n_dim, weightened)
    x_test = form_corpus_matrix(
        x_test_t, w2v_model_keyed_vectors = word_model, num_dim = n_dim, weightened = weightened
    )# form_corpus_matrix(x_test_t, word_model, n_dim, weightened)
    print('MODEL TRAINING START')
    cls_m = cls_model()
    cls_m.fit(x_train, y_train)
    
    y_prediction = cls_m.predict(x_test)
    results = basic_report(y_test, y_prediction)
    
    return (cls_m, results)

In [50]:
# include classifier: RF
random_forest_cls_dat_1 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model_w2v_vectors, n_dimensions, RandomForestClassifier, False
)

DATA PREPAIRING START




MODEL TRAINING START
CONFUSION MATRIX:
[[35772    43]
 [ 1046   945]]
ACCURACY:
0.971195048405015
TABLE:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     35815
           1       0.96      0.47      0.63      1991

    accuracy                           0.97     37806
   macro avg       0.96      0.74      0.81     37806
weighted avg       0.97      0.97      0.97     37806



In [51]:
# include classifier: LR
logit_cls_dat_1 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model_w2v_vectors, n_dimensions, LogisticRegression, False
)

DATA PREPAIRING START




MODEL TRAINING START
CONFUSION MATRIX:
[[35645   170]
 [  631  1360]]
ACCURACY:
0.9788128868433582
TABLE:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     35815
           1       0.89      0.68      0.77      1991

    accuracy                           0.98     37806
   macro avg       0.94      0.84      0.88     37806
weighted avg       0.98      0.98      0.98     37806



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### well...

#### The second part of the task is: 

1. While performing a step 2 for text vectorization, for each word add its vector with tf-idf weight -> weighted average. 
2. Perform a same text classification task as it was required above. 
3. Calculate the metrics, compare with a vectorization approach without weightning. 

In [52]:
# include classifier: RF
random_forest_cls_dat_2 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model_w2v_vectors, n_dimensions, RandomForestClassifier, True
) # (1)



DATA PREPAIRING START




MODEL TRAINING START
CONFUSION MATRIX:
[[35777    38]
 [ 1041   950]]
ACCURACY:
0.9714595566841242
TABLE:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     35815
           1       0.96      0.48      0.64      1991

    accuracy                           0.97     37806
   macro avg       0.97      0.74      0.81     37806
weighted avg       0.97      0.97      0.97     37806



#### The third part of the task is: 

1. Use a pre-trained W2V model for obtaining a word vectors for each of the tokens in your dataset, create text vectors WITHOUT weightning. 
2. Train text classification model.
3. Calculate the metrics.

In [53]:
# include classifier: RF
random_forest_cls_dat_3 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model, n_dimensions, RandomForestClassifier
) 

DATA PREPAIRING START




KeyboardInterrupt: 

#### The fourth part of the task is: 

1. Use a pre-trained W2V model for obtaining a word vectors for each of the tokens in your dataset, create text vectors WITH tf-idf weightning. 
2. Train a text classification model. 
3. Calculate the metrics. 

In [None]:
# include classifier: RF
random_forest_cls_dat_4 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model, n_dimensions, RandomForestClassifier, True
)

### Visualizations part 

Use dimentionality reduction methods such as t-SNE or PCA to make your 300 dim vectors available for 2D plotting. 

Select top (10-20) words for each cathegory BY TF-IDF SCORE, not counts!!! 

Plot on the ONE plot all of this words but colors must be different for top-words for obscene cathegory, clean, toxic, etc... 

See, if words from one cathegory are closer to each other than to others. 
Or you observe ~2 clusters: all of the toxic words, clean words.  
Explain what you see and why. 


In [None]:
# as always, using PCA
model_pca = PCA(n_components = 5)

### Additional part: 

1. Find a pre-trained FastText vectors, understand it's difference from W2V vectors. 
2. Vectorize all of your texts using FT model, perform a text classification, calculate the metrics, compare with W2V approach. 

Or/And you can:

1. Train your own FT model and make the same. 
2. Compare it with previous approaches.

In [54]:
model_ft = FastText(sentences=df_sample_cleaned_list, 
                    size=300,
                    min_count=5,
                    window=5)

In [55]:
# model training
number_of_iterations = 50

model_ft.train(sentences=df_sample_cleaned_list, 
            total_examples=model_ft.corpus_count,
            epochs=number_of_iterations
           )

In [62]:
random_forest_cls_dat_5 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model_ft, n_dimensions, RandomForestClassifier, False
)

DATA PREPAIRING START


  if __name__ == '__main__':


MODEL TRAINING START
CONFUSION MATRIX:
[[35764    51]
 [ 1112   879]]
ACCURACY:
0.9692376871396075
TABLE:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     35815
           1       0.95      0.44      0.60      1991

    accuracy                           0.97     37806
   macro avg       0.96      0.72      0.79     37806
weighted avg       0.97      0.97      0.96     37806



In [63]:
random_forest_cls_dat_6 = quick_init_and_train_word_cls_model_no_args(
    X_train_t, X_test_t, Y_train, Y_test, model_ft, n_dimensions, RandomForestClassifier, True
)

DATA PREPAIRING START


  if __name__ == '__main__':
  if __name__ == '__main__':


MODEL TRAINING START
CONFUSION MATRIX:
[[35769    46]
 [ 1060   931]]
ACCURACY:
0.9707453843305296
TABLE:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     35815
           1       0.95      0.47      0.63      1991

    accuracy                           0.97     37806
   macro avg       0.96      0.73      0.81     37806
weighted avg       0.97      0.97      0.97     37806



In [125]:
from joblib import dump

# saving models into c binary files, measures into dataframe
def save_model_and_get_fancy_stats(cls_dat, cls_name):
    dump(cls_dat[0], os.getcwd() + os.sep + cls_name + '.joblib')
    cls_measures = list(map(float, cls_dat[1][2][70:100].split()))
    # horrifying :D 
    return pd.Series({
        'tp': cls_dat[1][0][0][0],
        'tn': cls_dat[1][0][1][1],
        'fn': cls_dat[1][0][1][0],
        'fp': cls_dat[1][0][0][1],
        'pre': cls_measures[0],
        'rec': cls_measures[1],
        'f1': cls_measures[2],
        'acc': cls_dat[1][1]
    }, name = cls_name)

In [126]:
df_stats = pd.DataFrame(columns=['tp', 'tn', 'fp', 'fn', 'pre', 'rec', 'f1', 'acc'])
#df_stats.set_index('model', inplace=True)

In [128]:
df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_1, 'rf_w2v_trained'))
df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_2, 'rf_w2v_trained_tfidf'))
# df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_3, 'rf_w2v_pretrained', df_stats))
# df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_4, 'rf_w2v_pretrained_tfidf', df_stats))
df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_5, 'rf_ft_trained'))
df_stats = df_stats.append(save_model_and_get_fancy_stats(random_forest_cls_dat_6, 'rf_ft_trained_tfidf'))

In [129]:
df_stats

Unnamed: 0,tp,tn,fp,fn,pre,rec,f1,acc
rf_w2v_trained,35772.0,945.0,43.0,1046.0,0.97,1.0,0.99,0.971195
rf_w2v_trained_tfidf,35777.0,950.0,38.0,1041.0,0.97,1.0,0.99,0.97146
rf_ft_trained,35764.0,879.0,51.0,1112.0,0.97,1.0,0.98,0.969238
rf_ft_trained_tfidf,35769.0,931.0,46.0,1060.0,0.97,1.0,0.98,0.970745


In [130]:
df_stats.to_csv(os.getcwd() + os.sep + 'models_t4.csv')

### Conclusions: 

Please, provide a clear table or dataframe with all of the metrics for all of the trained/used models available.   

Compare them to each other.   

Make conclusions which one from your models worked better for this particular task.   
BE CAREFUL: Having a better model performance on this particular task does not matter that this model is better than others in GENERAL. You need to make your own conclusions about this particular model applied to this particular task. Please, think and understand WHY.   
Write your thoughts down below: 



In [None]:
### Your conclusions here.

In [None]:
### Your thoughts about the last question here. 