# Intro

We will try to fit a linear model like SVM, Naive Bayes or XG Boost after using Word2Vec to transform text to numbers

## Embeddings
Glove pre-trained embeddings were used. Downloaded from https://nlp.stanford.edu/projects/glove/ <br>
Other Glove pre-trained embeddings can be found at: https://github.com/RaRe-Technologies/gensim-data

# Imports

In [2]:
import os
import random

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

# Read Data

In [44]:
train_df = pd.read_csv('../data/toxic.csv', index_col=0)
train_df.head()

Unnamed: 0,text,score
0,Explanation\nWhy the edits made under my usern...,0.0
1,D'aww! He matches this background colour I'm s...,0.0
2,"Hey man, I'm really not trying to edit war. It...",0.0
3,"""\nMore\nI can't make any real suggestions on ...",0.0
4,"You, sir, are my hero. Any chance you remember...",0.0


# Load Embeddings

In [8]:
embedding_file_path = '../embeddings/glove.6B/glove.6B.300d.txt'

In [9]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        words_to_vec = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            words_to_vec[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, words_to_vec

In [10]:
words, glove_words_to_vec = read_glove_vecs(embedding_file_path)

In [25]:
words = list(words)

In [26]:
words[:20]

['phyllosilicates',
 'eshmun',
 'baedeker',
 'www.cdc.gov',
 'bergessio',
 'mouhamadou',
 'zinga',
 'cannella',
 '25.78',
 'crookedly',
 'ambito',
 'iccf',
 'remirez',
 'lyse',
 'liliom',
 'sebes',
 '2,953',
 '49.9-percent',
 'jean-antoine',
 'gerrards']

# Try Gensim for Loading Embeddings

In [11]:
import gensim.downloader as api

# info = api.info()  # show info about available models/datasets
gensim_model = api.load('glove-wiki-gigaword-50')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [22]:
model.most_similar('prime')

[('minister', 0.9074519872665405),
 ('cabinet', 0.8513240218162537),
 ('premier', 0.826135516166687),
 ('outgoing', 0.7996107935905457),
 ('ministers', 0.755952775478363),
 ('counterpart', 0.7476111650466919),
 ('leader', 0.7426784038543701),
 ('party', 0.736582338809967),
 ('parliament', 0.7328572273254395),
 ('designate', 0.7246195673942566)]

# Create Feature Vectors for Sentences

In [61]:
sample_text = train_df[train_df['score'] > 0]['text'].iloc[0]
sample_text = sample_text.lower()

In [62]:
sample_text.split()

['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work']

In [63]:
init_feature_vec = np.zeros((300), dtype='float32')
init_feature_vec

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [64]:
glove_words_to_vec[sample_text.split()[1]]

array([ 7.2964e-04,  1.0987e-01,  6.3975e-02,  2.6729e-02,  1.0066e-01,
        4.5820e-02, -1.7865e-01, -1.4656e-01,  2.2387e-01, -1.5851e+00,
       -7.5385e-02,  7.7855e-02,  7.1722e-02, -2.0296e-02, -2.1454e-01,
        2.9640e-01,  4.2858e-02, -2.4337e-01,  2.8042e-01,  1.7027e-01,
       -1.5320e-01, -9.7088e-02,  3.7339e-01, -4.5384e-02, -6.4616e-02,
       -1.6910e-01,  5.0648e-03, -1.9953e-01, -3.3578e-01,  8.2778e-02,
        1.7225e-01, -4.7207e-02,  3.6781e-01,  1.2379e-01, -1.3730e+00,
       -1.8022e-01, -3.8348e-01,  3.4067e-01, -2.3678e-01, -8.0015e-02,
        2.6686e-01, -4.7917e-02, -3.1266e-01,  2.3985e-01,  1.2477e-01,
        1.9809e-01,  1.9034e-01,  4.7338e-01, -1.7891e-01,  3.0567e-01,
       -2.3283e-01, -5.3673e-02, -1.7378e-01,  7.6577e-03, -2.0776e-01,
        1.9245e-01,  2.5776e-01, -6.9133e-02, -2.2684e-02, -3.5310e-02,
        9.9464e-02, -1.5368e-01,  5.3919e-02, -2.1117e-01,  4.7328e-02,
       -7.5565e-01,  2.5320e-01, -1.0467e-02,  2.9713e-01, -3.51

In [66]:
feature_vec = np.add(init_feature_vec, glove_words_to_vec[sample_text.split()[1]])
feature_vec

array([ 7.2964e-04,  1.0987e-01,  6.3975e-02,  2.6729e-02,  1.0066e-01,
        4.5820e-02, -1.7865e-01, -1.4656e-01,  2.2387e-01, -1.5851e+00,
       -7.5385e-02,  7.7855e-02,  7.1722e-02, -2.0296e-02, -2.1454e-01,
        2.9640e-01,  4.2858e-02, -2.4337e-01,  2.8042e-01,  1.7027e-01,
       -1.5320e-01, -9.7088e-02,  3.7339e-01, -4.5384e-02, -6.4616e-02,
       -1.6910e-01,  5.0648e-03, -1.9953e-01, -3.3578e-01,  8.2778e-02,
        1.7225e-01, -4.7207e-02,  3.6781e-01,  1.2379e-01, -1.3730e+00,
       -1.8022e-01, -3.8348e-01,  3.4067e-01, -2.3678e-01, -8.0015e-02,
        2.6686e-01, -4.7917e-02, -3.1266e-01,  2.3985e-01,  1.2477e-01,
        1.9809e-01,  1.9034e-01,  4.7338e-01, -1.7891e-01,  3.0567e-01,
       -2.3283e-01, -5.3673e-02, -1.7378e-01,  7.6577e-03, -2.0776e-01,
        1.9245e-01,  2.5776e-01, -6.9133e-02, -2.2684e-02, -3.5310e-02,
        9.9464e-02, -1.5368e-01,  5.3919e-02, -2.1117e-01,  4.7328e-02,
       -7.5565e-01,  2.5320e-01, -1.0467e-02,  2.9713e-01, -3.51

In [67]:
feature_vec.shape

(300,)

In [87]:
def get_feature_vectors(sentence):
    words = sentence.split()
    feature_vec = np.zeros((300,),dtype="float32")
    i=0
    for word in words:
        try:
            feature_vec = np.add(feature_vec, glove_words_to_vec.get(word))
            print(f'encoded word: {word}; i: {i}')
        except:
            print('in except!!!')
            i = i + 1
    if len(words) > 0:
        print(f'len(words): {len(words)}; i: {i}')
        feature_vec = np.divide(feature_vec, len(words)- i)
    return feature_vec

In [91]:
f_vec = get_feature_vectors(sample_text)

in except!!!
encoded word: before; i: 1
encoded word: you; i: 1
encoded word: piss; i: 1
encoded word: around; i: 1
encoded word: on; i: 1
encoded word: my; i: 1
encoded word: work; i: 1
len(words): 8; i: 1


In [92]:
f_vec

array([-5.09341943e-02,  3.51459429e-02, -1.47929429e-01,  5.42057143e-03,
       -7.35508571e-02,  7.58444286e-02, -3.28551429e-01, -6.68957143e-02,
        7.41194286e-02, -1.53899857e+00,  7.61142857e-03,  9.78478571e-02,
        8.01286000e-02,  9.85192857e-02,  4.06360000e-02,  2.51745714e-01,
        2.57877143e-02,  3.90495714e-02,  4.09294286e-02,  2.16860000e-01,
        6.71151429e-02,  3.31014571e-01,  1.57311286e-01, -1.62820571e-01,
       -1.30622286e-01, -9.09980286e-02,  1.08532571e-02,  1.59544286e-02,
       -1.78637857e-01, -4.77992857e-02,  8.62398571e-02,  3.31917571e-01,
       -1.55592886e-01,  3.06028571e-02, -8.83598571e-01,  1.99185000e-01,
       -1.84443714e-01,  7.53415714e-02, -3.13841143e-01, -4.21823000e-02,
        6.13601429e-02, -3.41245714e-01, -1.34698571e-01, -9.15778571e-02,
        9.10742857e-03,  1.75936857e-01,  4.95288571e-01,  2.19244286e-01,
       -1.76753714e-01,  1.02673429e-01,  1.11892571e-01, -1.59198000e-01,
       -4.74037143e-02, -

In [90]:
f_vec.shape

(300,)