# Word Embeddings

## Importing libraries 

In [35]:
import itertools
import random
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from gensim.models import Word2Vec                                   #pip install gensim
from gensim.models.word2vec import Text8Corpus
import gensim.downloader as api
from glove import Corpus, Glove                                      #pip install glove-py
from joblib import dump, load
from gensim import utils

In [50]:
# Locations of the pre-processed training files (positive and negative).
PATH_TRAIN_POS = '../Resources/train_pos_processed.txt'
PATH_TRAIN_NEG = '../Resources/train_neg_processed.txt'

# Load each file as a list of lines, without the line terminators.
with open(PATH_TRAIN_POS) as f:
    train_pos = f.read().splitlines()

with open(PATH_TRAIN_NEG) as f:
    train_neg = f.read().splitlines()

# Full training set: positive lines followed by negative lines.
train_set = [*train_pos, *train_neg]

## Count Vectorizer 

In [25]:
def we_count_vectorize(train_set):
    """Basic and naive method for word embedding: raw token counts.

    Fits a lower-casing ``CountVectorizer`` on ``train_set`` and returns a
    ``(vocabulary, counts)`` pair, where ``vocabulary`` is the fitted
    vectorizer's ``vocabulary_`` mapping and ``counts`` is the result of
    ``fit_transform``.
    """
    counter = CountVectorizer(lowercase=True)
    doc_term_counts = counter.fit_transform(train_set)
    return counter.vocabulary_, doc_term_counts

In [26]:
# Sparse matrix: per the original note, the alternative (presumably a dense
# matrix) would require way too much space (around 38GB).
voc, text_counts = we_count_vectorize(train_set)

## Tf-idf

In [74]:
# Fit a TF-IDF model on the whole training set.
vectorizer = TfidfVectorizer(use_idf=True)
tfIdf = vectorizer.fit_transform(train_set)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the TF-IDF weights of the first document, one row per vocabulary term,
# sorted from the highest to the lowest weight.
df = pd.DataFrame(tfIdf[0].T.todense(), index=feature_names, columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print(df.tail(35))

                    TF-IDF
jaybe                  0.0
jaycee                 0.0
jayesslee              0.0
jawa                   0.0
jayhawk                0.0
jaylin                 0.0
jaylor                 0.0
jaypark                0.0
jazbat                 0.0
jbl                    0.0
jbos                   0.0
jcksnxkwnxlwncowkd     0.0
jawaad                 0.0
javosync               0.0
jasjam                 0.0
jauh                   0.0
jaskknc                0.0
jasminator             0.0
jassell                0.0
jata                   0.0
jati                   0.0
jatoba                 0.0
jatt                   0.0
jatuh                  0.0
jaunty                 0.0
javon                  0.0
javafx                 0.0
javanese               0.0
javascr                0.0
javascript             0.0
javelin                0.0
javi                   0.0
javier                 0.0
javoedge               0.0
zzcase                 0.0


In [75]:
def apply_tf_idf(X):
    """Fit a TF-IDF transformer on ``X`` and return the fitted transformer.

    The transformer is configured with IDF weighting, smoothed IDF and
    sublinear TF scaling. ``X`` is handed directly to
    ``TfidfTransformer.fit``.
    """
    tfidf_options = dict(smooth_idf=True, sublinear_tf=True, use_idf=True)
    fitted = TfidfTransformer(**tfidf_options)
    fitted.fit(X)
    return fitted

In [76]:
# TfidfTransformer expects a matrix of term counts, not raw text, so pass the
# CountVectorizer output rather than the list of strings in `train_set`.
apply_tf_idf(text_counts)

NameError: name 'TfidfTransformer' is not defined

## Word2Vector

In [None]:
class MyCorpus(object):
    """An iterator that yields sentences (lists of str).

    Lines are read lazily from the positive corpus file first, then from the
    negative corpus file; each line is tokenized with
    ``utils.simple_preprocess``.
    """

    def __init__(self, positive_corpus, negative_corpus):
        """Store the locations of the two corpus files.

        Args:
            positive_corpus: File to open for the positive corpus.
            negative_corpus: File to open for the negative corpus.
        """
        self.positive_corpus = positive_corpus
        self.negative_corpus = negative_corpus

    def __iter__(self):
        """Yield one token list per line, positive corpus first."""
        for corpus_path in (self.positive_corpus, self.negative_corpus):
            # Use a context manager so each file is closed once it has been
            # consumed, instead of leaking the open handle.
            with open(corpus_path) as corpus_file:
                for line in corpus_file:
                    yield utils.simple_preprocess(line)

#### Do not run this cell again: run the next one to load the model

In [29]:
def we_word2vector_create_model(path_train_pos, path_train_neg):
    """Train a neural embedding model (Word2Vec) and save it to disk.

    Sentences are streamed from the two files through ``MyCorpus``. The full
    model is saved as ``word2vec_saved_model.joblib`` and its word vectors as
    ``word2vec.wordvectors``.

    Returns the trained model.
    """
    sentence_stream = MyCorpus(path_train_pos, path_train_neg)
    w2v = Word2Vec(sentences=sentence_stream)

    # Persist both the full model and the standalone word vectors.
    w2v.save('word2vec_saved_model.joblib')
    w2v.wv.save("word2vec.wordvectors")

    return w2v

#### Load the model

In [None]:
# NOTE: presumably the saved model was created with
#   path_train_pos = ..\\Resources\\train_pos.txt
#   path_train_neg = ..\\Resources\\train_neg.txt
def we_word2vector_load_model():
    """Load the Word2Vec model saved as ``word2vec_saved_model.joblib``.

    Uses the ``Word2Vec`` class imported at the top of the file: the bare
    ``gensim`` name is never imported, so ``gensim.models.Word2Vec`` would
    raise ``NameError``.
    """
    return Word2Vec.load('word2vec_saved_model.joblib')

## GloVe

In [None]:
"Glove: Global Vectors for Word Representation"

def we_glove(train_set) : 
    # model = list of sentences
    model = api.load("glove-twitter-25")
    #corpus = 1 list of words
    corpus = list(itertools.chain.from_iterable(model))
    