<a href="https://colab.research.google.com/github/HimashiRathnayake/Hate-Speech-Humor-Detection/blob/branch-1/Generate_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Parameters**

In [32]:
embedding_type = "fasttext" #@param["word2vec", "fasttext"]
embedding_size = 100 #@param [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

**Folder Paths**

In [33]:
folder_path = "/content/drive/Shared drives/FYP/"
data_path = folder_path + "corpus/data.csv"
embedding_models_save_path = folder_path + "embedding_models/" + "embedding_" + embedding_type + "_" + str(embedding_size)

**Dependencies**

In [43]:
from __future__ import print_function
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import pickle
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText
import nltk
import re
from tqdm import tqdm

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Load Data**

In [35]:
all_data = pd.read_csv(data_path)
comments = list(all_data["comment"]);

**Tokenize**

In [36]:
en_stop = set(nltk.corpus.stopwords.words('english'))

# Lemmatization
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

#text cleaning
def process_text(document):


    # Remove extra white space from text
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove all the special characters from text
    # document = re.sub(r'\W', ' ', str(document))

    # Remove all single characters from text
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Converting to Lowercase

    document = document.lower()

    # Word tokenization
    tokens = document.split()

    # Lemmatization using NLTK
    lemma_txt = [stemmer.lemmatize(word) for word in tokens]

    # # Remove stop words
    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]
    # # Drop words
    tokens = [word for word in tokens if len(word) > 3]

    clean_txt = ' '.join(tokens)

    return clean_txt

clean_corpus = [process_text(sentence) for sentence in tqdm(comments) if sentence.strip() !='']

word_tokens = [nltk.word_tokenize(sent) for sent in tqdm(clean_corpus)]


  0%|          | 0/13947 [00:00<?, ?it/s][A
 11%|‚ñà         | 1528/13947 [00:00<00:00, 15277.14it/s][A
 22%|‚ñà‚ñà‚ñè       | 3007/13947 [00:00<00:00, 15124.58it/s][A
 33%|‚ñà‚ñà‚ñà‚ñé      | 4653/13947 [00:00<00:00, 15500.78it/s][A
 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 6107/13947 [00:00<00:00, 15198.65it/s][A
 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 7483/13947 [00:00<00:00, 14733.68it/s][A
 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 9040/13947 [00:00<00:00, 14974.62it/s][A
 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 10672/13947 [00:00<00:00, 15350.09it/s][A
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13947/13947 [00:00<00:00, 15718.22it/s]

  0%|          | 0/13947 [00:00<?, ?it/s][A
  5%|‚ñç         | 695/13947 [00:00<00:01, 6947.54it/s][A
 10%|‚ñâ         | 1392/13947 [00:00<00:01, 6954.10it/s][A
 15%|‚ñà‚ñç        | 2080/13947 [00:00<00:01, 6931.29it/s][A
 19%|‚ñà‚ñâ        | 2693/13947 [00:00<00:01, 6667.06it/s][A
 24%|‚ñà‚ñà‚ñç       | 3341/13947 [00:00<00:01, 6602.11it/s][A
 28%|‚ñà‚ñà‚ñä       | 3953/13947 [00:00<0

**Word Embeddings**

Train word vectors

In [37]:
# define training data
sentences = word_tokens


In [38]:
# train model
if (embedding_type == "word2vec"):
  model = Word2Vec(sentences, size=embedding_size, window=5, min_count=1, workers=4)
elif (embedding_type == "fasttext"):
  model = FastText(sentences, size=embedding_size, window=5, min_count=1, workers=4)

# summarize the loaded model
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print(words)

# save model
# model.save('model.bin')
pickle.dump(model, open(embedding_models_save_path , 'wb'))


FastText(vocab=30022, size=100, alpha=0.025)
['rate', 'neethiya', 'muta', 'balapannethida', 'mobitel', 'short', 'wadakma', 'na', ',', 'signal', 'mathugama', '0781298113', '‡∑É‡∑í‡∂ú‡∂±‡∂Ω‡∑ä', '‡∂ë‡∂±‡∑ä‡∂±‡∑ô', 'hutch', 'routers', 'satelites', 'fiber', 'line', '‡∑Ñ‡∂Ø‡∑î‡∂±‡∑ä‡∑Ä‡∑è', '‡∂Ø‡∑ô‡∂±‡∑ä‡∂±', '‡∂∂‡∂Ω‡∑è‡∂¥‡∑ú‡∂ª‡∑ú‡∂≠‡∑ä‡∂≠‡∑î', '‡∂±‡∑ê‡∂Ø‡∑ä‡∂Ø', 'üòú‡∂Ö‡∂±‡∑í‡∑Ä‡∑è', '‡∂Ø‡∑î‡∂±‡∑ä‡∂±‡∑ú‡∂≠‡∑ä', 'mytv', '‡∂∏‡∂≠‡∂ö‡∂∫‡∑í', '!', 'thisara', 'sangakkara', 'kusal', 'mendis', 'game', 'gamers', 'mercantile', 'e-sports', 'championship', '2017', 'powered', 'dialog', 'gaming', '.', 'play', 'games', 'fabulous', 'gifts', '15th', '16th', 'july', 'maradana', 'trace', 'expert', 'city', 'from', '9.00', 'onwards', 'visit', 'www.dialog.lk/gaming', 'register', 'more', 'information', '#', 'dialoggaming', 'horata', 'denna', 'manussayo', 'lanka', 'telecom', 'very', 'service', 'inform', 'several', 'time', 'change', 'broadband', 'other', 'package', 'customer', 'centre', 'said', 'cant', 'system',

In [53]:
# load model
f = open(embedding_models_save_path, 'rb')
loaded_embedding = pickle.load(f)

# if (embedding_type == "word2vec"):
#   new_model = Word2Vec.load(loaded_embedding)
# elif (embedding_type == "fasttext"):
#   new_model = FastText.load(loaded_embedding)

print(loaded_embedding)

# # check model
loaded_embedding.wv.most_similar("na")

FastText(vocab=30022, size=100, alpha=0.025)


[('kapana', 0.9999942183494568),
 ('kana', 0.9999933838844299),
 ('kranawa', 0.9999932646751404),
 ('kapannathuwa', 0.9999929666519165),
 ('kalpana', 0.9999929070472717),
 ('nidiyanawa', 0.999992847442627),
 ('kiyanawa.mun', 0.9999927878379822),
 ('kiyana', 0.9999926090240479),
 ('yanawa', 0.9999924898147583),
 ('nawa', 0.9999923706054688)]