<a href="https://colab.research.google.com/github/HimashiRathnayake/Hate-Speech-Humor-Detection/blob/branch-1/Generate_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Parameters**

In [41]:
embedding_type = "fasttext" #@param["word2vec", "fasttext"]
embedding_size = 100 #@param [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

**Folder Paths**

In [48]:
folder_path = "/content/drive/Shareddrives/FYP/"
data_path = folder_path + "corpus/data.csv"
embedding_models_save_path = folder_path + "embedding_models/" + embedding_type + "/embedding_" + embedding_type + "_" + str(embedding_size)
word_embedding_keydvectors_path = folder_path + "embedding_models/" + embedding_type + "/keyed_vectors/" +  "embedding_" + embedding_type + "_" + str(embedding_size)

**Dependencies**

In [43]:
from __future__ import print_function
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import pickle
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText
import nltk
import re
from tqdm import tqdm

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Load Data**

In [44]:
all_data = pd.read_csv(data_path)
comments = list(all_data["comment"]);

**Tokenize Data**

In [45]:
en_stop = set(nltk.corpus.stopwords.words('english'))

# Lemmatization
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

#text cleaning
def process_text(document):


    # Remove extra white space from text
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove all the special characters from text
    # document = re.sub(r'\W', ' ', str(document))

    # Remove all single characters from text
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Converting to Lowercase

    document = document.lower()

    # Word tokenization
    tokens = document.split()

    # Lemmatization using NLTK
    lemma_txt = [stemmer.lemmatize(word) for word in tokens]

    # # Remove stop words
    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]
    # # Drop words
    tokens = [word for word in tokens if len(word) > 3]

    clean_txt = ' '.join(tokens)

    return clean_txt

clean_corpus = [process_text(sentence) for sentence in tqdm(comments) if sentence.strip() !='']

word_tokens = [nltk.word_tokenize(sent) for sent in tqdm(clean_corpus)]

100%|██████████| 13947/13947 [00:00<00:00, 17433.17it/s]
100%|██████████| 13947/13947 [00:01<00:00, 7182.00it/s]


**Word Embeddings**

**Train word embedding models**

In [46]:
# define training data
sentences = word_tokens


In [50]:
# train model
if (embedding_type == "word2vec"):
  model = Word2Vec(sentences, size=embedding_size, window=5, min_count=1, workers=4)
elif (embedding_type == "fasttext"):
  model = FastText(sentences, size=embedding_size, window=5, min_count=1, workers=4)

word_vectors = model.wv

# summarize the loaded model
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print(words)

# save model & word vectors
model.save(embedding_models_save_path)
word_vectors.save(word_embedding_keydvectors_path)

FastText(vocab=30022, size=100, alpha=0.025)
['rate', 'neethiya', 'muta', 'balapannethida', 'mobitel', 'short', 'wadakma', 'na', ',', 'signal', 'mathugama', '0781298113', 'සිගනල්', 'එන්නෙ', 'hutch', 'routers', 'satelites', 'fiber', 'line', 'හදුන්වා', 'දෙන්න', 'බලාපොරොත්තු', 'නැද්ද', '😜අනිවා', 'දුන්නොත්', 'mytv', 'මතකයි', '!', 'thisara', 'sangakkara', 'kusal', 'mendis', 'game', 'gamers', 'mercantile', 'e-sports', 'championship', '2017', 'powered', 'dialog', 'gaming', '.', 'play', 'games', 'fabulous', 'gifts', '15th', '16th', 'july', 'maradana', 'trace', 'expert', 'city', 'from', '9.00', 'onwards', 'visit', 'www.dialog.lk/gaming', 'register', 'more', 'information', '#', 'dialoggaming', 'horata', 'denna', 'manussayo', 'lanka', 'telecom', 'very', 'service', 'inform', 'several', 'time', 'change', 'broadband', 'other', 'package', 'customer', 'centre', 'said', 'cant', 'system', 'fail', 'finally', 'decide', 'disconnected', 'internet', 'connection', 'takeover', 'letter', 'kotte', 'branch', '(',

**Load Model**

In [None]:
# load model

if (embedding_type == "word2vec"):
  new_model = Word2Vec.load(embedding_models_save_path)
elif (embedding_type == "fasttext"):
  new_model = FastText.load(embedding_models_save_path)

# # check model
new_model.wv.most_similar("na")