# Imports

In [1]:
import pandas as pd
from gensim.models.fasttext import FastText

# Load Data

In [2]:
folder_path = '/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/'
lankadeepa_data_all = folder_path + 'corpus/new/preprocess_from_isuru/lankadeepa_comments_with_article_2.csv'
gossip_lanka_data_all = folder_path + 'corpus/new/preprocess_from_unicode_values/gossip_lanka_all_comments.csv'

num_features = 450
context = 5
fasttext_model_path = folder_path + "word_embedding/fasttext/data_from_gosspiLanka_and_lankadeepa/450/fastText_" \
                 + str(num_features) + "_" + str(context)

## Concat Datasets

In [3]:
lankadeepa_data = pd.read_csv(lankadeepa_data_all)
gossipLanka_data = pd.read_csv(gossip_lanka_data_all)

In [4]:
lankadeepa_comments = []
lankadeepa_articles = []
gossipLanka_comments = gossipLanka_data['comment']

for index, row in lankadeepa_data.iterrows():
  article = row['article']
  if isinstance(article, str) :
    lankadeepa_articles.append(article.replace('\u200d','').replace('\u200b',''))
  comments = []
  for comment in row['comment'].split(';'):
    comments.append(comment.replace('\u200d',''))
  lankadeepa_comments.extend(comments)



In [5]:
lankadeepa_comments = pd.Series(lankadeepa_comments)
lankadeepa_articles = pd.Series(lankadeepa_articles)
gossipLanka_comments = gossipLanka_data['comment']

In [6]:
all_data = pd.concat([lankadeepa_comments, lankadeepa_articles, gossipLanka_comments])

# Generate FastText

In [7]:
def generate_model(data,context,num_features,save_path):
    comments = []
    for comment in all_data:
        comments += to_separate_sentences(comment)

    print("# of comments taken for building the model: " + str(len(comments)))

    downsampling = 1e-3  # Downsample setting for frequent words
    min_word_count = 1  # Minimum word count - if not occurred this much remove
    num_workers = 4  # Number of threads to run in parallel

    model = FastText(comments, workers=num_workers, size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling, sg=1, iter=50)
    # model.init_sims(replace=True)  # If you don't plan to train the model any further

    return model

In [8]:
# split a comment into sentences of words
def to_separate_sentences(comment):
    sentences = []
    raw_sentences = str(comment).split(".")
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 2:
            sentences.append(raw_sentence.split())
    return sentences

In [9]:
def check_model_qulity(model, word):
    similar_words = []
    for s in model.most_similar(word):
      similar_words.append(s[0])
      # print(s[0])
    return similar_words

In [10]:
model = generate_model(all_data,context,num_features,fasttext_model_path)

# of comments taken for building the model: 970328


In [None]:
model.save(fasttext_model_path)
check_model_qulity(model, 'නැහැ')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  This is separate from the ipykernel package so we can avoid doing imports until


# Load Model

In [None]:
model = FastText.load(fasttext_model_path)

In [None]:
similar_words = check_model_qulity(model,'ඉහලම')
similar_words