In [65]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import nltk
import spacy
import os , re 
import gensim
from gensim.models import LdaModel , CoherenceModel , HdpModel , LdaMulticore, Nmf
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
import pyLDAvis
import pyLDAvis.gensim

In [51]:
df = pd.read_csv("../data/cleaned_data/cleaned_for_classical_nlp.csv")
df.dropna(inplace=True)
docs = df['summ_text'].sample(frac=0.05)


In [52]:
len(docs)

5407

In [53]:
def gen_words(texts):
    final = []
    for text in texts:
        final.extend([simple_preprocess(word, min_len=2 , max_len=40) for word in text.split()])
    return final
        

In [54]:
words = gen_words(docs.values.tolist())

In [55]:
len(words).__format__(",")

'184,529'

In [56]:
id2word = Dictionary(documents=words)

In [57]:
len(id2word)

33953

In [58]:
id2word.filter_extremes(no_below=3 , no_above=0.7)
print(len(id2word))

10143


In [59]:
corpus = []
for word in words:
    new = id2word.doc2bow(word)
    corpus.append(new)
    

In [60]:
lda_model = LdaModel(corpus=corpus , 
                     id2word=id2word,
                     num_topics=15 ,
                     chunksize=100,
                     passes=10 , alpha='auto')

In [62]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus=corpus, dictionary = id2word , mds ='mmds' )
vis



In [63]:
lda_model = LdaModel(corpus=corpus , 
                     id2word=id2word,
                     num_topics=7 ,
                     chunksize=100,
                     passes=10 , alpha='auto')

In [64]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus=corpus, dictionary = id2word , mds ='mmds' )
vis



In [68]:
nmf_model = Nmf(corpus=corpus , 
                     id2word=id2word,
                     num_topics=7 ,
                     chunksize=100,
                     passes=10 
                )

In [75]:
nmf_model.show_topics(num_topics=7)

[(0,
  '0.642*"خصوصا" + 0.155*"وفي" + 0.102*"الظروف" + 0.041*"بسبب" + 0.030*"يكون" + 0.018*"أفضل" + 0.003*"اللاعب" + 0.003*"عزيز" + 0.003*"أنه" + 0.001*"بطولة"'),
 (1,
  '0.994*"العالم" + 0.002*"اللاعبين" + 0.002*"الصفوف" + 0.001*"تواجه" + 0.000*"بقاء" + 0.000*"أنه" + 0.000*"منح" + 0.000*"الفريقين" + 0.000*"ونحن" + 0.000*"إريك"'),
 (2,
  '0.998*"الوطنية" + 0.001*"البيضاء" + 0.000*"أغلب" + 0.000*"اللاعبين" + 0.000*"المدرب" + 0.000*"أنها" + 0.000*"محمد" + 0.000*"الإفريقية" + 0.000*"اللاعب" + 0.000*"يغلب"'),
 (3,
  '0.950*"بطولة" + 0.046*"إضافة" + 0.001*"المستوى" + 0.000*"منتخب" + 0.000*"المتقدمة" + 0.000*"البطولة" + 0.000*"اللاعبين" + 0.000*"مستواه" + 0.000*"الظروف" + 0.000*"وطني"'),
 (4,
  '0.995*"المباريات" + 0.002*"المدرب" + 0.002*"عزيز" + 0.000*"إيريك" + 0.000*"يلي" + 0.000*"اللاعب" + 0.000*"صحيح" + 0.000*"البطولات" + 0.000*"طبيعي" + 0.000*"سيأتي"'),
 (5,
  '0.998*"الفريق" + 0.000*"قال" + 0.000*"بسبب" + 0.000*"صائب" + 0.000*"مناطق" + 0.000*"إحداث" + 0.000*"تحتل" + 0.000*"جيد" + 0.000

In [76]:
lda_model_coherence = CoherenceModel(model=lda_model , texts=id2word , corpus=corpus , coherence="c_v")

In [78]:
print(lda_model_coherence.get_coherence())

In [None]:
nmf_model_coherence = CoherenceModel(model=nmf_model , texts=id2word , corpus=corpus , coherence="c_v")
print(nmf_model_coherence.get_coherence())

In [None]:
#find optimal numbrers of topics