# In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary

'''
Note: LDA is used instead of NMF because the LDA model achieves a higher coherence score, which means that its topics are more coherent and easier to interpret.
'''


def nmf_coherence_scorer(estimator, X):
    """Score a fitted TF-IDF + NMF pipeline by the c_v coherence of its topics.

    Args:
        estimator: Fitted pipeline exposing ``named_steps['tfidf']`` (a fitted
            vectorizer) and ``named_steps['nmf']`` (a fitted NMF model).
        X: Iterable of raw document strings used as the reference corpus.

    Returns:
        The coherence value reported by gensim's ``CoherenceModel`` for the
        10 highest-weighted terms of every NMF component.
    """
    vectorizer = estimator.named_steps['tfidf']
    nmf_model = estimator.named_steps['nmf']
    H = nmf_model.components_

    # Columns of H index the vectorizer's vocabulary, so the top terms have to
    # be looked up there. The ids of a separately built gensim Dictionary are
    # unrelated to those column indices.
    feature_names = vectorizer.get_feature_names_out()
    topics = [
        [feature_names[term_idx] for term_idx in topic.argsort()[:-11:-1]]
        for topic in H
    ]

    # Tokenize with the vectorizer's own analyzer so the reference texts share
    # the normalization (lowercasing, stop words, n-grams) of the topic terms;
    # otherwise some topic terms would be missing from the dictionary.
    analyzer = vectorizer.build_analyzer()
    texts = [analyzer(doc) for doc in X]
    dictionary = Dictionary(texts)

    # Calculate coherence (single process, as in the original configuration).
    coherence_model = CoherenceModel(
        topics=topics, texts=texts, dictionary=dictionary,
        coherence='c_v', processes=1)
    return coherence_model.get_coherence()

if __name__ == '__main__':
    # Load the processed tweets.
    df = pd.read_csv('../data/processed/tweets_with_topics.csv')

    # Data cleaning: missing texts become empty strings, everything is str.
    df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)

    # Parameters retained after iterating over candidate values; the notes
    # record the alternatives that gave a lower coherence score.
    final_params = {
        'tfidf__max_features': 6000,   # 2000, 3000, 4000 scored lower
        'tfidf__ngram_range': (1, 2),  # (1, 1), (1, 3) scored lower
        'tfidf__max_df': 0.9,          # 0.95, 0.85 scored lower
        'tfidf__min_df': 10,           # 5, 15 scored lower
        'nmf__n_components': 10,       # recorded coherence: 0.36615630438929847
    }

    # TF-IDF features feeding an NMF topic model.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        # max_iter raised to 500 because a warning was given at 200.
        ('nmf', NMF(random_state=1, max_iter=500)),
    ])
    pipeline.set_params(**final_params)
    pipeline.fit(df['cleaned_text'])

    # Report the topic coherence of the fitted model.
    coherence = nmf_coherence_scorer(pipeline, df['cleaned_text'])
    print(f"Coherence: {coherence}")

    # Document-topic weights: the pipeline applies the TF-IDF transform and
    # then the NMF transform.
    W = pipeline.transform(df['cleaned_text'])

    # Store one column of weights per topic.
    for i, topic_weights in enumerate(W.T):
        df[f'topic_{i}'] = topic_weights

    # Persist the fitted pipeline and the augmented table.
    joblib.dump(pipeline, 'final_nmf_model.pkl')
    df.to_csv('../data/processed/tweets_with_final_nmf_topics_final.csv', index=False)
