In [6]:
# Switch to the project directory so the relative paths used by this cell and
# the cells below ("libraries.py", "functions.py", "NYT-merge.csv") resolve
# against it.
import os
os.chdir("C:\\path")

# `%run -i` executes each script inside this notebook's namespace, so the names
# those scripts define become available to every later cell.
# NOTE(review): presumably libraries.py supplies pd, glob, gensim and
# CoherenceModel, and functions.py supplies get_articles, get_comments and
# prepare_lda -- none of them are defined in the visible cells; confirm
# against those two files.
%run -i "libraries.py"
%run -i "functions.py"

# 1. File Reading

In [11]:
# Load the merged articles/comments table into `data`.
# A previously saved "NYT-merge.csv" is used when it exists; otherwise the
# table is rebuilt from the individual article and comment CSV exports.
try:
    data = pd.read_csv("NYT-merge.csv", sep=";", encoding="UTF-8")

except FileNotFoundError:
    print("Merge file not found, loading every .csv")

    articles_path = "C:\\Users\\Hp\\Desktop\\Public Project\\Articles\\*.csv"
    comments_path = "C:\\Users\\Hp\\Desktop\\Public Project\\Comments\\*.csv"
    articles_list = glob.glob(articles_path)
    comments_list = glob.glob(comments_path)

    # Loading data
    articles = get_articles(articles_path, articles_list)
    comments = get_comments(comments_path, comments_list)

    # Merge: attach comments to articles by ID, then discard the duplicate key column
    data = pd.merge(articles, comments, left_on="artID", right_on="comID", how="left").drop("comID", axis=1)

    data = data.dropna()  # around 100k rows do not match any article ID
    data = data.reset_index(drop=True)  # renumber rows without keeping the old index as a column
    data = data[data.Keywords != '']  # remove empty keywords
    data = data.drop_duplicates(subset=["Comments"], keep="first")

else:
    # The original line referenced the undefined name `datar`, which raised a
    # NameError (not caught by the handler above) whenever the merge file existed.
    # "Unnamed: 0" is dropped only if present, so a file without it still loads.
    data = data.drop("Unnamed: 0", axis=1, errors="ignore")

# Reported once for both paths, with the same message as before.
print("Loaded comments: %s" % (len(data)))

Loaded comments: 2040273


## 1.1 Preparation

In [13]:
# Build the LDA inputs from the "Keywords" column: a dictionary, a corpus,
# and a third value bound to `base_keywords` (meaning defined by prepare_lda).
key_dictionary, key_corpus, base_keywords = prepare_lda(data["Keywords"])

# 2. LDA with grid search 

In [14]:
%%time
num_topic = [3, 4, 5, 6, 7, 8, 9, 10]
list_perplexity = []
list_top_topics = []
list_coherence = []

for topic in num_topic:
    
    # chunksize and iterations chosen after some tests
    ldamodel = gensim.models.LdaMulticore(key_corpus, id2word=key_dictionary, num_topics=topic, 
                                          workers=3, random_state=0, chunksize=100000, iterations=500)
    
    list_perplexity.append(ldamodel.log_perplexity(key_corpus, total_docs=len(key_corpus)))
    list_top_topics.append(ldamodel.top_topics(corpus=key_corpus, dictionary=key_dictionary, topn=10, coherence='u_mass'))
    
    coherence_model_lda = CoherenceModel(model=ldamodel, corpus=key_corpus, dictionary=key_dictionary, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    list_coherence.append(coherence_lda)
    
    ldamodel.save('model-' + str(topic) + 'topic.gensim') # saving models
    
    # perplexity, coherence and top topics
    print("Model perplexity with", str(topic), " topics: ", 
          list_perplexity[-1], "\tCoherence Score: ", str(list_coherence[-1]))

# save a df from printing the top topics in details
lda_df = pd.DataFrame({"Num_topics":num_topic, "log_perplexity":list_perplexity,
                       "coherence":list_coherence, "Top_topics":list_top_topics})
print(lda_df)

Model perplexity with 3  topics:  -6.27055906475321 	Coherence Score:  -8.904744746944028
Model perplexity with 4  topics:  -6.273841949631126 	Coherence Score:  -7.839636735084103
Model perplexity with 5  topics:  -6.290800916586298 	Coherence Score:  -8.57770486394762
Model perplexity with 6  topics:  -6.31968177006147 	Coherence Score:  -9.204861426995416
Model perplexity with 7  topics:  -6.39035490703285 	Coherence Score:  -9.284320430552762
Model perplexity with 8  topics:  -6.468102714817781 	Coherence Score:  -9.779813053130422
Model perplexity with 9  topics:  -6.57297240150778 	Coherence Score:  -9.686333776043528
Model perplexity with 10  topics:  -6.714813899974233 	Coherence Score:  -9.832438748547872
   Num_topics                                         Top_topics  coherence  \
0           3  [([(0.12381317, 'Trump, Donald J'), (0.1234036...  -8.904745   
1           4  [([(0.11763931, 'Trump, Donald J'), (0.1069188...  -7.839637   
2           5  [([(0.13376634, 'United 

## 2.1 Print topics

In [32]:
# For every trained model (one row of lda_df), list each topic's
# (weight, keyword) pairs, one pair per line.
for model_topics in lda_df["Top_topics"]:
    for topic in model_topics:
        print("topic")
        # topic[0] holds the (weight, keyword) pairs; the rest of the tuple is not printed.
        for key in topic[0]:
            print(key)
    print("End lda model \n\n\n")

topic
(0.12381317, 'Trump, Donald J')
(0.12340368, 'United States Politics and Government')
(0.0359334, 'Republican Party')
(0.027915658, 'Russia')
(0.023131058, 'House of Representatives')
(0.0227823, 'Presidential Election of 2016')
(0.019189112, 'United States International Relations')
(0.017299274, 'Health Insurance and Managed Care')
(0.016083615, 'Patient Protection and Affordable Care Act (2010)')
(0.01552975, 'Federal Bureau of Investigation')
topic
(0.04557359, 'Trump, Donald J')
(0.028926758, 'United States Politics and Government')
(0.01755291, 'Gun Control')
(0.01725678, 'Immigration and Emigration')
(0.015975377, 'School Shootings and Armed Attacks')
(0.015589056, 'Politics and Government')
(0.013539489, 'United States International Relations')
(0.011817302, 'Parkland, Fla, Shooting (2018)')
(0.011557237, 'Refugees and Displaced Persons')
(0.011079109, 'Executive Orders and Memorandums')
topic
(0.08116547, 'Trump, Donald J')
(0.066884704, 'United States Politics and Govern

topic
(0.08019142, 'United States Politics and Government')
(0.061414693, 'Trump, Donald J')
(0.027119035, 'Democratic Party')
(0.026418062, 'Elections, House of Representatives')
(0.022643417, 'Women and Girls')
(0.02084095, 'Labor and Jobs')
(0.015264834, 'Midterm Elections (2018)')
(0.015100112, 'Republican Party')
(0.014728219, 'Global Warming')
(0.014115898, 'United States Economy')
topic
(0.02905207, 'Social Media')
(0.026239999, 'Facebook Inc')
(0.025733178, 'Supreme Court (US)')
(0.021145951, 'Television')
(0.016825035, 'Trump, Donald J')
(0.015160096, 'Children and Childhood')
(0.014859847, 'News and News Media')
(0.013916718, 'Law and Legislation')
(0.013538518, 'Data-Mining and Database Marketing')
(0.013520975, 'Cambridge Analytica')
topic
(0.058711663, 'Trump, Donald J')
(0.042084612, 'United States Politics and Government')
(0.024916528, 'Fox News Channel')
(0.024592623, 'Deferred Action for Childhood Arrivals')
(0.023266433, 'Sexual Harassment')
(0.018536689, 'Ethics and