In [5]:
import pandas as pd
import gensim
from tqdm import tqdm
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources requested by this notebook, one download per name.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Register tqdm's pandas integration.
tqdm.pandas()
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk


[nltk_data] Downloading package punkt to /Users/ange/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ange/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read the CSV at news_data/news_data_preprocessed.csv into the `news` DataFrame.
news = pd.read_csv(
    filepath_or_buffer="news_data/news_data_preprocessed.csv",
)

In [7]:
import ast

# The lemma columns come back from the CSV as strings; parse each cell with
# ast.literal_eval so both columns hold real Python objects again.
for lemma_col in ('headline_lemmas', 'snippet_lemmas'):
    news[lemma_col] = news[lemma_col].apply(ast.literal_eval)

In [8]:
# Build the gensim vocabulary from the headline lemma lists.
dictionary = gensim.corpora.Dictionary(documents=news['headline_lemmas'])

# Prune the vocabulary in place using the frequency thresholds below
# (per gensim's filter_extremes: minimum document count, maximum document
# fraction, and a cap on the number of retained tokens).
dictionary.filter_extremes(keep_n=10000, no_above=0.5, no_below=10)

# Encode every headline as a bag-of-words vector over the pruned vocabulary.
corpus_headline = list(map(dictionary.doc2bow, news['headline_lemmas']))

In [9]:
from gensim.models import LdaMulticore

# Fit a 10-topic LDA model on the headline bag-of-words corpus.
# random_state pins the model's random initialisation so a fresh
# "Restart & Run All" does not produce a different set of topics each time.
# NOTE(review): with workers > 1 gensim may still show small run-to-run
# differences; set workers=1 if exact reproducibility is required.
lda_headline = LdaMulticore(
    corpus_headline,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

# formatted=False returns (topic_id, [(word, weight), ...]) pairs instead of strings.
topics_headline = lda_headline.show_topics(num_topics=10, num_words=10, log=True, formatted=False)

# Print the top 10 words (with their weights) for each topic.
for topic_num, topic_words in topics_headline:
    topic_rep = " ".join(f"{w}({p:.4f})" for w, p in topic_words)
    print(f"HeadlineTopic {topic_num}: {topic_rep}")

HeadlineTopic 0: nyt(0.0170) front(0.0041) page(0.0041) men(0.0016) weekend(0.0015) facebook(0.0014) clinton(0.0013) campaign(0.0013) climate(0.0013) de(0.0013)
HeadlineTopic 1: climate(0.0018) weekend(0.0017) debate(0.0017) iran(0.0015) clinton(0.0015) market(0.0014) impeachment(0.0013) men(0.0013) turn(0.0013) protest(0.0013)
HeadlineTopic 2: clinton(0.0019) fashion(0.0019) weekend(0.0016) climate(0.0016) debate(0.0014) spring(0.0014) street(0.0014) men(0.0014) u(0.0014) market(0.0014)
HeadlineTopic 3: clinton(0.0030) hillary(0.0023) weekend(0.0022) obama(0.0019) spring(0.0017) shooting(0.0016) debate(0.0015) brooklyn(0.0014) fashion(0.0014) sander(0.0014)
HeadlineTopic 4: spring(0.0019) weekend(0.0017) college(0.0016) iran(0.0015) monday(0.0014) climate(0.0014) u(0.0014) market(0.0013) made(0.0013) russian(0.0013)
HeadlineTopic 5: weekend(0.0024) debate(0.0016) syria(0.0016) market(0.0016) clinton(0.0015) obama(0.0015) michael(0.0014) brooklyn(0.0014) supreme(0.0014) last(0.0014)
He

In [10]:
# Build the gensim vocabulary from the snippet lemma lists.
dictionary_snippet = gensim.corpora.Dictionary(news['snippet_lemmas'])

# Prune the vocabulary in place using the same frequency thresholds as the
# headline dictionary.
dictionary_snippet.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)

# Encode every snippet as a bag-of-words vector over the pruned vocabulary.
corpus_snippet = [dictionary_snippet.doc2bow(doc) for doc in news['snippet_lemmas']]

# Fit a 10-topic LDA model on the snippet corpus.
# random_state pins the model's random initialisation so a fresh
# "Restart & Run All" does not produce a different set of topics each time.
# NOTE(review): with workers > 1 gensim may still show small run-to-run
# differences; set workers=1 if exact reproducibility is required.
lda_snippet = gensim.models.LdaMulticore(
    corpus_snippet,
    num_topics=10,
    id2word=dictionary_snippet,
    passes=2,
    workers=2,
    random_state=42,
)

# formatted=False returns (topic_id, [(word, weight), ...]) pairs instead of strings.
topics_snippet = lda_snippet.show_topics(num_topics=10, num_words=10, log=True, formatted=False)

# Print the top 10 words (with their weights) for each snippet topic.
for topic_num, topic_words in topics_snippet:
    topic_rep = " ".join(f"{w}({p:.4f})" for w, p in topic_words)
    print(f"SnippetTopic {topic_num}: {topic_rep}")


SnippetTopic 0: nyt(0.0027) front(0.0017) page(0.0017) guide(0.0015) bride(0.0013) groom(0.0013) tuesday(0.0013) saturday(0.0013) clinton(0.0013) thursday(0.0012)
SnippetTopic 1: tuesday(0.0019) wednesday(0.0016) friday(0.0016) thursday(0.0014) guide(0.0014) obama(0.0012) fashion(0.0012) monday(0.0012) clinton(0.0011) saturday(0.0011)
SnippetTopic 2: friday(0.0015) guide(0.0014) wednesday(0.0014) monday(0.0012) fashion(0.0012) tuesday(0.0012) clinton(0.0012) thursday(0.0011) saturday(0.0011) designer(0.0010)
SnippetTopic 3: tuesday(0.0018) monday(0.0018) friday(0.0012) fashion(0.0011) thursday(0.0011) spring(0.0011) bride(0.0011) wednesday(0.0011) shooting(0.0010) writes(0.0010)
SnippetTopic 4: tuesday(0.0020) monday(0.0016) wednesday(0.0014) clinton(0.0014) guide(0.0014) ahead(0.0013) thursday(0.0013) shooting(0.0011) saturday(0.0010) robert(0.0009)
SnippetTopic 5: monday(0.0015) clinton(0.0015) wednesday(0.0014) guide(0.0014) thursday(0.0014) friday(0.0012) saturday(0.0011) spring(0.

In [11]:
# Infer the topic mixture of every headline and collect it as one row per
# article, in the same order as `corpus_headline` (and therefore `news`).
n_headline_topics = lda_headline.num_topics

headlines_topics = []
for doc_bow in corpus_headline:
    # get_document_topics yields (topic_id, probability) pairs and can leave
    # out very low-probability topics. Filling a fixed-width row by topic id
    # keeps every probability in its own column even if a topic is omitted,
    # whereas positional unpacking would silently shift the values left.
    topic_probs = [0.0] * n_headline_topics
    for topic_id, prob in lda_headline.get_document_topics(doc_bow, minimum_probability=0.0):
        topic_probs[topic_id] = prob
    headlines_topics.append(topic_probs)

# One column per topic; the column count follows the fitted model rather
# than a hard-coded 10.
headlines_topic_df = pd.DataFrame(
    headlines_topics,
    columns=[f"Headline_Topic_{i}" for i in range(n_headline_topics)],
)

# Add the publication date for reference (assigned positionally via .values).
headlines_topic_df['pub_date'] = news['pub_date'].values

In [12]:
# Infer the topic mixture of every snippet and collect it as one row per
# article, in the same order as `corpus_snippet` (and therefore `news`).
n_snippet_topics = lda_snippet.num_topics

snippets_topics = []
for doc_bow in corpus_snippet:
    # get_document_topics yields (topic_id, probability) pairs and can leave
    # out very low-probability topics. Filling a fixed-width row by topic id
    # keeps every probability in its own column even if a topic is omitted,
    # whereas positional unpacking would silently shift the values left.
    topic_probs = [0.0] * n_snippet_topics
    for topic_id, prob in lda_snippet.get_document_topics(doc_bow, minimum_probability=0.0):
        topic_probs[topic_id] = prob
    snippets_topics.append(topic_probs)

# One column per topic; the column count follows the fitted model rather
# than a hard-coded 10.
snippets_topic_df = pd.DataFrame(
    snippets_topics,
    columns=[f"Snippet_Topic_{i}" for i in range(n_snippet_topics)],
)

# Add the publication date for reference (assigned positionally via .values).
snippets_topic_df['pub_date'] = news['pub_date'].values


In [13]:
# The snippet topic table was already built in the previous cell; this cell
# used to be an exact copy of it, which only repeated the inference over the
# whole corpus and overwrote the same variables. Validate the existing table
# instead of recomputing it.
snippet_topic_cols = snippets_topic_df.filter(like="Snippet_Topic_")

assert len(snippets_topic_df) == len(news), "expected one topic row per article"
assert snippet_topic_cols.shape[1] == lda_snippet.num_topics, "expected one column per topic"
assert snippet_topic_cols.notna().all().all(), "missing topic probabilities"
assert np.allclose(snippet_topic_cols.sum(axis=1), 1.0, atol=1e-3), "topic mixtures should sum to ~1"


In [15]:
# Attach the topic features to the articles by row position, not by a key
# merge on 'pub_date'. Both topic tables were built by iterating the corpora
# in `news` row order, so row i of each table belongs to row i of `news`.
# A left merge on 'pub_date' is only equivalent when every date is unique:
# if a date repeats, each article is paired with every topic row sharing
# that date, multiplying rows and attaching other articles' topic vectors.
headline_features = headlines_topic_df.drop(columns='pub_date').reset_index(drop=True)
snippet_features = snippets_topic_df.drop(columns='pub_date').reset_index(drop=True)

# Positional concatenation is only valid when all three tables line up.
assert len(headline_features) == len(news) == len(snippet_features), \
    "topic tables must have exactly one row per article"

# Articles + headline topics (same column order the merge produced).
news_with_headline_topics = pd.concat(
    [news.reset_index(drop=True), headline_features],
    axis=1,
)

# Articles + headline topics + snippet topics.
final_news_df = pd.concat(
    [news_with_headline_topics, snippet_features],
    axis=1,
)


In [21]:
# Write the merged article/topic table to news_data/lda.csv without the index.
# NOTE(review): pandas documents to_csv as returning None when a path is
# given, so `final_news` presumably holds None rather than a DataFrame —
# confirm nothing downstream relies on it.
final_news = final_news_df.to_csv(
    path_or_buf="news_data/lda.csv",
    index=False,
)