### Importing important libs

In [1]:
import os
import nltk
import pandas as pd
from functools import reduce
from IPython.display import display, Markdown
from random import randint
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Downloading NLTK data to use later
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gustavo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Importing initial web scrapping result

In [2]:
LOCATION = os.getcwd()
ARTICLES_CSV = os.path.join(LOCATION, "articles.csv")
SCRAP_TOPICS_DIR = os.path.join(LOCATION, "scrap_topics")

# articles.csv columns
TITLE = 'Title'
LINK = 'Link'
BYLINE = 'Byline'
DATE = 'Date'
AUTHOR = 'Author'


raw_csv_df = pd.read_csv(ARTICLES_CSV, encoding='utf-8')
    
title_byline_df = raw_csv_df[[TITLE, BYLINE]].copy(deep=True)

print(title_byline_df.head(1))

                                               Title  \
0  Elon Musk Launching Cellphone To Compete With ...   

                                              Byline  
0  Rumor has it the "Tesla Phone" will have seaml...  


### Creating the word count

In [3]:
def normalize_text(text: str) -> str:
    return ("".join(ch for ch in text if ch.isalnum() or ch.isspace()).lower())

stop_words = set(nltk.corpus.stopwords.words('english'))

word_list = []
for index, row in title_byline_df.iterrows():
    if not pd.isnull(row[TITLE]):
        formatted_title = normalize_text(row[TITLE])
        word_list.extend(filter(
            lambda word: word not in stop_words,
            formatted_title.split()
        ))

    if not pd.isnull(row[BYLINE]):
        formatted_byline = normalize_text(row[BYLINE])
        word_list.extend(filter(
            lambda word: word not in stop_words,
            formatted_byline.split()
        ))
            
word_df = pd.DataFrame(word_list, columns=['Word'])
word_agg_df = word_df.groupby('Word').size().reset_index(name='Count')
word_agg_df.sort_values(by='Count', ascending=False, inplace=True)

print(f"Words found on scrap: {len(word_agg_df)}")
display(Markdown(word_agg_df.head(5).to_markdown(index=False)))

Words found on scrap: 32989


| Word      |   Count |
|:----------|--------:|
| trump     |    2309 |
| us        |    1711 |
| president |    1620 |
| show      |    1423 |
| video     |    1395 |

### Process meaning words and term frequencies 

In [4]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned text data
tfidf_matrix = tfidf_vectorizer.fit_transform(word_agg_df["Word"])

# Convert the TF-IDF matrix to a dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the TF-IDF dataframe
display(tfidf_df)

Unnamed: 0,000,007,007themed,02,0233,030725,045,05,050,07,...,zuccotti,zuckerberg,zuckerbergs,zuckerman,zulican,zunzuncito,zurich,zxt,álvaro,širokibrijeg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
random_state = randint(0, 100) # 79 68
number_of_topics = 10

# Initialize LDA
lda = LatentDirichletAllocation(n_components=number_of_topics, random_state=random_state)

# Fit LDA model to the TF-IDF matrix
lda.fit(tfidf_matrix)

# Get the words associated with each topic
n_top_words = 5
feature_names = tfidf_vectorizer.get_feature_names_out()

topics_dict= {}
for topic_index, topic in enumerate(lda.components_):
    topics_dict[topic_index] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        
print(f"Using random state: {random_state}")
for topic_index, topic_words in topics_dict.items():
    print(f"Topic {topic_index}: {topic_words}")

Using random state: 36
Topic 0: ['diego', 'hermine', 'selfmutilation', 'ateba', 'aluminum']
Topic 1: ['hitmen', 'interact', 'interior', 'mosaics', 'sderot']
Topic 2: ['jilted', 'getrank', 'biafran', 'jewelers', 'twas']
Topic 3: ['outback', 'simpson', 'fill', 'postwar', 'filtration']
Topic 4: ['kawaguchi', 'inconsolable', 'incontinent', 'increased', 'terrorists']
Topic 5: ['000', 'seldom', 'unwelcome', 'secre', 'attorneys']
Topic 6: ['love', 'oktoberfest', 'basis', 'allmetal', 'allocate']
Topic 7: ['širokibrijeg', 'crayons', 'usafa', 'lis', 'lipton']
Topic 8: ['socia', 'dubiously', 'dramatically', 'dramatized', 'bounced']
Topic 9: ['donkeys', 'servitude', 'ranted', 'touring', 'deleting']


### Creating new CSVs with topic words only

In [7]:
def insert_article(word_list: list, text_to_check: str) -> bool:
    if pd.isnull(text_to_check): return False
    
    norm_text = normalize_text(text_to_check)
    
    return reduce(
        lambda acc, word: acc or (word in norm_text),
        word_list,
        False
    )

if not os.path.exists(SCRAP_TOPICS_DIR):
    os.makedirs(SCRAP_TOPICS_DIR)
    
for topic_index, topic_words in topics_dict.items():
    csv_dict = []
    
    for index, row in raw_csv_df.iterrows():
        if insert_article(topic_words, row[TITLE]) or insert_article(topic_words, row[BYLINE]):
            csv_dict.append({
                TITLE: row[TITLE],
                BYLINE: row[BYLINE],
                LINK: row[LINK],
                DATE: row[DATE],
                AUTHOR: row[AUTHOR]
            })
            
    pd.DataFrame(csv_dict).to_csv(
        os.path.join(SCRAP_TOPICS_DIR, f"scrap_topic_{topic_index}.csv"), 
        encoding='utf-8', 
        index=False
    )

### Creating CSVs with custom topics

In [8]:
word_list = [
    "openai",
    "photograph",
    "media",
    "viral",
    "image",
    "photo",
    "online",
    "facebook",
    "tweet",
    "post",
    "photographs",
    "meme",
    "twitter",
    "picture",
    "scam",
    "content",
    "internet",
]

for index, row in raw_csv_df.iterrows():
    if insert_article(word_list, row[TITLE]) or insert_article(word_list, row[BYLINE]):
        csv_dict.append({
            TITLE: row[TITLE],
            BYLINE: row[BYLINE],
            LINK: row[LINK],
            DATE: row[DATE],
            AUTHOR: row[AUTHOR]
        })

pd.DataFrame(csv_dict).to_csv(
    os.path.join(SCRAP_TOPICS_DIR, "custom_scrap_topic.csv"), 
    encoding='utf-8', 
    index=False
)