In [1]:
import os
os.getcwd()


'/Users/lulu/Documents/GitHub/deepseek-news-analysis/DeepSeek_Media_Analysis/3_TF-IDF'

In [4]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import glob

# 1. Load files
directory_path = "../1_Datacollection/corpus_deepseek"
text_files = glob.glob(f"{directory_path}/*.txt")

texts = []
titles = []

for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        texts.append(f.read())
        titles.append(Path(filepath).stem)  # file name without extension

# 2. TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.6, min_df=1)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# 3. Find top 10 words for each document
top_n = 10
records = []

for doc_idx, title in enumerate(titles):
    row = tfidf_matrix[doc_idx].toarray().flatten()
    top_indices = row.argsort()[-top_n:][::-1]  # top 10
    for rank, i in enumerate(top_indices, start=1):
        records.append({
            "document": title,
            "rank": rank,
            "term": feature_names[i],
            "tfidf": row[i]
        })
# 4. create the tfidf DataFrame
top_tfidf = pd.DataFrame(records)
print(top_tfidf[:10])


                                            document  rank          term  \
0  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     1       america   
1  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     2      industry   
2  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     3         spend   
3  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     4       company   
4  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     5       leaving   
5  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     6    leadership   
6  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     7  capabilities   
7  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     8   development   
8  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     9         chips   
9  What_is_DeepSeek,_the_Chinese_AI_startup_that_...    10       doubled   

      tfidf  
0  0.162779  
1  0.145042  
2  0.141637  
3  0.136892  
4  0.115065  
5  0.114453  
6  0.111769  
7  0.110674  
8  0.107724  
9  0.105845  


In [6]:
# Save data
top_tfidf.to_csv("./outputs/top_tfidf_results.csv", index=False, encoding="utf-8")

In [13]:

import altair as alt

from pathlib import Path
import glob

# 5. Load CSV file tfidf DataFrame we created in 
top_tfidf = pd.read_csv('./outputs/top_tfidf_results.csv')

# If you want to specify the order of documents on the y-axis
titles = top_tfidf['document'].unique().tolist()

# 4. Optional: highlight some terms
term_list = ['power', 'security']  

# 5. Altair heatmap
base = alt.Chart(top_tfidf).encode(
    x=alt.X('rank:O', title='Top Rank in Document'),
    y=alt.Y('document:N', title='Document', sort=titles)
)

heatmap = base.mark_rect().encode(
    color=alt.Color('tfidf:Q', scale=alt.Scale(scheme='blues'), title='TF-IDF Score')
)

circle = base.mark_circle(size=100).encode(
    color=alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

text = base.mark_text(baseline='middle', fontSize=10).encode(
    text='term:N',
    color=alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('black')
    )
)

chart = (heatmap + circle + text).properties(width=600, height=2000)
chart
