In [17]:
!/usr/local/anaconda3/envs/numpy_env/bin/pip install vl-convert-python

Collecting vl-convert-python
  Downloading vl_convert_python-1.8.0-cp37-abi3-macosx_10_12_x86_64.whl.metadata (5.2 kB)
Downloading vl_convert_python-1.8.0-cp37-abi3-macosx_10_12_x86_64.whl (30.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.0/30.0 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: vl-convert-python
Successfully installed vl-convert-python-1.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/anaconda3/envs/numpy_env/bin/python -m pip install --upgrade pip[0m


In [19]:
import os
# Display the kernel's current working directory: the relative paths used in
# this notebook ("../1_Datacollection/...", "./outputs/...") resolve against it.
os.getcwd()


'/Users/lulu/Documents/GitHub/Media_Text_Analysis/3_TF-IDF'

In [21]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import glob

# 1. Load files
directory_path = "../1_Datacollection/corpus_deepseek"
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the document order (and so the row order of the results) reproducible.
text_files = sorted(glob.glob(f"{directory_path}/*.txt"))

# Fail early with a clear message rather than with a less specific error
# raised later by the vectorizer on an empty corpus.
if not text_files:
    raise FileNotFoundError(f"No .txt files found in {directory_path!r}")

texts = []
titles = []

for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        texts.append(f.read())
    titles.append(Path(filepath).stem)  # file name without extension

# 2. TF-IDF
# max_df=0.6 drops terms that occur in more than 60% of the documents;
# min_df=1 keeps terms that occur in at least one document.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.6, min_df=1)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# 3. Find top 10 words for each document
top_n = 10
records = []

for doc_idx, title in enumerate(titles):
    # Dense 1-D vector of this document's scores, one entry per vocabulary term.
    row = tfidf_matrix[doc_idx].toarray().flatten()
    # Indices of the top_n highest scores, best first.
    top_indices = row.argsort()[-top_n:][::-1]
    # A score of 0 means the term does not occur in this document; skip those
    # so that short documents are not padded with unrelated vocabulary.
    scored_indices = [i for i in top_indices if row[i] > 0]
    for rank, i in enumerate(scored_indices, start=1):
        records.append({
            "document": title,
            "rank": rank,
            "term": feature_names[i],
            "tfidf": row[i]
        })

# 4. create the tfidf DataFrame
# Explicit columns keep the schema stable even if no record was produced.
top_tfidf = pd.DataFrame(records, columns=["document", "rank", "term", "tfidf"])
print(top_tfidf.head(10))


                                            document  rank          term  \
0  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     1       america   
1  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     2      industry   
2  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     3         spend   
3  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     4       leaving   
4  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     5    leadership   
5  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     6  capabilities   
6  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     7   development   
7  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     8         chips   
8  What_is_DeepSeek,_the_Chinese_AI_startup_that_...     9       doubled   
9  What_is_DeepSeek,_the_Chinese_AI_startup_that_...    10    investment   

      tfidf  
0  0.162264  
1  0.146921  
2  0.143096  
3  0.116127  
4  0.115610  
5  0.113004  
6  0.111998  
7  0.109125  
8  0.106839  
9  0.105219  


In [23]:
# Save data
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without ./outputs.
output_csv = Path("./outputs/top_tfidf_results.csv")
output_csv.parent.mkdir(parents=True, exist_ok=True)
top_tfidf.to_csv(output_csv, index=False, encoding="utf-8")

In [40]:

import altair as alt

from pathlib import Path
import glob

# Reload the top-terms table from the CSV file
top_tfidf = pd.read_csv('./outputs/top_tfidf_results.csv')

# Documents in order of first appearance; used to fix the y-axis order
titles = top_tfidf['document'].unique().tolist()

# Optional: terms to highlight in red on the chart
term_list = ['power', 'security']


def _highlight_color(fallback):
    """Return a colour condition: 'red' for terms in term_list, else `fallback`."""
    return alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value(fallback),
    )


# Altair heatmap: one cell per (rank, document), shared by all three layers
x_rank = alt.X('rank:O', title='Top Rank in Document')
y_document = alt.Y('document:N', title='Document', sort=titles)
base = alt.Chart(top_tfidf).encode(x=x_rank, y=y_document)

# Layer 1: rectangles shaded by TF-IDF score
score_color = alt.Color('tfidf:Q', scale=alt.Scale(scheme='blues'), title='TF-IDF Score')
heatmap = base.mark_rect().encode(color=score_color)

# Layer 2: red marker on highlighted terms, fully transparent elsewhere
circle = base.mark_circle(size=100).encode(color=_highlight_color('#FFFFFF00'))

# Layer 3: the term itself, red when highlighted and black otherwise
text = base.mark_text(baseline='middle', fontSize=10).encode(
    text='term:N',
    color=_highlight_color('black'),
)

layered = heatmap + circle + text
sized = layered.properties(width=600, height=2000, title="tfidf_results_all_texts")
chart = sized.configure_title(
    fontSize=20,
    font="Arial",
    anchor="start",   # "start" | "middle" | "end"
    color="black",
)


chart


In [42]:
# Save as PNG
# Create the target directory first: chart.save does not create missing
# parent directories and would fail without an existing visualizations/.
png_path = Path("visualizations/tfidf_all_texts.png")
png_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(str(png_path))