In [2]:
# The negative texts are now collected in one folder, so we can run a
# TF-IDF analysis to find out which terms are most important in each document.

from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import glob
import pandas as pd

# 1. Load files
# Each .txt file in the directory is one document; the file name (without
# extension) is used as the document title.
directory_path = "../negative_DeepSeek_texts"
# glob makes no ordering guarantee (it depends on the file system), so sort
# the paths to get the same document order on every run and every machine.
text_files = sorted(glob.glob(f"{directory_path}/*.txt"))
if not text_files:
    # Fail early with a clear message instead of letting the vectorizer
    # raise an "empty vocabulary" error further down.
    raise FileNotFoundError(f"No .txt files found in {directory_path!r}")

texts = []
titles = []

for filepath in text_files:
    path = Path(filepath)
    texts.append(path.read_text(encoding='utf-8'))
    titles.append(path.stem)  # file name without extension

# 2. TF-IDF
# Remove English stop words and ignore terms that appear in more than 60% of
# the documents; min_df=1 keeps terms that occur in only one document.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.6, min_df=1)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# 3. Find top 10 words for each document
top_n = 10
records = []

for doc_idx, title in enumerate(titles):
    # Dense 1-D vector of this document's scores, one entry per vocabulary term.
    row = tfidf_matrix[doc_idx].toarray().flatten()
    top_indices = row.argsort()[-top_n:][::-1]  # highest score first
    for rank, i in enumerate(top_indices, start=1):
        if row[i] <= 0:
            # Scores are in descending order, so from here on the terms do
            # not occur in this document at all; do not report them as
            # "top terms" of a document that has fewer than top_n terms.
            break
        records.append({
            "document": title,
            "rank": rank,
            "term": feature_names[i],
            "tfidf": row[i]
        })

# 4. create the tfidf DataFrame (one row per document/rank pair)
top_tfidf = pd.DataFrame(records)
print(top_tfidf.head(10))


                                document  rank          term     tfidf
0  Global_AI_rivalry_is_a_dangerous_game     1           agi  0.409378
1  Global_AI_rivalry_is_a_dangerous_game     2         level  0.174159
2  Global_AI_rivalry_is_a_dangerous_game     3         human  0.163648
3  Global_AI_rivalry_is_a_dangerous_game     4        altman  0.130918
4  Global_AI_rivalry_is_a_dangerous_game     5  policymakers  0.130619
5  Global_AI_rivalry_is_a_dangerous_game     6     strategic  0.130619
6  Global_AI_rivalry_is_a_dangerous_game     7          risk  0.130619
7  Global_AI_rivalry_is_a_dangerous_game     8          plan  0.130619
8  Global_AI_rivalry_is_a_dangerous_game     9          term  0.111649
9  Global_AI_rivalry_is_a_dangerous_game    10            xi  0.111649


In [8]:
# Save data
# Create the output folder first: to_csv raises OSError when the target
# directory does not exist yet.
output_path = Path("../outputs/tfidf_negative_texts.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
top_tfidf.to_csv(output_path, index=False, encoding="utf-8")

In [4]:

import altair as alt

from pathlib import Path
import glob

# 5. Load the TF-IDF CSV file written by the "Save data" cell.
# keep_default_na=False stops pandas from converting terms such as "null" or
# "nan" into missing values; the numeric columns are still parsed as numbers.
top_tfidf = pd.read_csv('../outputs/tfidf_negative_texts.csv', keep_default_na=False)

# Keep the documents on the y-axis in the order they appear in the CSV
titles = top_tfidf['document'].unique().tolist()

# 6. Optional: highlight some terms
term_list = ['political']
# Built once and shared by the circle and text layers so both always
# highlight exactly the same terms.
is_highlighted = alt.FieldOneOfPredicate(field='term', oneOf=term_list)

# 7. Altair heatmap
# Shared encoding: one column per rank, one row per document.
base = alt.Chart(top_tfidf).encode(
    x=alt.X('rank:O', title='Top Rank in Document'),
    y=alt.Y('document:N', title='Document', sort=titles)
)

# Background cells coloured by TF-IDF score.
heatmap = base.mark_rect().encode(
    color=alt.Color('tfidf:Q', scale=alt.Scale(scheme='blues'), title='Negative texts:TF-IDF Score')
)

# Red marker on highlighted terms; fully transparent everywhere else.
circle = base.mark_circle(size=100).encode(
    color=alt.condition(
        is_highlighted,
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Term label in every cell; highlighted terms are drawn in red.
text = base.mark_text(baseline='middle', fontSize=10).encode(
    text='term:N',
    color=alt.condition(
        is_highlighted,
        alt.value('red'),
        alt.value('black')
    )
)

# Layer order matters: heatmap at the bottom, labels on top.
chart = (heatmap + circle + text).properties(
     width=600, 
     height=200,
     title="TF-IDF Heatmap of Negative Texts"
)
chart
