In [None]:
! pip3 --version

In [None]:
! pip3 install deep-translator   

In [None]:
! pip3 install gensim

In [None]:
! pip3 install nltk

In [None]:
! pip3 install spacy

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
! python3 -m spacy download en_core_web_sm

In [12]:
import re
import spacy
from nltk.corpus import wordnet as wn

nlp = spacy.load("en_core_web_sm")
# Despite its name, this pattern matches ANY character that is not an ASCII
# letter or an apostrophe. Space, hyphen and underscore are listed explicitly,
# but they are also covered by the negated class, so tokens containing digits,
# punctuation or non-Latin characters are rejected as well.
is_multiword=re.compile(r"[ \-\_]|[^A-Za-z']")
def is_english_word(lemma):
    """Return True when WordNet lists at least one synset for ``lemma``."""
    synsets = wn.synsets(lemma)
    return bool(synsets)
def filter_english_variants(base_word, neighbors):
    """Reduce a neighbor list to distinct single-token English lemmas.

    Args:
        base_word: The query word; neighbors whose lemma equals the lemma of
            this word are dropped as trivial variants.
        neighbors: Iterable of ``(word, similarity)`` pairs.

    Returns:
        dict mapping each surviving lemma (lower-cased) to the similarity of
        the first neighbor that produced it. A neighbor is dropped when it is
        empty, matches ``is_multiword``, lemmatizes to the base lemma, or has
        no WordNet synset.
    """
    base_lemma = nlp(base_word)[0].lemma_.lower()
    filtered = {}
    for w, sim in neighbors:
        # Reject empty tokens up front: ``nlp("")`` yields an empty Doc, so
        # indexing it would raise IndexError and abort the whole comparison.
        # The regex test also runs first so the spaCy pipeline is only invoked
        # for candidates that can actually be kept.
        if not w or is_multiword.search(w):
            continue

        lemma = nlp(w)[0].lemma_.lower()
        if lemma == base_lemma or not is_english_word(lemma):
            continue

        # Keep the similarity of the first occurrence of each lemma.
        filtered.setdefault(lemma, sim)
    return filtered


In [16]:
from gensim.models import KeyedVectors


In [25]:
# Load the Chinese word vectors from a word2vec-format file expected in the
# current working directory.
# NOTE(review): the file name suggests the 300-dimensional fastText Common
# Crawl vectors - confirm provenance and record the download source.
ft_zh = KeyedVectors.load_word2vec_format('cc.zh.300.vec')


In [None]:
# Sanity check: the 10 nearest neighbors of "民主" in the Chinese vector space.
neighbors = ft_zh.most_similar("民主", topn=10)
print(neighbors)

In [None]:
# Load the English word vectors from a word2vec-format file expected in the
# current working directory, then run the same sanity check as for Chinese.
# Note that this rebinds `neighbors`, which the previous cell also assigned.
ft_en = KeyedVectors.load_word2vec_format('cc.en.300.vec')
neighbors = ft_en.most_similar("democracy", topn=10)
print(neighbors)

In [None]:
from deep_translator import GoogleTranslator

# Smoke test of the translator: Simplified Chinese -> English for one word.
translated = GoogleTranslator(source='zh-CN', target='en').translate("民主")
print(translated)

In [29]:
def translate_and_filter(base_word_zh, neighbors_zh):
    """Translate Chinese neighbors to English and filter them.

    Args:
        base_word_zh: The Chinese query word.
        neighbors_zh: Iterable of ``(chinese_word, similarity)`` pairs.

    Returns:
        The dict produced by ``filter_english_variants`` for the translated
        neighbors, or ``None`` when the base word has no usable single-token
        English translation (empty result or a match of ``is_multiword``).
    """
    # One translator instance is enough; there is no need to rebuild it per word.
    translator = GoogleTranslator(source='zh-CN', target='en')

    # Check the base word first so no neighbor translations are requested when
    # the result is going to be discarded anyway.
    base_word = translator.translate(base_word_zh)
    if not base_word or is_multiword.search(base_word):
        return None

    translated = []
    for zh_word, sim in neighbors_zh:
        en = translator.translate(zh_word)
        # Skip neighbors with no translation instead of failing on ``.lower()``
        # or passing an empty token downstream.
        if not en:
            continue
        translated.append((en.lower(), sim))
    return filter_english_variants(base_word, translated)

In [30]:
def jaccard_top_k(dict1, dict2, k):
    """Jaccard index of the ``k`` highest-valued keys of two score dicts.

    Each dict maps a key to a numeric score. The keys of the ``k`` largest
    scores are taken from each dict and the size of their intersection divided
    by the size of their union is returned. If both key sets are empty the
    result is ``0.0``.
    """
    def _leading_keys(scores):
        # Stable sort, highest score first, then keep only the keys.
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return {key for key, _ in ranked[:k]}

    first = _leading_keys(dict1)
    second = _leading_keys(dict2)

    combined = first | second
    if not combined:
        return 0.0

    shared = first & second
    return len(shared) / len(combined)

In [77]:
def compare_words(en_word, ch_word=None, topn=100, k=10, verbose=True):
    """Compare the neighborhoods of a word in the English and Chinese spaces.

    Args:
        en_word: English query word, looked up in ``ft_en``.
        ch_word: Chinese counterpart, looked up in ``ft_zh``. When ``None`` it
            is obtained by translating ``en_word`` to Simplified Chinese.
        topn: Number of nearest neighbors requested from each vector space.
        k: Number of top filtered neighbors compared by ``jaccard_top_k``.
        verbose: When True, print the raw and filtered neighbor lists.

    Returns:
        ``(filtered_zh, filtered_en, jaccard)`` on success, or
        ``(None, None, None)`` if any step fails or the Chinese word has no
        usable single-token English translation. Failures are reported with a
        printed message instead of being raised.
    """
    try:
        if ch_word is None:
            ch_word = GoogleTranslator(source='en', target='zh-CN').translate(en_word)
        neighbors_zh = ft_zh.most_similar(ch_word, topn=topn)
        filtered_zh = translate_and_filter(ch_word, neighbors_zh)
        # translate_and_filter signals an unusable base word with None. Handle
        # it here rather than letting jaccard_top_k fail with an unrelated
        # AttributeError after the English side has already been computed.
        if filtered_zh is None:
            print("Error:", f"no single-word English translation for {ch_word!r}")
            return None, None, None
        neighbors = ft_en.most_similar(en_word, topn=topn)
        filtered_neighbors = filter_english_variants(en_word, neighbors)
        if verbose:
            print(neighbors_zh)
            print(filtered_zh)
            print(neighbors)
            print(filtered_neighbors)
        jaccard = jaccard_top_k(filtered_zh, filtered_neighbors, k)
        return filtered_zh, filtered_neighbors, jaccard
    except Exception as e:
        # Deliberately best-effort: batch runs should continue past a word
        # that is out of vocabulary or fails to translate.
        print("Error:", e)
        return None, None, None

In [None]:
compare_words("democracy", "民主")

In [None]:
compare_words("China", "中国")

In [None]:
compare_words("republic")

In [None]:
compare_words("Xinjiang", "新疆")

In [None]:
compare_words("blue", "蓝")

In [None]:
compare_words("red", "红")

In [None]:
compare_words("constitutionalism")

In [55]:
from tqdm import tqdm


In [68]:
words =[]


In [None]:

# Run the cross-lingual comparison for every entry of `words` and collect one
# record per word. Appending inside the loop keeps the partial results
# available if the run is interrupted.
results = []
for w in tqdm(words):
    # compare_words returns (None, None, None) on failure, so failed words are
    # still recorded, with null fields.
    zh, en, jac = compare_words(w)
    results.append({
        "word": w,
        "filtered_zh": zh,
        "filtered_en": en,
        "jaccard": jac
    })

In [70]:
import json
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is fixed to UTF-8 instead of relying on the platform default. The default can
# differ between machines and may be unable to encode such characters.
with open("word_comparisons.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
compare_words("legislative")

In [102]:
import numpy as np

def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Returns ``0.0`` when either vector has zero norm. The plain formula would
    divide by zero there and produce NaN, which compares False against
    everything and would silently skew any later ``>`` comparison.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator


def polarity_by_cosine(word_en, good_en="good", bad_en="bad", good_zh="好", bad_zh="坏"):
    """Label a word as closer to a "good" or a "bad" anchor in each language.

    The English word is looked up in ``ft_en`` and its Simplified Chinese
    translation in ``ft_zh``. In each space the word vector is compared by
    cosine similarity with that language's two anchor words.

    Args:
        word_en: English word to score.
        good_en, bad_en: English anchor words.
        good_zh, bad_zh: Chinese anchor words.

    Returns:
        dict with keys ``"english"`` and ``"chinese"`` (each ``"good"`` or
        ``"bad"``; ties resolve to ``"bad"``), plus ``"sim_en"`` and
        ``"sim_zh"``, each a ``(similarity_to_good, similarity_to_bad)`` tuple.
        Returns ``None`` after printing a message if translation or any vector
        lookup fails.
    """
    try:
        ch_word = GoogleTranslator(source='en', target='zh-CN').translate(word_en)
        if not ch_word:
            raise ValueError("no Chinese translation returned")

        v_en = ft_en[word_en]
        v_good_en = ft_en[good_en]
        v_bad_en = ft_en[bad_en]

        v_zh = ft_zh[ch_word]
        v_good_zh = ft_zh[good_zh]
        v_bad_zh = ft_zh[bad_zh]

        sim_good_en = cosine_similarity(v_en, v_good_en)
        sim_bad_en = cosine_similarity(v_en, v_bad_en)

        sim_good_zh = cosine_similarity(v_zh, v_good_zh)
        sim_bad_zh = cosine_similarity(v_zh, v_bad_zh)

        # Strict '>' on purpose: a tie is labelled "bad", as before.
        polarity_en = "good" if sim_good_en > sim_bad_en else "bad"
        polarity_zh = "good" if sim_good_zh > sim_bad_zh else "bad"

        return {
            "english": polarity_en,
            "chinese": polarity_zh,
            "sim_en": (sim_good_en, sim_bad_en),
            "sim_zh": (sim_good_zh, sim_bad_zh)
        }

    except Exception as e:
        # Best-effort by design so a batch run survives individual failures.
        # The message names the word so skipped entries can be traced.
        print(f"Error for {word_en!r}:", e)
        return None


In [None]:
result = polarity_by_cosine("annoyed")
print(result)

In [116]:
def run_polarity_over_list(word_list):
    """Score every word with ``polarity_by_cosine`` and tally the labels.

    Words for which ``polarity_by_cosine`` returns ``None`` are left out of
    both the details and the counts.

    Returns:
        dict with ``"details"`` (one record per scored word: the word plus the
        fields of its polarity result) and ``"summary"`` (counts of good/bad
        labels per language and of agreement/disagreement between languages).
    """
    details = []
    for word in word_list:
        outcome = polarity_by_cosine(word)
        if outcome is not None:
            details.append({"word": word, **outcome})

    # Every scored word is either "good" or not, and either agrees or not, so
    # the complementary counts follow from the total.
    total = len(details)
    english_good = sum(1 for row in details if row["english"] == "good")
    chinese_good = sum(1 for row in details if row["chinese"] == "good")
    agreement = sum(1 for row in details if row["english"] == row["chinese"])

    return {
        "details": details,
        "summary": {
            "english_good": english_good,
            "english_bad": total - english_good,
            "chinese_good": chinese_good,
            "chinese_bad": total - chinese_good,
            "agreement": agreement,
            "disagreement": total - agreement
        }
    }


In [None]:
# Words to score; the list is left empty here and must be filled in before running.
words = []




# NOTE: this rebinds `results`. The earlier batch-comparison cell used the same
# name for a list of records; from here on it is the dict returned by
# run_polarity_over_list (keys "details" and "summary").
results = run_polarity_over_list(words)

print(results["summary"])

In [135]:
import csv

# Flatten the per-word polarity records into a CSV with one row per word.
details = results["details"]

# The encoding is set explicitly so the file does not depend on the platform
# default. newline='' is what the csv module expects for files it writes.
with open("words_results.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "word",
        "english",
        "chinese",
        "sim_good_en",
        "sim_bad_en",
        "sim_good_zh",
        "sim_bad_zh"
    ])

    for d in details:
        # Each "sim_*" entry is a (similarity_to_good, similarity_to_bad) pair.
        sim_en_good, sim_en_bad = d["sim_en"]
        sim_zh_good, sim_zh_bad = d["sim_zh"]

        writer.writerow([
            d["word"],
            d["english"],
            d["chinese"],
            sim_en_good,
            sim_en_bad,
            sim_zh_good,
            sim_zh_bad
        ])


In [None]:
! pip3 install pandas

In [None]:
import pandas as pd
import os

# Suffix shared by the per-category result files; the category name is the
# part of the file name that precedes it.
POLARITY_SUFFIX = "_polarity_results.csv"

filenames = [
    "ideology_polarity_results.csv",
    "legal_polarity_results.csv",
    "institution_polarity_results.csv",
    "movement_polarity_results.csv",
    "political_theory_polarity_results.csv",
    "neutral_polarity_results.csv",
    "process_polarity_results.csv",
    "policy_polarity_results.csv"
]


def category_from_filename(file):
    """Derive the category label from a result file name.

    Strips ``POLARITY_SUFFIX`` when present. Otherwise falls back to the file
    name without its extension, rather than blindly cutting a fixed number of
    characters.
    """
    name = os.path.basename(file)
    if name.endswith(POLARITY_SUFFIX):
        return name[:-len(POLARITY_SUFFIX)]
    return os.path.splitext(name)[0]


dfs = []

for file in filenames:
    if os.path.exists(file):
        try:
            df = pd.read_csv(file)
            df['category'] = category_from_filename(file)
            dfs.append(df)
        except Exception as e:
            print(f"Error loading {file}: {e}")
    else:
        print(f"File not found: {file}")

# combined_df is only defined when at least one file was loaded; the cells
# that follow depend on it.
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    display(combined_df)
else:
    print("No dataframes")

In [None]:
combined_df

In [None]:

# Count how often each word occurs across all categories and list the ones
# that appear more than once.
word_counts = combined_df["word"].value_counts()
repeated_words = word_counts.loc[word_counts.gt(1)]
print(repeated_words)

In [4]:
# Boolean flag columns: True where the language's polarity label is "good".
combined_df["english_good"] = combined_df["english"].eq("good")
combined_df["chinese_good"] = combined_df["chinese"].eq("good")

In [None]:
! pip3 install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Reshape to long form: one row per (word, language) with a 0/1 "Is_Good"
# value, so the bar height (the mean) is the proportion of words marked good.
plot_data = combined_df.melt(id_vars=['category'], value_vars=['english_good', 'chinese_good'], var_name='Language', value_name='Is_Good')
plot_data['Is_Good'] = plot_data['Is_Good'].astype(int)
plt.figure(figsize=(12, 6))
# errorbar=None replaces the deprecated ci=None for switching off error bars.
sns.barplot(data=plot_data, x='category', y='Is_Good', hue='Language', errorbar=None)
plt.title('Percentage of Words Marked Closer to Good than Bad by Category and Language')
plt.ylabel('Proportion Marked as Good (0.0 to 1.0)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Per-category Pearson correlation between the English and Chinese "good"
# flags. Selecting the two flag columns before apply keeps the grouping column
# out of the applied frame, which recent pandas versions warn about.
# NOTE(review): the correlation is presumably NaN for a category in which one
# of the flags is constant - confirm how such categories should be shown.
correlations = (
    combined_df
    .groupby('category')[['english_good', 'chinese_good']]
    .apply(lambda x: x['english_good'].corr(x['chinese_good']))
    .reset_index(name='correlation')
)
correlations = correlations.sort_values('correlation', ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(data=correlations, x='correlation', y='category', palette='coolwarm')
plt.axvline(0, color='black', linestyle='--') 
plt.title('Correlation between being Marked as Good in English and Chinese')
plt.xlabel('Pearson Correlation Coefficient')
plt.tight_layout()
plt.show()