In [31]:
import pandas as pd
import spacy
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet

In [3]:
# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

In [13]:
# 加载拼写检查器
spell = SpellChecker()

In [14]:
# Step 1: 词形归一化（Lemmatization）
def lemmatize_word(word):
    doc = nlp(word.lower())
    return doc[0].lemma_  # 只取第一个单词的 Lemma

In [15]:
# Step 2: 拼写检查（Correct Spelling）
def correct_spelling(word):
    corrected = spell.correction(word)
    return corrected if corrected else word  # 如果拼写检查找不到，就返回原始单词

In [55]:
def get_derivational_root(word):
    """使用 WordNet 找到派生词的根词（尝试转换为动词或名词）"""
    synsets = wordnet.synsets(word)
    
    for syn in synsets:
        for lemma in syn.lemmas():
            related_forms = lemma.derivationally_related_forms()
            for related in related_forms:
                root_word = related.name()
                
                related_synsets = wordnet.synsets(root_word)
                for related_syn in related_synsets:
                    if related_syn.pos() in ['v', 'n']:  # 只接受动词或名词
                        return root_word

    return word  # 如果找不到更合适的词根，则保持原样

In [32]:
# Step 3: 数据内部的同义词归一化
def get_internal_synonyms(values, threshold=0.8):
    """在数据集中找到相似的单词并归类"""
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(values)
    
    # 计算所有单词的余弦相似度
    cosine_sim = cosine_similarity(tfidf_matrix)
    
    # 构建同义词归类映射
    synonym_map = {}
    words = np.array(values)

    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            if cosine_sim[i, j] > threshold:  # 相似度超过阈值
                root_word = words[i]  # 以较短的单词作为基准
                synonym_map[words[j]] = root_word  # 归一化

    return synonym_map

In [41]:
# Step 4: 清理文本
def clean_text(text):
    words = text.split()  # 按空格拆分单词
    cleaned_words = []
    for word in words:
        corrected = correct_spelling(word)  # 先拼写检查
        lemma = lemmatize_word(corrected)  # 再做 Lemmatization
        cleaned_words.append(corrected)
    return " ".join(cleaned_words)  # 重新拼接成句子

In [42]:
# 读取 Excel 文件
file_path = "/Users/jiazhengtian/Desktop/Combined_data.xlsx"  # 替换成你的文件路径
df = pd.read_excel(file_path)

In [29]:
# 组合 From & To 列并去重，获取唯一值
values = list(set(df["From"].astype(str)) | set(df["To"].astype(str)))

In [39]:
synonym_map = get_internal_synonyms(values,threshold=0.7)
synonym_df = pd.DataFrame(list(synonym_map.items()), columns=["Original", "Standardized"])

In [44]:
# 处理 From 和 To 列
df["From_Cleaned"] = df["From"].astype(str).apply(lambda x: clean_text(x))
df["To_Cleaned"] = df["To"].astype(str).apply(lambda x: clean_text(x))

# 保存到新 Excel 文件
output_path = "cleaned_values.xlsx"
df.to_excel(output_path, index=False)

In [40]:
synonym_df

Unnamed: 0,Original,Standardized
0,Friendship,friendship
1,Friends & Family,Family/Friends
2,Friends + Family,Friends & Family
3,Family + Friends,Friends + Family
4,Family & Friends,Family + Friends
...,...,...
73,Feeling alive,Feeling Alive
74,accomplishment,Accomplishment
75,Happiness,Happiness/Love
76,Stong Work Ethic,Work Ethic


In [45]:
lemmatize_word("adapting")

'adapt'

In [53]:
lemma_word = lemmatize_word("adaptability")

In [56]:
get_derivational_root(lemma_word)

'adaptability'