## Count files

In [1]:
import json

# Path to the JSON file whose entries are counted.
file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_not_a_term_doc.json"

# Load the JSON file and count its top-level entries.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of being left to the platform's locale default.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

item_count = len(data)
print("항목의 개수:", item_count)


항목의 개수: 31576


## Extract the first sentence containing the target phrase (e.g. "is not a word") from each document

In [None]:
import json
import spacy

# spaCy English pipeline; the extraction function below uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")

# Input: JSON file of documents to scan. Output: JSON file that receives the extracted sentences.
input_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_a_made_up_word_doc.json"
output_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_a_made_up_word_complete_sentence.json"

def extract_complete_sentence_with_query(input_file, output_file, query="is a made up word"):
    """Save the first sentence containing ``query`` from each document.

    Args:
        input_file: Path to a JSON file holding a list of documents. Each
            document may have a ``"spans"`` list; the first element of every
            span is joined with spaces to form the document text.
        output_file: Path of the JSON file that receives the list of
            extracted sentences.
        query: Substring (case-sensitive) that a sentence must contain.

    At most one sentence is kept per document; documents without a matching
    sentence contribute nothing.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process the documents in batches with nlp.pipe. The texts are produced
    # lazily so the joined documents are not all held in memory at once.
    texts = (" ".join(span[0] for span in document.get("spans", [])) for document in data)
    docs = nlp.pipe(texts, batch_size=100)

    sentences_with_query = []
    for doc in docs:
        for sentence in doc.sents:
            if query in sentence.text:
                sentences_with_query.append(sentence.text.strip())
                break  # only the first matching sentence of a document is kept

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sentences_with_query, f, indent=4)

    print(f"Extracted complete sentences saved to {output_file}")

# Run the extraction with the default query on the paths configured above.
extract_complete_sentence_with_query(input_file_path, output_file_path)

Extracted complete sentences saved to /home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_a_made_up_word_complete_sentence.json


## Check the dependency arcs of the target word

In [6]:
import json
import spacy
from collections import defaultdict

# spaCy English pipeline that supplies the dependency parse inspected below.
nlp = spacy.load("en_core_web_sm")

# JSON file holding the list of sentences to analyse.
input_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_not_a_word_complete_sentence.json"

def group_arcs_by_dependency(input_file, target_word="word"):
    """Group the direct dependents of ``target_word`` by dependency label.

    Args:
        input_file: Path to a JSON file containing a list of sentences.
        target_word: Token text to look for; compared against the
            lower-cased token text.

    Returns:
        A ``defaultdict(list)`` mapping each dependency label to the texts of
        the child tokens attached to a matching token with that label.
    """
    with open(input_file, 'r') as handle:
        sentence_list = json.load(handle)

    examples_by_dep = defaultdict(list)
    for text in sentence_list:
        # Every occurrence of the target word in the parsed sentence.
        hits = (tok for tok in nlp(text) if tok.text.lower() == target_word)
        for hit in hits:
            for dependent in hit.children:
                examples_by_dep[dependent.dep_].append(dependent.text)

    return examples_by_dep

# Collect the dependents of the default target word, grouped by dependency label.
arc_groups = group_arcs_by_dependency(input_file_path)

# Show up to five example dependents for each label.
for arc, examples in arc_groups.items():
    print(f"Arc '{arc}': Examples -> {examples[:5]}")

Arc 'det': Examples -> ['a', 'a', 'a', 'a', 'a']
Arc 'relcl': Examples -> ['recognized', 'seen', 'embrace', 'use', 'hear']
Arc 'prep': Examples -> ['in', 'about', 'on', 'in', 'about']
Arc 'acl': Examples -> ['found', 'describe', 'is', 'used', 'limited']
Arc 'punct': Examples -> [',', ',', ':', "'", "'"]
Arc 'appos': Examples -> ['word', '=', 'word[::-2', 'word', 'translation']
Arc 'neg': Examples -> ['not', 'not', 'not', 'not', 'not']
Arc 'amod': Examples -> ['single', 'appropriate', '-for', 'nice', '-for']
Arc 'ccomp': Examples -> ['is', 'endures', 'seemed', 'is', 'is']
Arc 'cc': Examples -> ['but', 'but', 'but', 'but', 'and']
Arc 'conj': Examples -> ['vibration', 'way', 'translation', 'image', 'great']
Arc 'advmod': Examples -> ['even', 'anywhere', 'all', 'all', 'here']
Arc 'nummod': Examples -> ['-by', '-by', 'one', '1', 'one']
Arc 'poss': Examples -> ['Dictionaries', 'His', 'Dictionaries', 'my', 'week']
Arc 'dep': Examples -> ['\n', '\n    ', '\n', '\n', '\n    ']
Arc 'compound': E

## Filter the candidate words and save them to a CSV file

In [15]:
import json
import csv
import spacy
from nltk.corpus import wordnet, stopwords
import nltk
import re
import os

# Merge data

# Ensure that you have downloaded wordnet and stopwords data
# nltk.download("wordnet")
# nltk.download("stopwords")

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Load stop words from NLTK
stop_words = set(stopwords.words("english"))

def is_real_word(word, tag):
    """Decide whether ``word`` should be treated as an existing word.

    Args:
        word: Raw token text.
        tag: Fine-grained part-of-speech tag of the token.

    Returns:
        bool: True when the cleaned word is a stop word, is shorter than three
        characters, is tagged as a proper noun (``NNP``/``NNPS``), or has at
        least one WordNet synset; False otherwise.
    """
    # Clean the word by removing punctuation and stripping whitespace
    word_cleaned = re.sub(r'[^\w\s]', '', word.lower()).strip()

    # Stop words, very short tokens and proper nouns count as "real" so that
    # the caller skips them instead of recording them as non-words.
    if word_cleaned in stop_words or len(word_cleaned) < 3 or tag in ["NNP", "NNPS"]:
        return True

    # wordnet.synsets returns a list; convert it so every path returns a bool.
    return bool(wordnet.synsets(word_cleaned))

def extract_not_words_from_sentences(sentences):
    """Collect the X of "X is not a word" from each sentence.

    For every token "word" with dependency ``attr`` whose head is "is" and
    whose head has a "not" child, the token immediately before "is" is the
    candidate. The candidate is kept only when "word" has no dependents to
    its right and ``is_real_word`` rejects it.

    Returns:
        A list of ``(candidate_text, original_sentence)`` tuples.
    """
    collected = []

    for text in sentences:
        parsed = nlp(text)
        for tok in parsed:
            # Anchor on the "word" token of the structure "X is not a word".
            # NOTE: this pattern has to be adapted when the target sentence changes.
            if tok.text.lower() != "word" or tok.dep_ != "attr":
                continue

            # The head must be "is" and must carry a "not" among its children.
            copula = tok.head
            if copula.text.lower() != "is":
                continue
            if not any(kid.text.lower() == "not" for kid in copula.children):
                continue

            # "X" is the token directly in front of "is"; nothing to take at position 0.
            copula_pos = copula.i
            if copula_pos <= 0:
                continue
            candidate_tok = parsed[copula_pos - 1]
            candidate = candidate_tok.text

            # Dependents to the right of "word" mean the phrase continues past it.
            trailing_children = any(kid.i > tok.i for kid in tok.children)

            if trailing_children or is_real_word(candidate, candidate_tok.tag_):
                print(f"Skipping '{candidate}' (exists in WordNet, is a common word/stop word, or is a proper noun).")
            else:
                print(f"Adding '{candidate}' to not_words list.")
                # Keep the word together with the sentence it came from.
                collected.append((candidate, text))

    return collected

def load_existing_csv(filename):
    """Load previously saved ``(word, sentence)`` pairs from a CSV file.

    Args:
        filename: Path to a CSV file with the columns ID, Not Word and
            Original Sentence, preceded by a header row.

    Returns:
        A list of ``(not_word, original_sentence)`` tuples. The list is empty
        when the file does not exist or contains no data rows.
    """
    if not os.path.exists(filename):
        return []

    # The csv module requires newline="" so that line breaks inside quoted
    # fields (sentences may span several lines) are parsed correctly.
    with open(filename, mode="r", newline="") as file:
        reader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        # Only 'Not Word' and 'Original Sentence' are kept. Blank or truncated
        # rows are skipped instead of raising IndexError.
        return [(row[1], row[2]) for row in reader if len(row) >= 3]

def save_not_words_to_csv(not_words, filename="/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_a_made_up_words.csv"):
    """Write ``(word, sentence)`` pairs to ``filename`` as a three-column CSV.

    The file is overwritten. It starts with the header
    ``ID, Not Word, Original Sentence``; IDs are assigned sequentially from 1
    in the order of ``not_words``.
    """
    header = ["ID", "Not Word", "Original Sentence"]
    with open(filename, mode="w", newline="") as out_handle:
        csv_out = csv.writer(out_handle)
        csv_out.writerow(header)
        # One row per pair: running ID, the word, and its original sentence.
        csv_out.writerows(
            [row_id, entry_word, entry_sentence]
            for row_id, (entry_word, entry_sentence) in enumerate(not_words, start=1)
        )

# Load the previously saved rows from the existing CSV file
output_csv_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_not_a_words.csv"
existing_data = load_existing_csv(output_csv_path)

# Extract candidate words from the new JSON file of sentences
input_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/complete_sentence/is_a_made_up_complete_sentence.json"
with open(input_file_path, 'r') as file:
    sentences = json.load(file)
new_not_words = extract_not_words_from_sentences(sentences)

# Merge the existing rows with the newly extracted ones
# NOTE(review): the result is written back to the same path that was just
# loaded, so re-running this cell appends the same new rows again.
all_not_words = existing_data + new_not_words

# Save the merged rows to the CSV file
save_not_words_to_csv(all_not_words, output_csv_path)
print(f"Not words updated and saved to {output_csv_path}")


Skipping '”' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'This' (exists in WordNet, is a common word/stop word, or is a proper noun).
Adding 'Porculent' to not_words list.
Adding 'youse' to not_words list.
Skipping '"' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'that' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'this' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'Festuche' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping '"' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping ''' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'that' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping '’' (exists in WordNet, is a common word/stop word, or is a proper noun).
Skipping 'Jenny' (exists in WordNet, is a common word/stop word, or is a proper

## Remove duplicated words

In [None]:
import pandas as pd

# Path of the CSV file to de-duplicate (it is rewritten in place)
csv_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/is_not_a_words.csv"

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Drop rows whose 'Not Word' repeats an earlier row when compared in lower
# case, keeping the first occurrence. `drop_duplicates(subset=...)` accepts
# column labels only, so passing the lower-cased values raises KeyError;
# a boolean mask from `Series.duplicated` is used instead.
df = df[~df['Not Word'].str.lower().duplicated(keep='first')]

# Write the de-duplicated frame back to the same CSV file
df.to_csv(csv_file_path, index=False)
print(f"Duplicates removed and saved to {csv_file_path}")


## Check the file

In [2]:
import pandas as pd

# Location of the CSV file to inspect
csv_file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/not_words.csv"

# Read it into a DataFrame
not_words_df = pd.read_csv(csv_file_path)

# Preview the first 20 rows
not_words_df.head(20)


Unnamed: 0,ID,Not Word
0,1,Aprecious
1,2,bithought
2,3,steroidic
3,4,meltaway
4,5,Ficking
5,6,Counterment
6,7,Oodle
7,8,collectivities
8,9,thrival
9,10,unspeak


In [None]:
import pandas as pd
from hunspell import HunSpell
import Levenshtein

# Hunspell spell checker built from the system en_US dictionary and affix files.
h = HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')

file_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/blocking words/blocking_words.csv" # modify your file path

# Candidate words to filter; the 'Not Word' column is read further down.
df = pd.read_csv(file_path)


def load_dictionary(dic_path):
    """Return the bare words listed in a Hunspell ``.dic`` file.

    In the Hunspell format the first line holds an approximate word count and
    each entry may carry affix flags after a ``/`` and tab-separated
    morphological fields. Those are not part of the word, so they are removed
    here; otherwise they would distort string-distance comparisons against
    the returned entries.

    Args:
        dic_path: Path to the ``.dic`` file.

    Returns:
        A list of words in file order, without blank lines.
    """
    words = []
    with open(dic_path, 'r') as f:
        for line_number, line in enumerate(f):
            entry = line.strip()
            if not entry:
                continue
            # The first line of a .dic file is the entry count, not a word.
            if line_number == 0 and entry.isdigit():
                continue
            # Cut off morphological fields, then the affix flags.
            word = entry.split("\t", 1)[0].split("/", 1)[0].strip()
            if word:
                words.append(word)
    return words

dictionary = load_dictionary("/usr/share/hunspell/en_US.dic")

def is_spacing_error(word):
    """Return True when ``word`` splits into two parts that both pass the spell checker.

    Every split position between the first and last character is tried; a
    word shorter than two characters has no split and yields False.
    """
    return any(
        h.spell(word[:cut]) and h.spell(word[cut:])
        for cut in range(1, len(word))
    )

def is_typo_or_spacing_error(word, dictionary, threshold=2):
    """Return True when ``word`` is far from every dictionary entry.

    The result is False when the word splits into two correctly spelled
    parts, or when the spell checker accepts it as is. Otherwise it is True
    only if the edit distance to the closest dictionary entry exceeds
    ``threshold``.

    Args:
        word: Candidate word.
        dictionary: Entries passed on to ``get_closest_word``.
        threshold: Largest edit distance still treated as a near match.
    """
    # Two run-together valid words, or an already valid word: reject.
    if is_spacing_error(word) or h.spell(word):
        return False

    _, edit_distance = get_closest_word(word, dictionary)
    return edit_distance > threshold

def get_closest_word(word, dictionary):
    """Find the dictionary entry with the smallest Levenshtein distance to ``word``.

    Returns:
        A ``(closest_entry, distance)`` tuple. On ties the entry that appears
        first in ``dictionary`` is returned.
    """
    def _edit_cost(candidate):
        return Levenshtein.distance(word, candidate)

    best_match = min(dictionary, key=_edit_cost)
    return best_match, _edit_cost(best_match)

# Rows of `df` that pass the filter below.
filtered_words = []

# Keep each row whose 'Not Word' value makes is_typo_or_spacing_error return True.
for _, row in df.iterrows():
    word = row['Not Word']
    if is_typo_or_spacing_error(word, dictionary):  
        filtered_words.append(row)

filtered_df = pd.DataFrame(filtered_words)

# Write the kept rows to a new CSV file without the index column.
output_path = "/home/work/jupyter/minwoo/CMU/LLM_blocking/data/blocking words/filtered_blocking_words.csv"
filtered_df.to_csv(output_path, index=False)


문법적으로 틀린 단어만 저장 완료: filtered_grammar_errors.csv
