# 훈련 데이터에 대하여 모든 피쳐 추출

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import textstat
from collections import Counter # 품사 태그 계산 Counter
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

from sklearn.model_selection import train_test_split

## Data

In [2]:
# Load the combined dataset and rename the lower-case question/attempt/answer
# columns to their capitalised forms in a single chained expression.
data = pd.read_excel("/root/Seminar/paper/combined_data_NLP.xlsx").rename(
    columns={"question": "Question", "attempt": "Attempt", "answer": "Answer"}
)

`-` 기초 통계량 계산

In [6]:
def count_unique_words(text):
    """Count the distinct words in ``text`` after light normalisation.

    The text is lower-cased and every character other than ``a-z``, ``0-9``
    and whitespace is removed before it is split on whitespace.

    Args:
        text: Value to analyse. Anything that is not a ``str`` is treated
            as containing no words.

    Returns:
        The number of distinct tokens left after normalisation, or ``0``
        for non-string input.
    """
    if not isinstance(text, str):
        return 0
    normalized = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # str.split() without arguments never yields empty strings, so the
    # resulting tokens can go straight into a set.
    return len(set(normalized.split()))

def cal_LD(text):
    """Compute the lexical density of ``text``.

    Lexical density is the share of tokens that ``pos_tag`` labels as a
    content word (noun, verb, adjective or adverb tag) among all tokens
    produced by ``word_tokenize`` on the lower-cased text.

    Args:
        text: Text to analyse. Non-string values (for example the NaN that
            pandas uses for a missing cell, or ``None``) and blank strings
            are treated as containing no tokens.

    Returns:
        ``content tokens / all tokens`` as a float, or ``0`` when there is
        nothing to tokenize.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError,
    # and blank text needs no tokenizer or tagger work at all.
    if not isinstance(text, str) or not text.strip():
        return 0

    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0

    content_tags = {'NN', 'NNS', 'NNP', 'NNPS',  # nouns
                    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
                    'JJ', 'JJR', 'JJS',  # adjectives
                    'RB', 'RBR', 'RBS'}  # adverbs
    content_count = sum(1 for _, tag in pos_tag(tokens) if tag in content_tags)
    return content_count / len(tokens)

def cal_prp(text):
    """Compute the proportion of personal pronouns in ``text``.

    The text is lower-cased, tokenized with ``word_tokenize`` and tagged
    with ``pos_tag``; tokens tagged exactly ``"PRP"`` are counted and
    divided by the total number of tokens.

    Args:
        text: Text to analyse. Non-string values (for example the NaN that
            pandas uses for a missing cell, or ``None``) and blank strings
            are treated as containing no tokens.

    Returns:
        ``PRP tokens / all tokens`` as a float, or ``0`` when there is
        nothing to tokenize.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError,
    # and blank text needs no tokenizer or tagger work at all.
    if not isinstance(text, str) or not text.strip():
        return 0

    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0

    pronoun_count = sum(1 for _, tag in pos_tag(tokens) if tag == "PRP")
    return pronoun_count / len(tokens)

In [7]:
# Replace missing answers with empty strings, then derive every text feature
# from that cleaned column so no extractor is handed a NaN value.
data['Answer_clean'] = data['Answer'].fillna('')
data['unique_word_count'] = data['Answer_clean'].apply(count_unique_words)
data["personal_pronoun_proportion"] = data["Answer_clean"].apply(cal_prp)
data["lexical_density"] = data["Answer_clean"].apply(cal_LD)
data["grade_level"] = data["Answer_clean"].apply(textstat.flesch_kincaid_grade)

In [10]:
# Persist the frame with the basic features, without the index column.
# NOTE(review): to_csv writes CSV text even though the path ends in ".xlsx".
# The later pd.read_csv call on this same path relies on that, so the round
# trip is consistent, but the extension is misleading.
data.to_csv("/root/Seminar/paper/basic_features.xlsx", index = False)

`-` 자료 분할

In [2]:
# Reload the saved basic features (CSV content despite the .xlsx suffix) and
# the perplexity table.
data = pd.read_csv("/root/Seminar/paper/basic_features.xlsx")
df_perp = pd.read_csv("/root/Seminar/paper/Perplexity/Llama_perp.csv")

# df_topics and df_lda come from the same workbook, so parse it once and hand
# df_lda an independent copy instead of reading the Excel file a second time.
# NOTE(review): if df_topics was meant to point at a different file, fix the
# path here.
df_topics = pd.read_excel("/root/Seminar/paper/LDA/llm_subtopic_distribution_v2.xlsx")
df_lda = df_topics.copy()

# assign() matches the Series to `data` by index label, so this assumes both
# frames list the rows in the same order - TODO confirm.
data = data.assign(perplexity=df_perp["perp"])

In [4]:
# Place the LDA columns (every column of df_lda except the first and the
# last) next to the existing features, then export the combined table.
df_full = pd.concat([data, df_lda.iloc[:, 1:-1]], axis="columns")
df_full.to_csv("extracted_features.csv", index=False)