# 특성 중요도 조사

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import joblib

import sys
# Put the parent directory on the module search path; presumably this is what
# makes the `ml_editor` package importable below -- confirm against the repo layout.
sys.path.append("..")
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from ml_editor.data_processing import (
    format_raw_df,
    get_split_by_author,  
    add_text_features_to_df,
    get_vectorized_series, 
    get_feature_vector_and_label
)
from ml_editor.model_evaluation import get_feature_importance

# Read the writers CSV and hand a copy of the raw frame to the project's
# format_raw_df helper; `df` ends up holding the formatted result.
data_path = Path('./data/processed/writers/writers.csv')
df = format_raw_df(pd.read_csv(data_path).copy())

In [3]:
# Keep only the rows whose "is_question" flag is set, and work on a copy so the
# feature helper does not write into a slice of the original frame.
questions_df = df.loc[df["is_question"]].copy()
df = add_text_features_to_df(questions_df)

# Train/test split via the project helper, holding out 20% with a fixed random_state.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=40,
)

In [4]:
# Locations of the two saved artifacts used by this notebook.
model_path = Path("./models/model_1.pkl")
vectorizer_path = Path("./models/vectorizer_1.pkl")

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only point these paths at files you trust.
clf = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)

In [5]:
# Add a "vectors" column to both splits by running each split's "full_text"
# column (copied first) through the loaded vectorizer.
for split_df in (train_df, test_df):
    split_df["vectors"] = get_vectorized_series(split_df["full_text"].copy(), vectorizer)

# Extra column names passed to get_feature_vector_and_label alongside each frame.
features = [
    "action_verb_full",
    "question_mark_full",
    "text_len",
    "language_question",
]

# Build the feature matrix / label pair for each split with the project helper.
X_train, y_train = get_feature_vector_and_label(train_df, features)
X_test, y_test = get_feature_vector_and_label(test_df, features)

In [6]:
# Collect the vectorizer's vocabulary names followed by the hand-picked feature
# names into one array.
# NOTE(review): this ordering is assumed to match the column order the model was
# trained on -- confirm against get_feature_vector_and_label.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and only fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; convert it so .extend() works.
    w_indices = list(vectorizer.get_feature_names_out())
else:
    w_indices = list(vectorizer.get_feature_names())
w_indices.extend(features)
all_feature_names = np.array(w_indices)

In [11]:
# Number of entries to show from each end of the importance list.
k = 10

# Compute the importance list once and slice it twice, instead of calling the
# helper separately for the head and the tail.
# NOTE(review): the printed output suggests the list is sorted from most to
# least important -- confirm in ml_editor.model_evaluation.
feature_importances = get_feature_importance(clf, all_feature_names)


def _format_importances(pairs):
    """Render (name, importance) entries as one 'name: value' line each."""
    return '\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in pairs])


print("상위 %s개 중요도:\n" % k)
print(_format_importances(feature_importances[:k]))

print("\n하위 %s개 중요도:\n" % k)
print(_format_importances(feature_importances[-k:]))

상위 10개 중요도:

text_len: 0.0098
are: 0.0046
what: 0.0042
can: 0.0041
writing: 0.0041
ve: 0.0039
with: 0.0038
do: 0.0038
as: 0.0036
on: 0.0036

하위 10개 중요도:

unresolved: 0
cycles: 0
persecuted: 0
ignores: 0
thoughtful: 0
thor: 0
persuaded: 0
forgive: 0
pervasive: 0
persisted: 0


앞선 top-k 분석과 마찬가지로 현재 모델에서 텍스트의 길이를 중요한 특성으로 사용하고 있는 것을 알 수 있다.