# 표 형식 특성 벡터화하기

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to imported project code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append("..")
import warnings
warnings.filterwarnings('ignore')

from ml_editor.data_processing import get_normalized_series

# Load the processed writers dataset.
# `index_col=0` is a `read_csv` option (use the first CSV column as the row
# index). It was previously passed to `Path(...)`, which does not use it and
# deprecates/rejects extra keyword arguments on recent Python versions.
data_path = Path("../data/processed/writers/writers.csv")
df = pd.read_csv(data_path, index_col=0)

태그, 코멘트 개수, 질문 작성 날짜로부터 점수를 예측합니다.

In [3]:
# Flag question posts: rows whose PostTypeId equals 1.
df["is_question"] = df["PostTypeId"] == 1

# Keep only the question rows and the raw columns used to build features.
# Selecting rows and columns in a single `.loc` call and taking an explicit
# copy gives an independent frame, so the feature columns assigned to
# `tabular_df` in later cells do not write into a slice of `df`
# (the chained `df[mask][cols]` form triggers SettingWithCopy behaviour).
tabular_df = df.loc[
    df["is_question"], ["Tags", "CommentCount", "CreationDate", "Score"]
].copy()
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,31
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,23
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,36
3,<plot><short-story><planning><brainstorming>,1,2010-11-18T20:43:59.693,34
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,22


데이터를 모델의 입력으로 사용하기 위해 수치 표현으로 변경합니다.
1. 수치 입력 특성을 정규화하여 이상치의 영향을 줄입니다.
2. 날짜 특성을 모델이 이해하기 쉬운 형태로 변환합니다.
3. 모델이 범주형 특성을 이해할 수 있도록 더미 변수로 바꿉니다.

In [4]:
# Add a normalized counterpart for each raw numeric column, using the
# project's `get_normalized_series` helper (raw column -> new column name).
for source_col, target_col in (
    ("CommentCount", "NormComment"),
    ("Score", "NormScore"),
):
    tabular_df[target_col] = get_normalized_series(tabular_df, source_col)

In [5]:
# Preview the first rows, now including the NormComment and NormScore columns.
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,31,1.344496,3.303389
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,23,-0.882157,2.283499
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,36,-0.564064,3.940821
3,<plot><short-story><planning><brainstorming>,1,2010-11-18T20:43:59.693,34,-0.564064,3.685848
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,22,-0.564064,2.156013


In [6]:
# Parse the CreationDate strings into pandas datetimes.
tabular_df["date"] = pd.to_datetime(tabular_df["CreationDate"])

# Expose each calendar component of the timestamp as its own numeric column.
for component in ("year", "month", "day", "hour"):
    tabular_df[component] = getattr(tabular_df["date"].dt, component)

In [7]:
# Preview the first rows, now including the parsed date and its components.
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore,date,year,month,day,hour
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,31,1.344496,3.303389,2010-11-18 20:40:32.857,2010,11,18,20
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,23,-0.882157,2.283499,2010-11-18 20:42:31.513,2010,11,18,20
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,36,-0.564064,3.940821,2010-11-18 20:43:28.903,2010,11,18,20
3,<plot><short-story><planning><brainstorming>,1,2010-11-18T20:43:59.693,34,-0.564064,3.685848,2010-11-18 20:43:59.693,2010,11,18,20
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,22,-0.564064,2.156013,2010-11-18 20:45:44.067,2010,11,18,20


In [8]:
# Tags are stored as one string per question, e.g. "<fiction><genre>".
# Drop the outer angle brackets and split on the "><" separators to get a
# list of tag names per question (vectorized; missing values stay missing).
tags = tabular_df["Tags"]
clean_tags = tags.str.strip("<>").str.split("><")

# One-hot encode the tags: explode to one row per (question, tag), build the
# dummy columns, then sum per original row label to get one row per question.
# `sum(level=0)` was removed in pandas 2.0; `groupby(level=0).sum()` is the
# supported equivalent. `sort=False` keeps the rows in their original order.
tag_columns = (
    pd.get_dummies(clean_tags.explode()).groupby(level=0, sort=False).sum()
)

# Count how many questions carry each tag, most frequent first, and keep only
# the tags that appear on more than 500 questions.
all_tags = tag_columns.astype(bool).sum(axis=0).sort_values(ascending=False)
top_tags = all_tags[all_tags > 500]
top_tag_columns = tag_columns[top_tags.index]

In [9]:
# Preview the indicator columns for the most frequent tags.
top_tag_columns.head()

Unnamed: 0,creative-writing,fiction,style,characters,technique,novel,plot,publishing,character-development
0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0
3,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0


In [10]:
# Columns fed to the model: date components, the normalized numeric
# features, and one indicator column per frequent tag.
col_to_keep = [
    "year",
    "month",
    "day",
    "hour",
    "NormComment",
    "NormScore",
    *top_tags.index,
]

# Place the engineered columns and the tag indicators side by side
# (column-wise concat, aligned on the row index), then select the features.
final = pd.concat([tabular_df, top_tag_columns], axis=1)
final_features = final[col_to_keep]

In [11]:
# Preview the final feature matrix.
final_features.head()

Unnamed: 0,year,month,day,hour,NormComment,NormScore,creative-writing,fiction,style,characters,technique,novel,plot,publishing,character-development
0,2010,11,18,20,1.344496,3.303389,0,0,0,0,0,0,0,0,0
1,2010,11,18,20,-0.882157,2.283499,0,1,0,0,0,0,0,0,0
2,2010,11,18,20,-0.564064,3.940821,0,0,0,0,0,1,0,1,0
3,2010,11,18,20,-0.564064,3.685848,0,0,0,0,0,0,1,0,0
4,2010,11,18,20,-0.564064,2.156013,0,1,0,0,0,0,0,0,0
