사전순 정렬 대신 natural sort를 사용한다.

In [225]:
# cell 1

import re

def atoi(text):
    """Convert a run of decimal digits to ``int``; return anything else unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit``
    also accepts characters such as superscripts (e.g. ``'²'``) that ``int``
    cannot parse, which would raise ``ValueError``.
    """
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    """Build a sort key that orders embedded numbers numerically.

    Use as ``alist.sort(key=natural_keys)`` to sort in human order.
    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
    (Toothy's implementation in the comments).
    """
    # The capturing group keeps the digit runs in the split result.
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))


먼저 파일을 읽어서 카테고리 이름을 요소로 갖는 string list를 만든다.

In [226]:
# cell 2

import os

def readCategory(directory):
    """
    Return the category names found in ``directory`` in natural-sort order.
     ex) readCategory('2022_Fall_Student_Data/8/Corpus/Input_Data')

        args:
            directory`string`: directory whose entries are the categories
        return:
            category`str list`: list of category names
    """
    # Every directory entry counts as a category, except names ending in
    # '.DS_Store'.
    names = [entry for entry in os.listdir(directory)
             if not entry.endswith('.DS_Store')]
    return sorted(names, key=natural_keys)


In [227]:

# Read the category names from the training corpus directory and show them.
category = readCategory('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data')
print(category)

['child', 'culture', 'economy', 'education', 'health', 'life', 'person', 'policy', 'society']


각 카테고리 안에 있는 파일 이름을 요소로 갖는 2D string list를 만든다.

In [228]:
# cell 3

def readFileName(directory, categories=None):
    """
    List the ``.txt`` file names of every category folder under ``directory``.
     ex) readFileName('2022_Fall_Student_Data/8/Corpus/Input_Data')

        arg:
            directory`string`: directory that contains one folder per category
            categories`str list`: category folder names, in the order the rows
                should appear; when omitted, the module-level ``category``
                list is used (the original behaviour)
        return:
            fileName2D`list of str list`: fileName2D[categoryIdx] holds the
                naturally sorted .txt file names of that category
    """
    if categories is None:
        # Backward compatible: fall back to the notebook-level list.
        categories = category

    fileName2D = []
    # row: categoryIdx, column: file names of that category

    for name in categories:
        # Only names ending in '.txt' are kept, which also excludes '.DS_Store'.
        txt_list = [f for f in os.listdir(directory + '/' + name)
                    if f.endswith('.txt')]
        txt_list.sort(key=natural_keys)
        fileName2D.append(txt_list)

    return fileName2D


In [229]:
# Collect the per-category .txt file names of the training corpus.
fileName2D = readFileName('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data')

In [230]:
# Show the nested list of training file names (one inner list per category).
print(fileName2D)

[['8_(POS)child_1.txt', '8_(POS)child_2.txt', '8_(POS)child_3.txt', '8_(POS)child_4.txt', '8_(POS)child_5.txt', '8_(POS)child_6.txt', '8_(POS)child_7.txt', '8_(POS)child_8.txt', '8_(POS)child_9.txt', '8_(POS)child_10.txt', '8_(POS)child_11.txt', '8_(POS)child_12.txt', '8_(POS)child_13.txt', '8_(POS)child_14.txt', '8_(POS)child_15.txt', '8_(POS)child_16.txt', '8_(POS)child_17.txt', '8_(POS)child_18.txt', '8_(POS)child_19.txt', '8_(POS)child_20.txt', '8_(POS)child_21.txt', '8_(POS)child_22.txt', '8_(POS)child_23.txt', '8_(POS)child_24.txt', '8_(POS)child_25.txt', '8_(POS)child_26.txt', '8_(POS)child_27.txt', '8_(POS)child_28.txt', '8_(POS)child_29.txt', '8_(POS)child_30.txt', '8_(POS)child_31.txt', '8_(POS)child_32.txt', '8_(POS)child_33.txt', '8_(POS)child_34.txt', '8_(POS)child_35.txt', '8_(POS)child_36.txt', '8_(POS)child_37.txt', '8_(POS)child_38.txt', '8_(POS)child_39.txt', '8_(POS)child_40.txt', '8_(POS)child_41.txt', '8_(POS)child_42.txt', '8_(POS)child_43.txt', '8_(POS)child_44.t

### DTM 생성
모든 파일의 형태소를 중복 없이 요소로 갖는 DTM을 만든다.

DTM:

row: docNum, 0번 column: category,  나머지 column: term 인 2D Dataframe

빈 DataFrame을 생성했을 때 행과 열이 둘 다 없으면 데이터를 추가하거나 수정할 수 없다.

열 정보를 갖고 있어야 행 데이터를 추가할 수 있기 때문에 첫 column인 category를 추가해준다.

데이터 프레임 값 변경: df.loc[2, 'A'] = 3000

`if morp == '':`일 때는 `continue`로 건너뛰어야 하지만, 실행 시간 문제로 `break`를 사용해 각 문서의 첫 문단만 읽는다.

In [231]:
# cell 4

import pandas as pd
import numpy as np

def makeTrainDTM(directory, category, fileName2D, stop_at_blank=True):
    """
    Build a document-term matrix (DTM) from the files of every category.

    Each line of a file yields one token: the text after the last tab, cut at
    the first '+', with newline characters stripped from both ends.

        args:
            directory`string`: directory that contains one folder per category
            category`str list`: category folder names; the position of a name
                is the label stored for its documents
            fileName2D`list of str list`: fileName2D[categoryIdx] lists the
                files to read for that category
            stop_at_blank`bool`: when True (default, the original behaviour)
                reading of a document stops at its first empty token, e.g. a
                blank line; when False empty tokens are skipped and the whole
                file is counted
        return:
            DTM`DataFrame`: one row per document ('Doc0', 'Doc1', ...); column
                'category' holds the category index and every other column
                holds the count of one token, in first-seen order
    """
    # Token -> column position. Column 0 is reserved for the label.
    # NOTE: a token literally equal to 'category' shares that column, exactly
    # as it did when the table was filled cell by cell.
    column_of = {'category': 0}
    per_doc = []
    # (categoryIdx, {token: count}) for every document, in reading order

    for categoryIdx, categoryName in enumerate(category):
        for fileName in fileName2D[categoryIdx]:
            counts = {}
            # 'with' closes the file even when reading stops early.
            with open(directory + '/' + categoryName + '/' + fileName, "r") as file:
                for line in file:
                    morp = line.split('\t')[-1].split('+')[0].strip('\n')
                    if morp == '':
                        if stop_at_blank:
                            break
                        continue
                    if morp not in column_of:
                        column_of[morp] = len(column_of)
                    counts[morp] = counts.get(morp, 0) + 1
            per_doc.append((categoryIdx, counts))

    # Fill one dense matrix and wrap it once; growing a DataFrame token by
    # token is far slower.
    matrix = np.zeros((len(per_doc), len(column_of)), dtype=np.int64)
    for row, (categoryIdx, counts) in enumerate(per_doc):
        matrix[row, 0] = categoryIdx
        for morp, count in counts.items():
            matrix[row, column_of[morp]] += count

    return pd.DataFrame(
        matrix,
        index=["Doc" + str(row) for row in range(len(per_doc))],
        columns=list(column_of),
    )

In [232]:
# Build the training document-term matrix and display it.
DTM = makeTrainDTM('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data', category, fileName2D)
DTM

Unnamed: 0,category,與/SL,성폭행/NNG,`/SW,거세/NNG,전면확대/NNG,추진/NNG,‘/SN,지원’/NNG,위하/VV,...,싸움/NNG,승소/NNG,스웨덴/NNP,43/SN,달러/NNB,국고보조금/NNG,농아인협회/NNG,광주인화학교/NNP,행정실장/NNG,감형/NNG
Doc0,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,8,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
Doc1683,8,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
Doc1684,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1685,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


### sum값 저장

(category column 에는 0 저장)

In [233]:
from collections import Counter
# Append a 'sum' row holding each column's total over all documents.
# A 'sum' row left over from an earlier run of this cell is excluded first,
# so re-running the cell does not add the old totals into the new ones.
DTM.loc['sum'] = DTM.drop(index='sum', errors='ignore').sum(axis=0)
# The category column holds labels, not counts, so its total is reset to 0.
DTM.loc['sum','category'] = 0
DTM


Unnamed: 0,category,與/SL,성폭행/NNG,`/SW,거세/NNG,전면확대/NNG,추진/NNG,‘/SN,지원’/NNG,위하/VV,...,싸움/NNG,승소/NNG,스웨덴/NNP,43/SN,달러/NNB,국고보조금/NNG,농아인협회/NNG,광주인화학교/NNP,행정실장/NNG,감형/NNG
Doc0,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1683,8,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
Doc1684,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1685,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
Doc1686,8,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


### 카테고리 label을 갖고 있는 (n,1) dataframe 생성

In [234]:
# Copy the category column into its own (n, 1) DataFrame of labels.
# The 'sum' row is removed by name rather than by position, so a document
# row is never dropped when 'sum' is absent or is not the last row.
label = DTM[['category']].drop(index='sum', errors='ignore')


In [235]:
# Display the label frame.
label

Unnamed: 0,category
Doc0,0
Doc1,0
Doc2,0
Doc3,0
Doc4,0
...,...
Doc1682,8
Doc1683,8
Doc1684,8
Doc1685,8


### sum 값 기준으로 column 정렬
내림차순으로 정렬 후 1000번째 이후 요소는 제거

In [236]:
# Rank the term columns by their 'sum' row (largest first), keep the first
# 1000 columns by position, then remove the helper 'sum' row.
# .loc selects by row/column label, .iloc by row/column position.
DTM = (
    DTM.drop(columns='category')
    .sort_values(by='sum', axis=1, ascending=False)
    .iloc[:, :1000]
    .drop(index='sum')
)

In [237]:
# Display the reduced document-term matrix.
DTM

Unnamed: 0,'/SS,‘/SL,‘/SN,장애인/NNG,"""/SS",“/SL,지원/NNG,위하/VV,내년/NNG,서울시/NNP,...,접근성/NNG,못하/VV,원전/NNG,죽음/NNG,농아인/NNG,보호견/NNG,걱정/NNG,기부/NNG,최대행사/NNG,한인/NNG
Doc0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1683,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Doc1684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### DF값 저장
DTM을 .copy()로 깊은 복사한 DTM_df 사용

In [238]:
def makeDFList(DTM):
    """
    Compute the document frequency (DF) of every column of ``DTM``.

        args:
            DTM`DataFrame`: document-term matrix (row: document, column: term)
        return:
            df_list`int list`: for each column, in column order, the number
                of rows whose value is not 0
    """
    # Count on the DTM that was passed in, not on a notebook-level global.
    return [int(n) for n in (DTM != 0).sum(axis=0)]

# Compute the document-frequency list for DTM and show it with its length.
df_list = makeDFList(DTM)
print(df_list)
print(len(df_list))

[1559, 4, 25, 11, 3, 2, 23, 180, 2, 39, 2, 2, 29, 2, 6, 2, 289, 2, 25, 132, 4, 2, 2, 4, 12, 24, 5, 4, 4, 4, 14, 2, 17, 2, 2, 1, 27, 1, 185, 17, 19, 9, 2, 2, 8, 7, 1, 1, 2, 31, 8, 2, 4, 1, 1, 1, 1, 11, 2, 1, 1, 1, 1, 4, 32, 2, 2, 2, 15, 4, 5, 2, 19, 5, 3, 2, 3, 6, 4, 2, 2, 3, 4, 2, 210, 2, 2, 11, 6, 4, 2, 4, 2, 7, 13, 12, 2, 2, 10, 4, 4, 22, 2, 7, 2, 2, 5, 7, 40, 4, 4, 1, 2, 1, 2, 28, 14, 4, 2, 2, 2, 1, 4, 1, 15, 1, 1, 7, 22, 2, 5, 3, 12, 2, 2, 9, 4, 15, 2, 1, 12, 1, 1, 4, 1, 5, 2, 1, 1, 11, 1, 14, 2, 4, 1, 1, 6, 9, 2, 1, 21, 2, 2, 30, 8, 5, 8, 2, 4, 5, 38, 2, 10, 5, 6, 9, 9, 3, 2, 4, 3, 1, 1, 3, 2, 4, 2, 1, 2, 5, 2, 1, 2, 1, 21, 1, 1, 8, 2, 1, 3, 2, 13, 2, 6, 2, 6, 1, 4, 1, 1, 4, 3, 2, 2, 5, 10, 1, 2, 2, 2, 2, 10, 4, 2, 2, 2, 2, 4, 4, 4, 22, 1, 9, 3, 1, 1, 1, 2, 2, 11, 2, 8, 3, 2, 2, 2, 3, 4, 2, 2, 1, 4, 1, 3, 1, 1, 1, 1, 6, 2, 2, 4, 13, 1, 1, 2, 4, 5, 1, 6, 3, 38, 5, 37, 1, 26, 1, 2, 1, 1, 1, 6, 2, 2, 2, 2, 1, 1, 1, 1, 8, 1, 4, 3, 1, 2, 2, 3, 2, 1, 2, 1, 1, 1, 3, 17, 7, 2, 12, 3, 2, 1

### IDF 함수 정의


In [239]:
import math


def func_IDF(n, df):
    """
    Compute the inverse document frequency log(n / (1 + df)).

        args:
            n`int`: total number of documents
            df`int`: document frequency of one term
        Returns:
            idf`float`: natural logarithm of n / (1 + df)
    """
    # Adding 1 keeps the denominator non-zero when df is 0.
    denominator = 1 + df
    ratio = n / denominator
    return math.log(ratio)

def makeTF_IDF(DTM, df_list, mean = 0.0, std = 0.0):
    """
    Weight a document-term matrix by IDF and standardize the result.

        args:
            DTM`DataFrame`: term counts (row: document, column: term)
            df_list`int list`: document frequency of each DTM column, in
                column order
            mean`float`: mean to subtract; 0 (default) or None means
                "compute it from this matrix"
            std`float`: standard deviation to divide by; 0 (default) or None
                means "compute it from this matrix"
        return:
            TF_IDF_std`DataFrame`: (count * idf - mean) / std, same shape,
                index and columns as DTM
            mean`float`: the mean that was used
            std`float`: the standard deviation that was used
    """
    docNum, colNum = DTM.shape

    idf_list = [func_IDF(docNum, df_list[c]) for c in range(colNum)]
    # one idf value per column, from the row count of DTM and the column's df

    # Multiply every column by its idf in one vectorized step. This gives a
    # float frame directly, instead of writing floats one cell at a time into
    # integer columns.
    TF_IDF = DTM.mul(idf_list, axis="columns")

    # mean and std are taken over all cells of the matrix, not per column.
    if std is None or std == 0:
        std = TF_IDF.stack().std()
    if mean is None or mean == 0:
        mean = TF_IDF.stack().mean()

    TF_IDF_std = (TF_IDF - mean)/std
    # standardization (z-score) with one global mean/std

    return TF_IDF_std, mean, std


In [240]:
# Build the standardized training TF-IDF matrix; keep the mean and std it
# returns so they can be passed to makeTF_IDF again later.
TF_IDF_train, mean, std = makeTF_IDF(DTM, df_list)

In [241]:
# Display the training TF-IDF matrix.
TF_IDF_train

Unnamed: 0,'/SS,‘/SL,‘/SN,장애인/NNG,"""/SS",“/SL,지원/NNG,위하/VV,내년/NNG,서울시/NNP,...,접근성/NNG,못하/VV,원전/NNG,죽음/NNG,농아인/NNG,보호견/NNG,걱정/NNG,기부/NNG,최대행사/NNG,한인/NNG
Doc0,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc1,-0.057202,-0.057202,12.145769,-0.057202,-0.057202,-0.057202,-0.057202,6.470988,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc2,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc3,0.171690,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc4,-0.057202,-0.057202,-0.057202,-0.057202,17.619932,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,-0.057202,-0.057202,-0.057202,14.406995,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc1683,-0.057202,-0.057202,-0.057202,14.406995,-0.057202,-0.057202,12.379857,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,19.647069,-0.057202,-0.057202
Doc1684,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202
Doc1685,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202


In [242]:
# Show the mean and standard deviation returned for the training matrix.
print(mean, std)

0.01955918797785522 0.3419339907046435


### 학습 데이터 벡터에 label을 붙여서 저장

In [245]:
# Append the category label column to the right of the training features.
TF_IDF_with_label = pd.concat([TF_IDF_train, label],axis = 1)
TF_IDF_with_label


Unnamed: 0,'/SS,‘/SL,‘/SN,장애인/NNG,"""/SS",“/SL,지원/NNG,위하/VV,내년/NNG,서울시/NNP,...,못하/VV,원전/NNG,죽음/NNG,농아인/NNG,보호견/NNG,걱정/NNG,기부/NNG,최대행사/NNG,한인/NNG,category
Doc0,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,0
Doc1,-0.057202,-0.057202,12.145769,-0.057202,-0.057202,-0.057202,-0.057202,6.470988,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,0
Doc2,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,0
Doc3,0.171690,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,0
Doc4,-0.057202,-0.057202,-0.057202,-0.057202,17.619932,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,-0.057202,-0.057202,-0.057202,14.406995,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,8
Doc1683,-0.057202,-0.057202,-0.057202,14.406995,-0.057202,-0.057202,12.379857,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,19.647069,-0.057202,-0.057202,8
Doc1684,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,8
Doc1685,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,...,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,-0.057202,8


In [246]:

# Write the labelled training features as tab-separated text, without a
# header row and without the index column.
TF_IDF_with_label.to_csv(
    "/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_train_features.txt",
    sep='\t',
    header=False,
    index=False,
)


### 평가 데이터 벡터로 준비
테스트 디렉토리에 있는 모든 파일 이름을 list에 저장

In [247]:
# Collect the per-category .txt file names of the test corpus and show them.
testFileName2D = readFileName('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Test_Data')
print(testFileName2D)

[['8_(POS)child_129.txt', '8_(POS)child_130.txt', '8_(POS)child_131.txt', '8_(POS)child_132.txt', '8_(POS)child_133.txt', '8_(POS)child_134.txt', '8_(POS)child_135.txt', '8_(POS)child_136.txt', '8_(POS)child_137.txt', '8_(POS)child_138.txt'], ['8_(POS)culture_220.txt', '8_(POS)culture_221.txt', '8_(POS)culture_222.txt', '8_(POS)culture_223.txt', '8_(POS)culture_224.txt', '8_(POS)culture_225.txt', '8_(POS)culture_226.txt', '8_(POS)culture_227.txt', '8_(POS)culture_228.txt', '8_(POS)culture_229.txt'], ['8_(POS)economy_167.txt', '8_(POS)economy_168.txt', '8_(POS)economy_169.txt', '8_(POS)economy_170.txt', '8_(POS)economy_171.txt', '8_(POS)economy_172.txt', '8_(POS)economy_173.txt', '8_(POS)economy_174.txt', '8_(POS)economy_175.txt', '8_(POS)economy_176.txt'], ['8_(POS)education_121.txt', '8_(POS)education_122.txt', '8_(POS)education_123.txt', '8_(POS)education_124.txt', '8_(POS)education_125.txt', '8_(POS)education_126.txt', '8_(POS)education_127.txt', '8_(POS)education_128.txt', '8_(POS)

테스트 데이터 DTM 채우기

In [248]:
def makeTestDTM(directory, category, fileName2D, stop_at_blank=True, vocabulary=None):
    """
    Build a document-term matrix (DTM) for the evaluation files.

    Each line of a file yields one token: the text after the last tab, cut at
    the first '+', with newline characters stripped from both ends.

        args:
            directory`string`: directory that contains one folder per category
            category`str list`: category folder names; the position of a name
                is the label stored for its documents
            fileName2D`list of str list`: fileName2D[categoryIdx] lists the
                files to read for that category
            stop_at_blank`bool`: when True (default, the original behaviour)
                reading of a document stops at its first empty token, e.g. a
                blank line; when False empty tokens are skipped and the whole
                file is counted
            vocabulary`iterable of str`: optional fixed list of term columns
                (for example the columns of the training matrix); when given,
                the result has exactly these term columns in this order,
                tokens outside it are ignored, and terms never seen stay 0
        return:
            DTM`DataFrame`: one row per document ('Doc0', 'Doc1', ...); column
                'category' holds the category index and every other column
                holds the count of one token (first-seen order unless
                ``vocabulary`` fixes the order)
    """
    # Token -> column position. Column 0 is reserved for the label.
    # NOTE: a token literally equal to 'category' shares that column, exactly
    # as it did when the table was filled cell by cell.
    column_of = {'category': 0}
    fixed_columns = vocabulary is not None
    if fixed_columns:
        for term in vocabulary:
            column_of.setdefault(term, len(column_of))

    per_doc = []
    # (categoryIdx, {token: count}) for every document, in reading order

    for categoryIdx, categoryName in enumerate(category):
        for fileName in fileName2D[categoryIdx]:
            counts = {}
            # 'with' closes the file even when reading stops early.
            with open(directory + '/' + categoryName + '/' + fileName, "r") as file:
                for line in file:
                    morp = line.split('\t')[-1].split('+')[0].strip('\n')
                    if morp == '':
                        if stop_at_blank:
                            break
                        continue
                    if morp not in column_of:
                        if fixed_columns:
                            # Not part of the requested vocabulary.
                            continue
                        column_of[morp] = len(column_of)
                    counts[morp] = counts.get(morp, 0) + 1
            per_doc.append((categoryIdx, counts))

    # Fill one dense matrix and wrap it once; growing a DataFrame token by
    # token is far slower.
    matrix = np.zeros((len(per_doc), len(column_of)), dtype=np.int64)
    for row, (categoryIdx, counts) in enumerate(per_doc):
        matrix[row, 0] = categoryIdx
        for morp, count in counts.items():
            matrix[row, column_of[morp]] += count

    return pd.DataFrame(
        matrix,
        index=["Doc" + str(row) for row in range(len(per_doc))],
        columns=list(column_of),
    )

In [249]:
# Build the document-term matrix of the test corpus.
TestDTM = makeTestDTM('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Test_Data', category, testFileName2D)

In [250]:
# Display the test document-term matrix.
TestDTM

Unnamed: 0,category,도가니/NNG,피해자/NNG,증인/NNG,재/XPN,요구/NNG,‘/SL,손가락/NNG,없/VA,체조선수/NNG,...,징역/NNG,20/SN,지적장애인/NNG,인권유린/NNG,사범/NNG,무더기/NNG,시골/NNG,살/VV,이동하/VV,수단/NNG
Doc0,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc95,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc96,8,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc97,8,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
Doc98,8,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


### target을 갖고 있는 (n,1) dataframe 생성

In [252]:
# Keep the category index of every test document as its own (n, 1) DataFrame.
target = TestDTM[['category']].copy()
target


Unnamed: 0,category
Doc0,0
Doc1,0
Doc2,0
Doc3,0
Doc4,0
...,...
Doc95,8
Doc96,8
Doc97,8
Doc98,8


df, idf 계산 후 TF-IDF 생성

In [256]:
# Vectorize the test documents in the training feature space: the same term
# columns in the same order as DTM, weighted with the IDF of the training
# documents and scaled with the training mean/std.
# The 'category' label column is removed first so it is not treated as a
# term; terms that never occur in the test files are filled with 0.
test_counts = TestDTM.drop(columns='category').reindex(columns=DTM.columns, fill_value=0)
df_list = [int(n) for n in (DTM != 0).sum(axis=0)]
idf_list = [func_IDF(DTM.shape[0], df) for df in df_list]
TF_IDF_test = (test_counts.mul(idf_list, axis='columns') - mean) / std

In [257]:
# Write the test features as tab-separated text, without a header row and
# without the index column.
TF_IDF_test.to_csv(
    "/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_test_features.txt",
    sep='\t',
    header=False,
    index=False,
)


### SVM 모델 import