사전순 정렬 대신 natural sort를 사용한다.

In [320]:
# cell 1

import re

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [ atoi(c) for c in re.split(r'(\d+)', text) ]


먼저 파일을 읽어서 카테고리 이름을 요소로 갖는 string list를 만든다.

In [321]:
# cell 2

import os

def readCategory(directory):
    """
    카테고리를 포함하는 폴더의 directory를 입력받아 오름차순으로 정렬 후 카테고리 list를 반환
     ex) readCategory('2022_Fall_Student_Data/8/Corpus/Input_Data')

        args:
            directory`string`: 읽을 파일의 디렉토리
        return:
            category`str list`: 카테고리이름 list
    """
    folder_list = os.listdir(directory)
    category = []
    # 카테고리 string 저장할 list

    for i in folder_list:
        if not i.endswith('.DS_Store'):
            category.append(i)
        # .DS_Store 빼고 category에 추가

    category.sort(key=natural_keys)

    return category


In [322]:

category = readCategory('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data')
print(category)

['child', 'culture', 'economy', 'education', 'health', 'life', 'person', 'policy', 'society']


각 카테고리 안에 있는 파일 이름을 요소로 갖는 2D string list를 만든다.

In [323]:
# cell 3

def readFileName(directory):
    """
    읽을 파일의 디렉토리를 입력받아 DataFrame으로 반환
     ex) readFileName('2022_Fall_Student_Data/8/Corpus/Input_Data')

        arg:
            directory`string`: 읽을 파일의 디렉토리
        return:
            fileName2D`str 2D list`: 읽은 파일명 저장된 table (row: categoryIdx, col: 파일 이름)
    """

    fileName2D = []
    # row: categoryIdx, column: 해당 카테고리의 파일 이름

    for categoryIdx in range(len(category)):
        path = directory+'/'+category[categoryIdx]
        file_list = os.listdir(path)
        #해당 카테고리 안에 있는 파일명이 담긴 string list

        txt_list = []
        for i in file_list:
            if i.endswith('.txt') and not i.endswith('.DS_Store'):
                #파일 형식이 .txt로 끝나는 파일 이름만 배열에 추가
                txt_list.append(i)

        txt_list.sort(key=natural_keys)
        # natural sort
        fileName2D.append(txt_list)

    # print("카테고리마다의 파일 개수")
    # for i in fileName2D:
    #     print(len(i), end=" ")
    
    return fileName2D


In [324]:
fileName2D = readFileName('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data')

In [None]:
print(fileName2D)

### DTM 생성
모든 파일의 형태소를 중복 없이 요소로 갖는 DTM을 만든다.

DTM:

row: docNum, 0번 column: category,  나머지 column: term 인 2D Dataframe

빈 DataFrame을 생성했을 때 행과 열이 둘 다 없으면 데이터를 추가하거나 수정할 수 없다.

열 정보를 갖고 있어야 행 데이터를 추가할 수 있기 때문에 첫 column인 category를 추가해준다.

데이터 프레임 값 변경: df.loc[2, 'A'] = 3000

if morp == '':일때 컨티뉴 해주어야하는데 시간상  break로 첫 문단만 가져옴

In [326]:
# cell 4

import pandas as pd
import numpy as np

def makeTrainDTM(directory, category, fileName2D):

    DTM = pd.DataFrame(columns=range(1))
    DTM.columns = ['category']
    # term table

    docNum = 0
    #문서 번호

    for categoryIdx in range(len(category)):

        # 수정 필요
        for fileIdx in range(len(fileName2D[categoryIdx])):
            file = open(directory+'/'+category[categoryIdx]+'/'+fileName2D[categoryIdx][fileIdx], "r")

            lines = file.readlines()
            #파일을 한 줄씩 읽어서 string으로 list에 저장

            docRow = "Doc"+str(docNum)
            #row 이름 ex) Doc3

            DTM.loc[docRow] = [0 for i in range(len(DTM.columns))]
            # 새로운 row를 만들어 0으로 채움
            # ex)Doc3 = [0, 0, 0, 0](열 개수만큼 0을 만들어 list로 반환)

            DTM.loc[docRow, 'category'] = categoryIdx
            # 해당 문서의 카테고리 값에 categoryIdx 저장
            
            for line in lines:
                morp = line.split('\t')[-1].split('+')[0].strip('\n')
                # '\t'를 기준으로 뒤, '+'를 기준으로 앞의 string을 잘라 list에 저장
                
                if morp == '':
                    continue
                # 빈 string이면 무시

                if morp in DTM.columns:
                    DTM.loc[docRow,morp] += 1
                else:
                    DTM[morp] = 0
                    DTM.loc[docRow,morp] += 1
                # DTM columns에서 중복 확인, 중복 있으면 값만 1 증가시킴, 없으면 열 추가 후 값 1 대입 

            docNum += 1
    return DTM
    #28분

In [327]:
DTM = makeTrainDTM('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Input_Data', category, fileName2D)
DTM

Unnamed: 0,category,與/SL,성폭행/NNG,`/SW,거세/NNG,전면확대/NNG,추진/NNG,성범죄/NNG,친고/NNG,폐지/NNG,...,치상/NNG,주요부분/NNG,한현우/NNP,감형하/VV,판결”/NNG,애매하/VA,재판부”/NNG,광주법원/NNP,엄벌’/NNG,해제하/VV
Doc0,0,1,1,7,4,1,3,10,4,2,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1683,8,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1684,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1685,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### sum값 저장

(category column 에는 0 저장)

In [None]:
from collections import Counter
# TT.loc['sum'] = TT.drop('category', axis=1).sum(axis=0)

DTM.loc['sum'] = DTM.sum(axis=0)
DTM.loc['sum','category'] = 0
DTM


### 카테고리 label을 갖고 있는 (n,1) dataframe 생성

In [329]:
label = pd.DataFrame(DTM['category'].copy())
label = label.iloc[:-1]


In [330]:
label

Unnamed: 0,category
Doc0,0
Doc1,0
Doc2,0
Doc3,0
Doc4,0
...,...
Doc1682,8
Doc1683,8
Doc1684,8
Doc1685,8


### sum 값 기준으로 column 정렬
내림차순으로 정렬 후 1000번째 이후 요소는 제거

In [331]:
DTM = DTM.drop('category', axis=1).sort_values(by='sum', axis= 1, ascending=False)
DTM = DTM.iloc[:,:1000]
# 5000으로 수정

# .loc은 row, column 이름으로, 
# .iloc은 row, column 인덱스로. 
DTM = DTM.drop('sum')

In [332]:
DTM

Unnamed: 0,있/VX,것/NNB,있/VA,장애인/NNG,등/NNB,수/NNB,하/VV,위하/VV,하/VX,되/VV,...,현/MM,적/NNB,올라가/VV,접근하/VV,신체/NNG,뜻/NNG,당부하/VV,가능성/NNG,담당자/NNG,전면/NNG
Doc0,0,9,1,0,5,0,13,1,5,0,...,0,0,0,0,0,0,0,0,0,1
Doc1,0,0,0,0,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc2,4,2,2,1,2,2,1,2,2,1,...,0,0,0,0,0,0,0,0,0,0
Doc3,1,1,0,0,3,0,0,2,0,1,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,2,1,1,5,0,2,1,1,1,...,0,0,0,0,2,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc1682,1,1,3,3,0,2,2,0,1,1,...,0,1,0,0,0,0,0,0,0,0
Doc1683,1,0,0,3,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1684,7,10,3,2,3,2,4,9,3,0,...,0,1,0,0,0,0,0,0,0,0
Doc1685,2,1,0,0,2,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0


### DF값 저장
DTM을 .copy()로 깊은 복사한 DTM_df 사용

In [333]:
#이거 함수로 만들거임 그리고 df 따로 데이터 프레임에 안넣고 그냥 리스트로 반환할거임

def makeDFList (DTM):
    df_list = []
    colNum = DTM.shape[1]
    #DTM column 개수
    for c in range(colNum):
        df_list.append(len(DTM.loc[DTM.iloc[:,c] != 0]))
        # 모든 column을 다 돌면서 값이 0이 아닌 row가 몇개 있는지 df_list에 저장
    return df_list

df_list = makeDFList(DTM)
print(df_list)
print(len(df_list))

[1286, 1278, 1316, 1062, 1354, 1115, 1041, 1120, 1038, 1033, 945, 934, 856, 831, 844, 702, 971, 751, 793, 775, 773, 707, 466, 645, 722, 805, 685, 718, 820, 441, 618, 570, 521, 633, 423, 301, 768, 538, 512, 533, 646, 483, 400, 471, 479, 427, 547, 415, 403, 450, 233, 391, 442, 314, 455, 119, 403, 370, 414, 386, 468, 401, 261, 460, 145, 352, 388, 378, 181, 197, 163, 54, 486, 371, 431, 336, 425, 225, 186, 210, 244, 262, 235, 378, 249, 358, 111, 292, 256, 263, 325, 213, 371, 294, 291, 378, 304, 330, 311, 276, 305, 281, 296, 310, 293, 143, 217, 320, 305, 319, 392, 314, 186, 218, 302, 161, 405, 177, 320, 267, 281, 271, 241, 273, 128, 296, 294, 241, 238, 263, 98, 209, 285, 243, 123, 216, 150, 24, 273, 238, 273, 311, 245, 203, 266, 139, 173, 196, 249, 216, 122, 205, 182, 225, 227, 171, 176, 269, 264, 95, 163, 169, 240, 227, 121, 212, 193, 119, 128, 170, 189, 202, 195, 208, 219, 188, 217, 191, 216, 112, 215, 211, 193, 193, 127, 196, 99, 193, 168, 184, 76, 272, 126, 202, 131, 222, 184, 217, 134, 

### IDF 함수 정의


In [334]:
import math


def func_IDF(n, df):
    """
        args:
            n`int`: 문서의 총 개수
            df`int`: 행 하나의 df 값
        Retruns:
            idf`float`: 입력 받은 n, df 값으로 계산한 idf 값
    """
    return math.log(n / (1 + df))

def makeTF_IDF(DTM, df_list, mean = 0.0, std = 0.0):

    idf_list = []

    docNum = DTM.shape[0]
    colNum = DTM.shape[1]

    for c in range(colNum):
        idf_list.append(func_IDF(docNum, df_list[c]))
        # 모든 column을 다 돌면서 값이 df 읽고 idf값 list에 추가

    TF_IDF = DTM.copy()
    # TF-IDF 테이블 생성

    for d in range(docNum) :
        for t in range(colNum):
            TF_IDF.iloc[d, t] = TF_IDF.iloc[d, t] * idf_list[t]

    if std  == 0:
        std = TF_IDF.stack().std()
    if mean == 0:
        mean = TF_IDF.stack().mean()
        
    TF_IDF_std = (TF_IDF - mean)/std
    # 정규분포 
    
    return TF_IDF_std, mean, std


In [335]:
TF_IDF_train, mean, std = makeTF_IDF(DTM, df_list)

In [None]:
TF_IDF_train

In [337]:
print(mean, std)

0.2941575027341152 1.6042262273427268


###  category 추가한 후, train 파일 .txt로 저장

In [None]:
TF_IDF_train_label = pd.concat([TF_IDF_train, label],axis = 1)
TF_IDF_train_label


In [339]:

TF_IDF_train_label.to_csv("/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_train_features.txt", sep = '\t',index=False)


### Test 데이터 준비
테스트 디렉토리에 있는 모든 파일 이름 lsit에 저장

In [None]:
testFileName2D = readFileName('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Test_Data')
print(testFileName2D)
print("카테고리마다의 파일 개수")
for i in testFileName2D:
    print(len(i), end=" ")

테스트 데이터 디렉토리의 파일 읽고 Test DTM 생성

In [341]:

def makeTestDTM(directory, category, terms, fileName2D):

    DTM = pd.DataFrame(columns=terms)
    # DTM.columns = ['category']
    # term table

    docNum = 0
    #문서 번호

    for categoryIdx in range(len(category)):
        # for fileIdx in range(len(fileName2D[categoryIdx])):
        #수정 10 -> len(fileName2D[categoryIdx])
        for fileIdx in range(len(fileName2D[categoryIdx])):
            file = open(directory+'/'+category[categoryIdx]+'/'+fileName2D[categoryIdx][fileIdx], "r")

            lines = file.readlines()
            #파일을 한 줄씩 읽어서 string으로 list에 저장

            docRow = "Doc"+str(docNum)
            #row 이름 ex) Doc3

            DTM.loc[docRow] = [0 for i in range(len(DTM.columns))]
            # 새로운 row를 만들어 0으로 채움
            # ex)Doc3 = [0, 0, 0, 0](열 개수만큼 0을 만들어 list로 반환)

            DTM.loc[docRow, 'category'] = categoryIdx
            # 해당 문서의 카테고리 값에 categoryIdx 저장
            
            for line in lines:
                morp = line.split('\t')[-1].split('+')[0].strip('\n')
                # '\t'를 기준으로 뒤, '+'를 기준으로 앞의 string을 잘라 list에 저장
                
                if morp == '':
                    continue
    ##########################################  수정 continue로 바꿔야 함 ########################################
                # 빈 string이면 무시

                if morp in DTM.columns:
                    DTM.loc[docRow,morp] += 1
                # else:
                #     DTM[morp] = 0
                #     DTM.loc[docRow,morp] += 1
                # DTM columns에서 중복 확인, 중복 있으면 값만 1 증가시킴, 없으면 열 추가 후 값 1 대입 

            docNum += 1
    return DTM

In [342]:
TestDTM = makeTestDTM('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/Corpus/Test_Data', category, TF_IDF_train.columns, testFileName2D)

In [343]:
TestDTM

Unnamed: 0,있/VX,것/NNB,있/VA,장애인/NNG,등/NNB,수/NNB,하/VV,위하/VV,하/VX,되/VV,...,적/NNB,올라가/VV,접근하/VV,신체/NNG,뜻/NNG,당부하/VV,가능성/NNG,담당자/NNG,전면/NNG,category
Doc0,15,3,5,1,4,3,5,6,1,4,...,0,0,0,0,0,0,0,0,0,0.0
Doc1,10,7,2,0,0,1,7,0,2,1,...,0,0,0,1,0,0,0,0,0,0.0
Doc2,0,2,2,0,0,1,0,1,2,1,...,0,0,0,0,0,0,0,0,0,0.0
Doc3,1,1,1,0,5,1,1,2,1,0,...,0,0,0,0,0,0,0,0,0,0.0
Doc4,2,9,2,0,5,4,1,4,1,0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc95,2,1,1,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,8.0
Doc96,8,7,8,7,14,4,1,8,8,4,...,1,1,0,0,0,0,0,0,0,8.0
Doc97,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.0
Doc98,0,1,2,0,7,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,8.0


### Test TF-IDF 계산 후 .txt로 저장


df, idf 계산 후 TF-IDF 생성

In [344]:
target = pd.DataFrame(TestDTM['category'].copy())
TestDTM = TestDTM.iloc[:,:-1]
# idf 계산 위해 category 잠시 분리

df_list = makeDFList(TestDTM)
TF_IDF_test,_,_ = makeTF_IDF(TestDTM, df_list, mean, std)

TF_IDF_test_target = pd.concat([TF_IDF_test, target],axis = 1)
# idf 계산 후 category col 다시 추가

TF_IDF_test_target


Unnamed: 0,있/VX,것/NNB,있/VA,장애인/NNG,등/NNB,수/NNB,하/VV,위하/VV,하/VX,되/VV,...,적/NNB,올라가/VV,접근하/VV,신체/NNG,뜻/NNG,당부하/VV,가능성/NNG,담당자/NNG,전면/NNG,category
Doc0,1.672212,0.233928,0.286716,0.178068,0.1927,0.510547,1.207609,1.31447,0.145538,1.090336,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
Doc1,1.053687,0.790317,0.004668,-0.183364,-0.183364,0.04794,1.763998,-0.183364,0.474439,0.135061,...,-0.183364,-0.183364,-0.183364,2.002461,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
Doc2,-0.183364,0.094831,0.004668,-0.183364,-0.183364,0.04794,-0.183364,0.066275,0.474439,0.135061,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
Doc3,-0.059659,-0.044267,-0.089348,-0.183364,0.286716,0.04794,0.094831,0.315914,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
Doc4,0.064046,1.068512,0.004668,-0.183364,0.286716,0.741851,0.094831,0.815192,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc95,0.064046,-0.044267,-0.089348,-0.183364,-0.183364,-0.183364,0.65122,-0.183364,-0.183364,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
Doc96,0.806277,0.790317,0.568764,2.346659,1.13286,0.741851,0.094831,1.813749,2.44785,1.090336,...,2.002461,1.823133,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
Doc97,-0.059659,-0.183364,-0.089348,-0.183364,-0.089348,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
Doc98,-0.183364,-0.044267,0.004668,-0.183364,0.474748,-0.183364,0.373025,-0.183364,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0


In [345]:
TF_IDF_test_target.to_csv("/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_test_features.txt", sep = '\t', index=False)



### 벡터 읽어 오기

In [346]:
train = pd.read_csv('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_train_features.txt', sep='\t')
test = pd.read_csv('/Users/jeongdeok/Downloads/2022_Fall_Student_Data/8/TF-IDF/all_test_features.txt', sep='\t')
# 트레인, 테스트에 사용할 카테고리 포함한 데이터프레임

In [347]:
train_data = train.iloc[:,:-1]
train_target = train.iloc[:,-1]
test_data = test.iloc[:,:-1]
test_target = test.iloc[:,-1]

In [None]:
train

In [350]:
test

Unnamed: 0,있/VX,것/NNB,있/VA,장애인/NNG,등/NNB,수/NNB,하/VV,위하/VV,하/VX,되/VV,...,적/NNB,올라가/VV,접근하/VV,신체/NNG,뜻/NNG,당부하/VV,가능성/NNG,담당자/NNG,전면/NNG,category
0,1.672212,0.233928,0.286716,0.178068,0.192700,0.510547,1.207609,1.314470,0.145538,1.090336,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
1,1.053687,0.790317,0.004668,-0.183364,-0.183364,0.047940,1.763998,-0.183364,0.474439,0.135061,...,-0.183364,-0.183364,-0.183364,2.002461,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
2,-0.183364,0.094831,0.004668,-0.183364,-0.183364,0.047940,-0.183364,0.066275,0.474439,0.135061,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
3,-0.059659,-0.044267,-0.089348,-0.183364,0.286716,0.047940,0.094831,0.315914,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
4,0.064046,1.068512,0.004668,-0.183364,0.286716,0.741851,0.094831,0.815192,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.064046,-0.044267,-0.089348,-0.183364,-0.183364,-0.183364,0.651220,-0.183364,-0.183364,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
96,0.806277,0.790317,0.568764,2.346659,1.132860,0.741851,0.094831,1.813749,2.447850,1.090336,...,2.002461,1.823133,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
97,-0.059659,-0.183364,-0.089348,-0.183364,-0.089348,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0
98,-0.183364,-0.044267,0.004668,-0.183364,0.474748,-0.183364,0.373025,-0.183364,0.145538,-0.183364,...,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,-0.183364,8.0


### SVM 모델 생성, 학습

In [362]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1] }

# 0.72
# {'C': 20, 'gamma': 0.0009}

model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
grid_search.fit(train_data, train_target)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]},
             return_train_score=True)

### confusion matrix 작성

In [374]:
from sklearn.metrics import confusion_matrix
confusionMatrix = pd.DataFrame(confusion_matrix(test_target, grid_search.predict(test_data)))
confusionMatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,8,0,1,0,0,0,0,0,1
1,0,9,0,0,0,0,0,0,1
2,0,2,7,1,0,0,0,0,0
3,0,0,1,6,0,0,0,0,3
4,1,0,0,0,4,0,3,2,0
5,0,0,0,0,0,9,1,0,0
6,1,0,0,0,4,0,3,2,0
7,0,0,0,0,1,0,0,12,2
8,0,1,0,0,0,1,0,0,13


#### 성능평가

In [379]:
from sklearn.metrics import precision_score, recall_score, f1_score

pred = grid_search.predict(test_data)

print("precision\t",precision_score(test_target, pred, average= "macro"))
print("recall\t", recall_score(test_target, pred, average= "macro") )
print("f1\t", f1_score(test_target, pred, average= "macro"))


precision	 0.7064373897707231
recall	 0.6962962962962964
f1	 0.6946611972977697
