In [1]:
import gensim
from gensim import models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# 파일 입출력 함수 선언

# Write corpus files
def write_text(doc_arr, filename, data_dir="../../datasets/kbo_corpus"):
    """Write the three team corpora to ``<data_dir>/<team>_<filename>.txt``.

    Args:
        doc_arr: Indexable holding three iterables of strings, in the order
            samsung (index 0), kia (index 1), lotte (index 2). Each string
            becomes one line of the corresponding file.
        filename: Suffix placed after the team prefix in the file name.
            (Original note: expected in a "win/lose_<name>" form -- TODO
            confirm against callers.)
        data_dir: Directory that receives the files. Defaults to the
            notebook's original hard-coded corpus directory.

    Raises:
        IndexError: If ``doc_arr`` has fewer than three entries. Raised before
            any file is opened.

    Note:
        Files are written as UTF-8 explicitly. Without an explicit encoding,
        ``open`` uses the platform default, which is not UTF-8 everywhere and
        makes the Korean corpus files non-portable. Files produced earlier
        with a different default codec may need to be regenerated.
    """
    teams = ("samsung", "kia", "lotte")
    # Join every corpus up front so malformed input fails before a file is touched.
    payloads = ['\n'.join(doc_arr[idx]) for idx in range(len(teams))]
    for team, text in zip(teams, payloads):
        with open(f"{data_dir}/{team}_{filename}.txt", 'w', encoding='utf-8') as fw:
            fw.write(text)

# Read corpus files
def read_text(filename, data_dir="../../datasets/kbo_corpus"):
    """Load the three team corpora from ``<data_dir>/<team>_<filename>.txt``.

    Args:
        filename: Suffix that follows the team prefix in each file name.
        data_dir: Directory holding the files. Defaults to the notebook's
            original hard-coded corpus directory.

    Returns:
        A list of three lists of strings, in the order samsung, kia, lotte.
        Each inner list is the file content split on ``'\\n'``, so an empty
        file yields ``['']`` and a trailing newline yields a trailing ``''``.

    Note:
        Files are decoded as UTF-8 explicitly. Without an explicit encoding,
        ``open`` uses the platform default, which is not UTF-8 everywhere and
        can garble or fail on the Korean text.
    """
    results = []
    for team in ("samsung", "kia", "lotte"):
        with open(f"{data_dir}/{team}_{filename}.txt", 'r', encoding='utf-8') as fr:
            results.append(fr.read().split('\n'))
    return results

In [3]:
# Load the '*_cleaned.txt' corpora: one list of lines per team, in the
# order read_text returns them (samsung, kia, lotte).
cleaned_documents = read_text('cleaned')

In [4]:
# Preview the first ten lines of the corpus at index 1 (the kia file).
cleaned_documents[1][:10]

['정해영/NNP 올림픽/NNG 가다/VV 한심/NNG 사토/NNG 털릴준비하다/VV',
 '작년/NNG 황대/NNG 기회/NNG 안준거/NNP 진짜/MAG 아깝/VA',
 '작년/NNG 머인/NNG 군무대/NNG 처음/NNG 타석이상/NNG 한/NNG 것/NNB',
 '이민우/NNP 데뷔승/NNG 이후/NNG 승/NNG',
 '야구장/NNG 가다/VV 라다/VV 케텍스/NNP 한참/NNG 가다/VV 야되다/VV 않다/VV',
 '소형준/NNG 신인/NNG 시즌/NNG 경기/NNG 방어/NNG 점/NNB 대/NNG',
 '올해/NNG 쓰다/VV 않다/VV 거/NNB 씹민상/NNG 왜/MAG 썻/NNG',
 '근데/MAJ 설다/VV 프로필/NNG 존나/NNG 사/NNG 잘/MAG 찍다/VV 주다/VV',
 '황대/NNG 페/NNG 이/NNP 스북/NNG',
 '그러다/VV 황대인/NNG 점점/MAG 공/NNG 맞히다/VV']

In [7]:
# <1> Build the token lists (X) and their labels (y).
# Labels are integer-encoded by corpus position (0: Samsung, 1: KIA, 2: Lotte).
def load_data(cleaned_documents):
    """Turn tagged sentences into token lists with integer team labels.

    Args:
        cleaned_documents: Indexable whose first three entries are iterables
            of whitespace-separated ``lemma/TAG`` strings.

    Returns:
        A pair ``(X, y)``: ``X`` is a list of token lists holding only the
        text before the first ``/`` of each token, and ``y`` holds the corpus
        index (0, 1 or 2) of each sentence.
    """
    X, y = [], []
    for label in range(3):
        # Keep only the part before the first '/', dropping the POS tag.
        stripped = [
            [token.partition('/')[0] for token in tagged.split()]
            for tagged in cleaned_documents[label]
        ]
        X.extend(stripped)
        y.extend([label] * len(stripped))
    return X, y

In [8]:
# Build every sentence/label pair, then keep only the even-indexed ones
# (every second sample).
X_ori, y_ori = load_data(cleaned_documents)
X = X_ori[::2]
y = y_ori[::2]

len(X)

28281

In [9]:
# Split into train and test sets: 25% held out, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
# Build the document-term matrices (DTM)
def make_DTM(X_train, X_test):
    """Convert tokenised train/test sentences into dense TF-IDF matrices.

    The vocabulary and the TF-IDF model are built from ``X_train`` only, and
    the same dictionary and model are then applied to ``X_test``.

    Args:
        X_train: Training sentences, each a list of tokens.
        X_test: Test sentences, each a list of tokens.

    Returns:
        ``(X_train_dtm, X_test_dtm)``: the transposed ``corpus2dense`` output
        for each split, built with ``num_terms=len(dictionary)`` and
        ``num_docs`` set to the number of sentences in that split.
    """
    # Build the dictionary from the training split
    vocab = gensim.corpora.Dictionary(X_train)
    tfidf = models.TfidfModel(dictionary=vocab)
    vocab_size = len(vocab)

    def to_dense(sentences):
        # Bag-of-words -> TF-IDF -> dense array, then transpose.
        bows = [vocab.doc2bow(tokens) for tokens in sentences]
        dense = gensim.matutils.corpus2dense(
            tfidf[bows], num_terms=vocab_size, num_docs=len(sentences)
        )
        return dense.T

    return to_dense(X_train), to_dense(X_test)

In [11]:
# Vectorise both splits with make_DTM (dictionary and TF-IDF model come from X_train).
X_train_dtm, X_test_dtm = make_DTM(X_train, X_test)

In [12]:
def fit_and_predict(X_train_dtm, X_test_dtm, y_train, y_test):
    """Train a multinomial logistic regression and report test-set metrics.

    Fits the classifier on the training matrix, predicts the test matrix,
    and prints the classification report followed by the confusion matrix.

    Args:
        X_train_dtm: Training feature matrix.
        X_test_dtm: Test feature matrix.
        y_train: Training labels.
        y_test: True test labels, used only for the printed metrics.

    Returns:
        ``(clf, y_pred)``: the fitted classifier and its test predictions.
    """
    # NOTE(review): `multi_class` is reported as deprecated in newer
    # scikit-learn releases -- confirm against the installed version.
    settings = dict(multi_class='multinomial', solver='lbfgs', max_iter=200)
    model = LogisticRegression(**settings)
    model.fit(X_train_dtm, y_train)
    predictions = model.predict(X_test_dtm)

    # Print the per-class report first, then the confusion matrix.
    for metric in (classification_report, confusion_matrix):
        print(metric(y_test, predictions))

    return model, predictions

In [13]:
# Train on the TF-IDF features and print the evaluation metrics for the test split.
clf, y_pred = fit_and_predict(X_train_dtm, X_test_dtm, y_train, y_test)

              precision    recall  f1-score   support

           0       0.65      0.66      0.66      2521
           1       0.72      0.48      0.58      1760
           2       0.64      0.76      0.70      2790

    accuracy                           0.66      7071
   macro avg       0.67      0.64      0.64      7071
weighted avg       0.66      0.66      0.65      7071

[[1674  168  679]
 [ 399  851  510]
 [ 503  161 2126]]


In [21]:
# Spot-check ten test samples: tokens, true team and predicted team.
id2label = dict(enumerate(['삼성', 'KIA', '롯데']))
for i in range(6984, 6994):
    print(
        f'X : {X_test[i]}',
        f'real_y : {id2label[y_test[i]]}',
        f'pred_y : {id2label[y_pred[i]]}',
        sep='\n',
    )

X : ['균안', '키', '박다', 'ㄹㅇ', '보다', '싶다']
real_y : 롯데
pred_y : 롯데
X : ['욕', '먹다', '싫', '마무리', '스트', '꽂다', 'ㅋㅋㅋ']
real_y : 롯데
pred_y : 롯데
X : ['근데', '진짜', '허윤동', '평속', '따리', '기대', '안', '됨']
real_y : 삼성
pred_y : 삼성
X : ['강민호', '박해민', '백정현', '금액', '얼마', '예상하다']
real_y : 삼성
pred_y : 삼성
X : ['드', '랲', '진', '짜', '야', '잘잘', '드랲이네']
real_y : 롯데
pred_y : 삼성
X : ['추재', '현', '새끼', '넘다']
real_y : 롯데
pred_y : 롯데
X : ['안치홍', '팔수', '있다']
real_y : 롯데
pred_y : 롯데
X : ['세웅', '이', '승리못', '주다', '팀', '너무', '원망스럽다']
real_y : 롯데
pred_y : 롯데
X : ['이의리', '회', '올리다', '하다']
real_y : KIA
pred_y : KIA
X : ['며칠전', '갸']
real_y : 롯데
pred_y : KIA
