In [1]:
import numpy as np
import pandas as pd
import re
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import warnings

RANDOM_SEED = 42
TEST_SPLIT = 0.2

warnings.filterwarnings(action='ignore')

DATA_IN_PATH = '../../../big_data_sample/pbl/with_soa/'
DF_INPUT_DATA = 'train_clean.csv'
DF_TEST_DATA = 'test_clean.csv'
DF_TEST_LABEL = 'test_clean_label.csv'

In [2]:
from sklearn.model_selection import train_test_split
train_data = pd.read_csv(DATA_IN_PATH+ DF_INPUT_DATA, encoding='cp949',  names=['comments', 'label'], header=None)
test_data = pd.read_csv(DATA_IN_PATH+ DF_TEST_DATA, encoding= 'cp949', names=['comments'],header=None)
test_label = pd.read_csv(DATA_IN_PATH+ DF_TEST_LABEL, encoding= 'cp949', names=['label'],header=None)
print(test_label)

      label
0         1
1         1
2         1
3         1
4         1
...     ...
8240      0
8241      0
8242      0
8243      0
8244      0

[8245 rows x 1 columns]


In [3]:
X_train = list(train_data['comments'])
y_train = list(train_data['label'])
y_test = list(train_data['label'])
y_test_np = np.array(y_test)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vec_cv = CountVectorizer(analyzer = "word", max_features = 5000) 

X_cv = vec_cv.fit_transform(X_train)

X_cv.shape

(8245, 5000)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec_tf = TfidfVectorizer(min_df = 0.0, analyzer = 'char', sublinear_tf=True, 
    ngram_range = (1,3), max_features=5000)


X_tf= vec_tf.fit_transform(X_train)
Y_train = np.array(y_train)


X_tf.shape

(8245, 5000)

In [10]:
vec_tf.get_feature_names()

[' ',
 " '",
 " 'ㄷ",
 " 'ㄹ",
 " 'ㅁ",
 " 'ㅂ",
 " 'ㅅ",
 " 'ㅆ",
 " 'ㅇ",
 " 'ㅈ",
 " 'ㅉ",
 " 'ㅋ",
 " 'ㅎ",
 " 'ㅠ",
 " 'ㅡ",
 " '가",
 " '각",
 " '간",
 " '갈",
 " '감",
 " '갑",
 " '강",
 " '갖",
 " '같",
 " '개",
 " '걍",
 " '거",
 " '걱",
 " '건",
 " '걸",
 " '검",
 " '게",
 " '결",
 " '경",
 " '계",
 " '고",
 " '곧",
 " '곳",
 " '공",
 " '과",
 " '관",
 " '광",
 " '괜",
 " '교",
 " '구",
 " '국",
 " '군",
 " '궁",
 " '권",
 " '귀",
 " '규",
 " '그",
 " '극",
 " '근",
 " '글",
 " '금",
 " '급",
 " '기",
 " '길",
 " '김",
 " '까",
 " '깔",
 " '깜",
 " '깨",
 " '꺼",
 " '께",
 " '꼬",
 " '꼭",
 " '꼴",
 " '꾸",
 " '끄",
 " '끝",
 " '끼",
 " '나",
 " '난",
 " '날",
 " '남",
 " '낫",
 " '낮",
 " '내",
 " '냐",
 " '너",
 " '넌",
 " '넘",
 " '넣",
 " '네",
 " '녀",
 " '년",
 " '노",
 " '논",
 " '놀",
 " '놈",
 " '높",
 " '놓",
 " '뇌",
 " '누",
 " '눈",
 " '뉴",
 " '느",
 " '늘",
 " '능",
 " '늦",
 " '니",
 " '님",
 " '다",
 " '단",
 " '달",
 " '담",
 " '답",
 " '당",
 " '대",
 " '댓",
 " '더",
 " '데",
 " '도",
 " '독",
 " '돈",
 " '돌",
 " '동",
 " '돼",
 " '되",
 " '두",
 " '둘",
 " '뒤",
 " '뒷",
 " 

In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# KFold & StratifiedKFold의 객체 선언
kf = KFold(n_splits=10)
skfold = StratifiedKFold(n_splits=10)

# LogisticRegression의 객체 선언
# lgs = LogisticRegression(class_weight = 'balanced', random_state=13)
lgs = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', random_state=13)

In [14]:
from sklearn.model_selection import cross_val_score

lg_tf_acc = cross_val_score(lgs, X_tf, y_train, scoring=None, cv=skfold)
lg_cv_acc = cross_val_score(lgs, X_cv, y_train, scoring=None, cv=skfold)

In [15]:
print("StratifiedKFold :: TF Acc : ", np.mean(lg_tf_acc), ", CV Acc : ", np.mean(lg_cv_acc))

StratifiedKFold :: TF Acc :  0.9644652839070315 , CV Acc :  0.9428752574286554


In [17]:
from sklearn.model_selection import cross_validate

cross_validate(lgs, X_tf, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.33716416, 0.39036608, 0.4361999 , 0.33490133, 0.32593608,
        0.46288252, 0.44714999, 0.51864028, 0.46286583, 0.43544888]),
 'score_time': array([0.        , 0.        , 0.00099754, 0.00102687, 0.00099754,
        0.        , 0.        , 0.        , 0.00099754, 0.        ]),
 'test_score': array([0.98545455, 0.97090909, 0.97575758, 0.92242424, 0.95151515,
        0.96480583, 0.97936893, 0.96966019, 0.96359223, 0.96116505]),
 'train_score': array([0.9861186 , 0.9861186 , 0.98692722, 0.98800539, 0.98706199,
        0.98598572, 0.98638997, 0.98638997, 0.98760275, 0.98665948])}

In [25]:
lgs.fit(X_tf, y_train)
test_tf = vec_tf.transform(test_data['comments'])
pred_test = lgs.predict(test_tf)
test_data['label'] = pred_test
print("Accuracy: %f" %lgs.score(y_test_np, pred_test))

ValueError: Expected 2D array, got 1D array instead:
array=[1 1 1 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [22]:
test_data.to_csv('211106_test.csv', index=False, encoding='cp949')