In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files stored there can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the chapter folder on the mounted Drive so
# that the relative data paths used by later cells resolve against it.
import os
os.chdir('/content/drive/MyDrive/DataCollection/tensorflow-ml-nlp-tf2/4.TEXT_CLASSIFICATION')

In [4]:
# Confirm that the working-directory change took effect.
os.getcwd()

'/content/drive/MyDrive/DataCollection/tensorflow-ml-nlp-tf2/4.TEXT_CLASSIFICATION'

## 4.1.4 Logistic Regression Example with TF-IDF

### TF-IDF Feature Example

In [5]:
import csv
import os

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [6]:
# Input and output directories, relative to the current working directory.
DATA_IN_PATH = "./data_in/"
DATA_OUT_PATH = "./data_out/"
# File name of the training CSV inside DATA_IN_PATH.
TRAIN_CLEAN_DATA = "train_clean.csv"

# Seed passed to train_test_split, and the fraction of rows held out for evaluation.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [7]:
# Read the training CSV from DATA_IN_PATH into a DataFrame.
train_data = pd.read_csv(f"{DATA_IN_PATH}{TRAIN_CLEAN_DATA}")

In [8]:
# Pull the review texts and their sentiment labels out of the DataFrame as
# plain Python lists.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [23]:
# Character-level TF-IDF features built from 1- to 3-character n-grams.
vectorizer = TfidfVectorizer(
    min_df=0.0,         # tokens whose document frequency falls below this value are dropped during vectorization
    analyzer="char",    # "word" builds features from words, "char" from characters
    sublinear_tf=True,  # apply sublinear scaling to the term frequency (tf -> 1 + log(tf))
    ngram_range=(1, 3),
    max_features=5000,
)
# The vectorizer must be given the raw text as-is, not numpy data.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [24]:
# Show the repr of the TF-IDF matrix returned by fit_transform.
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [25]:
# Collect the n-gram vocabulary learned by the vectorizer as a plain list of str.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() converts the returned ndarray so `features` stays a Python list.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [26]:
# Display the feature names extracted by the vectorizer.
# NOTE(review): this echoes the entire list; a slice such as features[:20]
# would keep the cell output short.
features

[' ',
 ' a',
 ' aa',
 ' ab',
 ' ac',
 ' ad',
 ' ae',
 ' af',
 ' ag',
 ' ah',
 ' ai',
 ' ak',
 ' al',
 ' am',
 ' an',
 ' ap',
 ' ar',
 ' as',
 ' at',
 ' au',
 ' av',
 ' aw',
 ' ax',
 ' az',
 ' b',
 ' b ',
 ' ba',
 ' bb',
 ' be',
 ' bi',
 ' bl',
 ' bo',
 ' br',
 ' bu',
 ' by',
 ' c',
 ' c ',
 ' ca',
 ' ce',
 ' cg',
 ' ch',
 ' ci',
 ' cl',
 ' co',
 ' cr',
 ' cu',
 ' cy',
 ' d',
 ' da',
 ' de',
 ' di',
 ' do',
 ' dr',
 ' du',
 ' dv',
 ' dw',
 ' dy',
 ' e',
 ' e ',
 ' ea',
 ' eb',
 ' ec',
 ' ed',
 ' ee',
 ' ef',
 ' eg',
 ' ei',
 ' el',
 ' em',
 ' en',
 ' ep',
 ' eq',
 ' er',
 ' es',
 ' et',
 ' eu',
 ' ev',
 ' ex',
 ' ey',
 ' f',
 ' f ',
 ' fa',
 ' fb',
 ' fe',
 ' fi',
 ' fl',
 ' fo',
 ' fr',
 ' fu',
 ' fx',
 ' g',
 ' g ',
 ' ga',
 ' ge',
 ' gh',
 ' gi',
 ' gl',
 ' go',
 ' gr',
 ' gu',
 ' gw',
 ' gy',
 ' h',
 ' h ',
 ' ha',
 ' hb',
 ' he',
 ' hi',
 ' hm',
 ' ho',
 ' hu',
 ' hy',
 ' i',
 ' ia',
 ' ic',
 ' id',
 ' ig',
 ' ii',
 ' il',
 ' im',
 ' in',
 ' ir',
 ' is',
 ' it',
 ' iv',
 ' j',
 ' j

In [28]:
# The vocabulary size equals the max_features value passed to TfidfVectorizer.
len(features)

5000

In [29]:
# Hold out a TEST_SPLIT fraction of the samples for evaluation; RANDOM_SEED
# fixes the shuffle so the split is repeatable.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [30]:
# class_weight="balanced" asks the model to learn each label in a balanced way.
lgs = LogisticRegression(class_weight="balanced")
# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also echoed as the cell output.
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Predict labels for the held-out evaluation split.
# NOTE(review): `predicted` is not read by any later cell visible here.
predicted = lgs.predict(X_eval)

In [32]:
# Score the fitted model on the evaluation split and report the result.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.859800


In [33]:
# File name of the test CSV inside DATA_IN_PATH.
TEST_CLEAN_DATA = "test_clean.csv"

# Read the test CSV into a DataFrame.
test_data = pd.read_csv(f"{DATA_IN_PATH}{TEST_CLEAN_DATA}")

In [34]:
# Vectorize the test reviews with the vectorizer that was already fitted on
# the training reviews. Use transform(), not fit_transform(): this is test
# data, so the vectorizer must not be refitted on it.
testDataVecs = vectorizer.transform(test_data['review'])

In [35]:
# Predict a label for every vectorized test review and print the resulting array.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [38]:
# Preview the first rows of the test DataFrame.
test_data.head()

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""


In [None]:
# Create the output directory if it is missing; exist_ok=True makes the call a
# no-op when the directory already exists, so no separate existence check is needed.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair each test id with its predicted label and write the result as CSV.
# csv.QUOTE_NONE (numeric value 3) tells the writer not to add quote
# characters around fields; index=False omits the DataFrame index column.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=csv.QUOTE_NONE)