### Popcorn 데이터 세트

* 데이터 전처리
* 임베딩(tf-idf)
* 모델 : Logistic Regression

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf
from sklearn.model_selection import train_test_split # 데이터 셋 분할
from sklearn.linear_model import LogisticRegression # 모델 회귀 모형 이용

### [1] 데이터 세트 불러오기

In [2]:
BASIC_PATH = "C://Users/msi/Desktop/공부/자연어처리/Kaggle/Day2/popcorn/dataset"

In [4]:
DATA_CLEAN_DATA = BASIC_PATH + "/train_clean.csv"

In [5]:
train_data = pd.read_csv(DATA_CLEAN_DATA)

In [6]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [7]:
reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

### [2] 임베딩(TF-IDF) 

In [8]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="word", sublinear_tf=True,
                            ngram_range=(1,3), max_features=5000)

X = vectorizer.fit_transform(reviews) # 입력 값 -> 워드 임베딩
y = np.array(sentiments) # 실제 결과

In [10]:
vectorizer.get_feature_names()

['abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'absence',
 'absolute',
 'absolutely',
 'absolutely nothing',
 'absurd',
 'abuse',
 'abysmal',
 'academy',
 'academy award',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'accident',
 'accidentally',
 'accomplished',
 'according',
 'account',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'achievement',
 'across',
 'act',
 'act like',
 'acted',
 'acting',
 'acting bad',
 'acting good',
 'action',
 'action film',
 'action movie',
 'action scenes',
 'action sequences',
 'actions',
 'actor',
 'actors',
 'actors actresses',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adams',
 'adaptation',
 'adapted',
 'add',
 'added',
 'adding',
 'addition',
 'adds',
 'adequate',
 'admire',
 'admit',
 'admittedly',
 'adorable',
 'adult',
 'adults',
 'advance',
 'advantage',
 'adventure',
 'adventures',
 'advice',
 'affair',
 'affected',
 'afford',
 'aforementioned',
 'afraid',
 'africa',
 'afric

In [33]:
#np.set_printoptions(threshold=np.inf, linewidth=np.inf) # 생략 없이 출력
print(X.toarray()[0])

[0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.07184848 0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.         0.       

In [11]:
vectorizer.vocabulary_

{'stuff': 4245,
 'going': 1852,
 'moment': 2819,
 'started': 4155,
 'listening': 2527,
 'music': 2946,
 'watching': 4800,
 'odd': 3087,
 'documentary': 1166,
 'watched': 4796,
 'maybe': 2716,
 'want': 4765,
 'get': 1811,
 'certain': 635,
 'insight': 2216,
 'guy': 1951,
 'thought': 4450,
 'really': 3557,
 'cool': 869,
 'eighties': 1271,
 'make': 2641,
 'mind': 2783,
 'whether': 4852,
 'guilty': 1947,
 'innocent': 2212,
 'part': 3215,
 'feature': 1530,
 'film': 1570,
 'remember': 3626,
 'see': 3857,
 'cinema': 709,
 'originally': 3168,
 'released': 3614,
 'subtle': 4262,
 'messages': 2754,
 'feeling': 1539,
 'towards': 4536,
 'press': 3393,
 'also': 122,
 'obvious': 3078,
 'message': 2753,
 'drugs': 1219,
 'bad': 289,
 'visually': 4740,
 'impressive': 2180,
 'course': 904,
 'michael': 2761,
 'jackson': 2276,
 'unless': 4666,
 'remotely': 3633,
 'like': 2493,
 'anyway': 183,
 'hate': 1998,
 'find': 1636,
 'boring': 455,
 'may': 2714,
 'call': 556,
 'making': 2656,
 'movie': 2854,
 'fans':

In [13]:
print(X.shape)

(25000, 5000)


In [18]:
X[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### [3] 데이터 세트 분할 (Train + Validation)

In [19]:
RANDOM_SEED = 42
TEST_SPLIT = 0.2

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SPLIT, 
                                                    random_state=RANDOM_SEED)

### [4] 모델 선언 및 훈련

In [20]:
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

### [5] 모델 예측(정확도 측정)

In [21]:
predicted = lgs.predict(X_val)

In [22]:
print("Accuracy : %f" % lgs.score(X_val, y_val)) # score()함수 => 정확도 측정

Accuracy : 0.887200


### [6] TEST 데이터 세트로 모델 확인(예측)

In [23]:
TEST_CLEAN_DATA = BASIC_PATH + "/test_clean.csv"
test_data = pd.read_csv(TEST_CLEAN_DATA)

In [24]:
test_data.head()

Unnamed: 0,review
0,naturally film main themes mortality nostalgia...
1,movie disaster within disaster film full great...
2,movie kids saw tonight child loved one point k...
3,afraid dark left impression several different ...
4,accurate depiction small time mob life filmed ...


In [25]:
testDataVecs = vectorizer.fit_transform(test_data['review'])

In [26]:
test_predicted = lgs.predict(testDataVecs)

In [27]:
print(test_predicted)

[1 0 0 ... 0 0 0]
