### Popcorn 데이터 세트

* 데이터 전처리
* 임베딩(CountVectorize)
* 모델 : RandomForest

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer # countvec 이용
from sklearn.model_selection import train_test_split # 데이터 셋 분할
from sklearn.ensemble import RandomForestClassifier # 모델 Random Forest 사용

### [1] 데이터 세트 불러오기

In [2]:
BASIC_PATH = "C://Users/msi/Desktop/공부/자연어처리/Kaggle/Day2/popcorn/dataset"

In [3]:
DATA_CLEAN_DATA = BASIC_PATH + "/train_clean.csv"

RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [4]:
train_data = pd.read_csv(DATA_CLEAN_DATA)

In [5]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [6]:
reviews[100]

'uk edition show rather less extravagant us version person concerned get new kitchen perhaps bedroom bathroom wonderfully grateful got us version show everything reality tv instead making improvements house occupants could afford entire house gets rebuilt know show trying show lousy welfare system exists us beg hard enough receive rather vulgar product placement takes place particularly sears also uncalled rsther turning one family deprived area potential millionaires would far better help community whole instead spending hundreds thousands dollars one home build something whole community perhaps place diy power tools borrowed returned along building materials everyone benefit want giving one person cause enormous resentment among rest local community still live run houses'

### [2] 임베딩(CountVectorizer)

In [7]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 5000)

train_data_features = vectorizer.fit_transform(reviews)

In [8]:
vectorizer.get_feature_names()

['abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'abraham',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absurd',
 'abuse',
 'abusive',
 'abysmal',
 'academy',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accident',
 'accidentally',
 'accompanied',
 'accomplished',
 'according',
 'account',
 'accuracy',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'achievement',
 'acid',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'activities',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adams',
 'adaptation',
 'adaptations',
 'adapted',
 'add',
 'added',
 'adding',
 'addition',
 'adds',
 'adequate',
 'admire',
 'admit',
 'admittedly',
 'adorable',
 'adult',
 'adults',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'adventures',
 'advertising',
 'advice',
 'advise',
 'affair',
 'affect',
 'affected',
 'afford',
 'aforementioned',
 'afraid',
 'africa',
 'african',
 'after

In [21]:
np.set_printoptions(threshold=np.inf, linewidth=np.inf) # 생략 없이 출력
print(train_data_features[0].toarray())
print(len(train_data_features[0].toarray()[0]))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 

In [20]:
X.shape

(25000, 5000)

### [3] 데이터 세트 분할 (Train + Validation)

In [13]:
X = train_data_features
y = np.array(sentiments)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SPLIT, 
                                                  random_state=RANDOM_SEED)

### [4] 모델 선언 및 훈련

In [14]:
forest = RandomForestClassifier(n_estimators=1000) # n_estimators- 몇개의 랜덤 트리를 만들 것인지 설정
forest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

### [5] 모델 예측(정확도 측정)

In [15]:
print("Accuracy : %f" % forest.score(X_val, y_val)) # score()함수 => 정확도 측정

Accuracy : 0.856800


### [6] TEST 데이터 세트로 모델 확인(예측)

In [16]:
TEST_CLEAN_DATA = BASIC_PATH + "/test_clean.csv"
test_data = pd.read_csv(TEST_CLEAN_DATA)
test_data.head(5)

Unnamed: 0,review
0,naturally film main themes mortality nostalgia...
1,movie disaster within disaster film full great...
2,movie kids saw tonight child loved one point k...
3,afraid dark left impression several different ...
4,accurate depiction small time mob life filmed ...


In [17]:
testDataVecs = vectorizer.fit_transform(test_data['review'])

In [18]:
test_predicted = forest.predict(testDataVecs)

In [19]:
print(test_predicted)

[0 0 0 ... 0 0 1]
