In [3]:
import pandas as pd

# Load the labeled training reviews from a tab-separated file whose first row is the header.
review_df = pd.read_csv(
    './dataset/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,  # 3 == csv.QUOTE_NONE: double quotes are kept as ordinary characters
)
# Preview the first five rows.
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Print the plain-text representation of the whole frame; pandas elides the
# middle rows and appends the overall "[rows x columns]" summary line.
print(review_df)

              id  sentiment                                             review
0       "5814_8"          1  "With all this stuff going down at the moment ...
1       "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2       "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3       "3630_4"          0  "It must be assumed that those who praised thi...
4       "9495_8"          1  "Superbly trashy and wondrously unpretentious ...
...          ...        ...                                                ...
24995   "3453_3"          0  "It seems like more consideration has gone int...
24996   "5064_1"          0  "I don't believe they made this film. Complete...
24997  "10905_3"          0  "Guy is a loser. Can't get girls, needs to bui...
24998  "10194_3"          0  "This 30 minute documentary Buñuel made in the...
24999   "8478_8"          1  "I saw this movie as a child and it broke my h...

[25000 rows x 3 columns]


# 상기 데이터로 감성분석을 수행
- tfidf, pipeline 사용
- TfidfVectorizer: stop_words='english', ngram_range=(1,2) / LogisticRegression: C=10
- 평가는 accuracy, roc_auc

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import re

# Clean the raw review text before vectorizing:
#   1. replace the literal HTML line-break tag '<br />' with a space
#      (regex=False is explicit because the pandas default for `regex`
#      differs between major versions), and
#   2. replace every character that is not an ASCII letter with a space
#      (characters are substituted, not deleted, so word boundaries survive).
# Both steps use the vectorized .str accessor rather than a per-row lambda.
review_df['review'] = (
    review_df['review']
    .str.replace('<br />', ' ', regex=False)
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
)

# Features: every column except the identifier and the label.
# Target: the binary 'sentiment' column.
X = review_df.drop(columns=['id', 'sentiment'])
y = review_df['sentiment']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=156,
)

# Two-stage text classifier: TF-IDF features over unigrams and bigrams with
# English stop words removed, followed by logistic regression with C=10.
# NOTE: the first step is keyed 'cnt_vect' although it holds a TfidfVectorizer;
# the key is kept as-is because step names are part of the pipeline's interface.
pipeline = Pipeline(steps=[
    ('cnt_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10)),
])

# Fit the pipeline on the training reviews, then score the held-out reviews.
train_text = X_train['review']
test_text = X_test['review']

pipeline.fit(train_text, y_train)

# Hard class predictions for accuracy; probability of class 1 (column index 1) for ROC-AUC.
pred = pipeline.predict(test_text)
pred_proba = pipeline.predict_proba(test_text)[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_proba)
print('accuracy: {0:.3f}, ROC-AUC: {1:.3f}'.format(accuracy, roc_auc))

accuracy: 0.894, ROC-AUC: 0.960
