In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# Scikit
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import *

In [3]:
# NLTK
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet as wn, sentiwordnet as swn
from nltk.stem import WordNetLemmatizer

In [4]:
review_df = pd.read_csv('./Data/popcorn/labeledTrainData.tsv', header = 0, sep='\t', quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [5]:
review_df['review'] = review_df['review'].str.replace('<br />', ' ')
review_df['review'] = review_df['review'].apply(lambda x: re.sub("[^a-zA-Z]",' ', x))

In [6]:
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,With all this stuff going down at the moment ...
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...


In [7]:
# 학습용 테스트 세트 분리
class_df = review_df['sentiment']
feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.3, random_state=156)
x_train.shape, x_test.shape

((17500, 1), (7500, 1))

In [9]:
# 파이프 라인 구성
lr_clf_pip = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C= 10))
])

In [10]:
lr_clf_pip.fit(x_train['review'], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('cnt_vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
               

In [11]:
pred = lr_clf_pip.predict(x_test['review'])

In [12]:
pred_proba = lr_clf_pip.predict_proba(x_test['review'])[:, 1]

In [13]:
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)))

예측 정확도는 0.8860, ROC-AUC는 0.9503


In [14]:
lr_clf_tfi_pip = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10))
])

In [15]:
lr_clf_tfi_pip.fit(x_train['review'], y_train)

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, inter

In [16]:
pred = lr_clf_tfi_pip.predict(x_test['review'])

In [17]:
pred_proba = lr_clf_tfi_pip.predict_proba(x_test['review'])[:, 1]

In [18]:
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_proba)))

예측 정확도는 0.8936, ROC-AUC는 0.9598


In [19]:
# 파이프라인을 안 쓸때 ↓

In [20]:
cvect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
x_train_cvect = cvect.fit_transform(x_train['review'], y_train)

In [21]:
lr_reg = LogisticRegression(C=10)

In [22]:
lr_reg.fit(x_train_cvect, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
x_test_cvect = cvect.transform(x_test['review'])

In [24]:
pred = lr_reg.predict(x_test_cvect)
pred_proba = lr_reg.predict_proba(x_test_cvect)[:, 1]

In [25]:
accuracy_score(y_test, pred)

0.886

In [26]:
roc_auc_score(y_test, pred_proba)

0.9502703875483725

In [27]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [28]:
x_train_tfidf = tfidf.fit_transform(x_train['review'], y_train)
x_test_tfidf = tfidf.transform(x_test['review'])

In [29]:
lr_reg.fit(x_train_tfidf, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
pred = lr_reg.predict(x_test_tfidf)
pred_proba = lr_reg.predict_proba(x_test_tfidf)[:, 1]

In [31]:
accuracy_score(y_test, pred)

0.8936

In [32]:
roc_auc_score(y_test, pred_proba)

0.959799823582973