# Week10 Learning from Text Data

---
p6

# 1. Preprocessing texts

### Load the IMDb movie review data 

In [1]:
# movie review data
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [2]:
df.shape

(50000, 2)

---
p7

### Cleaning text data

In [3]:
# cleaning texts using regular expression
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove <...> (tags)
    text = re.sub('[\W]+', ' ', text)  # remove all non-words
    text = text.lower()                # change to lower cases
    return text

preprocessor("</a>This is a $100 TEST!!! ^^")

'this is a 100 test '

---
p8

In [4]:
# review 1 text
df.loc[1, 'review']

"OK... so... I really like Kris Kristofferson and his usual easy going delivery of lines in his movies. Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly. But, Disappearance is his misstep. Holy Moly, this was a bad movie! <br /><br />I must give kudos to the cinematography and and the actors, including Kris, for trying their darndest to make sense from this goofy, confusing story! None of it made sense and Kris probably didn't understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about! <br /><br />I don't care that everyone on this movie was doing out of love for the project, or some such nonsense... I've seen low budget movies that had a plot for goodness sake! This had none, zilcho, nada, zippo, empty of reason... a complete waste of good talent, scenery and celluloid! <br /><br />I rented this piece of garbage for a buck, and I want my money back! I want my 2 hou

In [5]:
# cleaning review 1 text 
preprocessor(df.loc[1, 'review'])

'ok so i really like kris kristofferson and his usual easy going delivery of lines in his movies age has helped him with his soft spoken low energy style and he will steal a scene effortlessly but disappearance is his misstep holy moly this was a bad movie i must give kudos to the cinematography and and the actors including kris for trying their darndest to make sense from this goofy confusing story none of it made sense and kris probably didn t understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about i don t care that everyone on this movie was doing out of love for the project or some such nonsense i ve seen low budget movies that had a plot for goodness sake this had none zilcho nada zippo empty of reason a complete waste of good talent scenery and celluloid i rented this piece of garbage for a buck and i want my money back i want my 2 hours back i invested on this grade f waste of my time don t watch this 

---
p9

### Processing documents into tokens(English)

In [6]:
text = 'The sun is shining, the weather is sweet, and she likes RUNNING!'
print(text)

# cleaning
text_prep = preprocessor(text)
print(text_prep)

The sun is shining, the weather is sweet, and she likes RUNNING!
the sun is shining the weather is sweet and she likes running 


In [7]:
# tokenizing
import nltk
nltk.download('punkt')   # tokenizer

text_tokens = nltk.word_tokenize(text_prep)
print(text_tokens)

['the', 'sun', 'is', 'shining', 'the', 'weather', 'is', 'sweet', 'and', 'she', 'likes', 'running']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
p10

In [8]:
# stemming
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    text_tokens = nltk.word_tokenize(text)
    return [porter.stem(word) for word in text_tokens]

text_stems = tokenizer_porter(text_prep)
print(text_stems)

['the', 'sun', 'is', 'shine', 'the', 'weather', 'is', 'sweet', 'and', 'she', 'like', 'run']


---
p11

In [9]:
# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# removing stopwords
def remove_stopwords(text):
    return [w for w in text if w not in stop]

text_stems = remove_stopwords(tokenizer_porter(text_prep))
text_stems

['sun', 'shine', 'weather', 'sweet', 'like', 'run']

---
p12

### POS tagging(English)

In [11]:
# POS tagging
from nltk.tag import pos_tag
nltk.download('averaged_perception_tagger') # POS tagger

tagged_text = pos_tag(nltk.word_tokenize(text_prep))
tagged_text

[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index


[('the', 'DT'),
 ('sun', 'NN'),
 ('is', 'VBZ'),
 ('shining', 'VBG'),
 ('the', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('sweet', 'JJ'),
 ('and', 'CC'),
 ('she', 'PRP'),
 ('likes', 'VBZ'),
 ('running', 'VBG')]

---
p13

### Processing documents into tokens(Korean)

In [12]:
# Korean movie reviews
df_kor = pd.read_csv("kor_movie.csv", encoding='utf-8')
df_kor.head(3)

Unnamed: 0,review,sentiment
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0


In [13]:
df_kor.shape

(200000, 2)

In [14]:
# review 1 text 
df_kor.loc[1,'review']

'흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나'

In [15]:
# cleaning review 1 text 
preprocessor(df_kor.loc[1,'review'])

'흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나'

---
p14

In [16]:
# tokenizing - Okt
from konlpy.tag import Okt
okt = Okt()

text = '하늘을 나는 아름다운 꿈을 꾸었습니다!'

# simple split() method is not appropriate
print('<split() method>')
print(text.split())

print('<Okt word tokenizer>')
print(okt.morphs(text))

<split() method>
['하늘을', '나는', '아름다운', '꿈을', '꾸었습니다!']
<Okt word tokenizer>
['하늘', '을', '나', '는', '아름다운', '꿈', '을', '꾸었습니다', '!']


---
p15

In [17]:
# POS tagging (형태소 분석))
tagged_text = okt.pos(text)
tagged_text

[('하늘', 'Noun'),
 ('을', 'Josa'),
 ('나', 'Noun'),
 ('는', 'Josa'),
 ('아름다운', 'Adjective'),
 ('꿈', 'Noun'),
 ('을', 'Josa'),
 ('꾸었습니다', 'Verb'),
 ('!', 'Punctuation')]

---
p16

In [18]:
# tokenizing - Kkma
from konlpy.tag import Kkma
kkma = Kkma()

print('<Kkma word tokenizer>')
print(kkma.morphs(text))

<Kkma word tokenizer>
['하늘', '을', '날', '는', '아름답', 'ㄴ', '꿈', '을', '꾸', '었', '습니다', '!']


In [19]:
# POS tagging (형태소 분석) - Kkma
tagged_text = kkma.pos(text)
tagged_text

[('하늘', 'NNG'),
 ('을', 'JKO'),
 ('날', 'VV'),
 ('는', 'ETD'),
 ('아름답', 'VA'),
 ('ㄴ', 'ETD'),
 ('꿈', 'NNG'),
 ('을', 'JKO'),
 ('꾸', 'VV'),
 ('었', 'EPT'),
 ('습니다', 'EFN'),
 ('!', 'SF')]

---
p17

In [20]:
# Stemming
def tokenizer_porter_kor(text):
    return okt.morphs(text, norm=True, stem=True)

In [21]:
# tokenizing only
okt.morphs(text)

['하늘', '을', '나', '는', '아름다운', '꿈', '을', '꾸었습니다', '!']

In [22]:
# tokenizing + stemming
tokenizer_porter_kor(text)

['하늘', '을', '나', '는', '아름답다', '꿈', '을', '꾸다', '!']

In [23]:
# nouns only
okt.nouns(text)

['하늘', '나', '꿈']

In [24]:
# 띄어쓰기 오류인 경우도 가능
okt.nouns('아버지가방에들어가신다')

['아버지', '가방']

---
p20

# 2. Vectorization: the bag-of-words model

### Transforming documents into term frequency vectors 

In [25]:
# vectorize texts - Document-Term Matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()

docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and she likes RUNNING!'])
bag = count.fit_transform(docs)

In [26]:
# vocabulary
print(count.vocabulary_)

{'the': 8, 'sun': 6, 'is': 1, 'shining': 5, 'weather': 9, 'sweet': 7, 'and': 0, 'she': 4, 'likes': 2, 'running': 3}


In [27]:
# Document-Term Matrix
print(bag.toarray())

[[0 1 0 0 0 1 1 0 1 0]
 [0 1 0 0 0 0 0 1 1 1]
 [1 2 1 1 1 1 1 1 2 1]]


---
p23

### Transforming documents into TF-IDF vectors

In [28]:
np.set_printoptions(precision=2)

In [29]:
# vectorize texts - TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
docs_vector = tfidf.fit_transform(docs)

In [30]:
# vocabulary
print(tfidf.vocabulary_)

{'the': 8, 'sun': 6, 'is': 1, 'shining': 5, 'weather': 9, 'sweet': 7, 'and': 0, 'she': 4, 'likes': 2, 'running': 3}


In [31]:
# TF-IDF Matrix (normalized)
print(docs_vector.toarray())

[[0.   0.43 0.   0.   0.   0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.   0.   0.   0.56 0.43 0.56]
 [0.33 0.39 0.33 0.33 0.33 0.25 0.25 0.25 0.39 0.25]]


---
p24

In [32]:
# vectorize texts - TF-IDF Matrix (with preprocessing, stemming, stopwords)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,   # preprocessing
                        tokenizer=tokenizer_porter,  # stemming
                        stop_words=stop              # removing stopwords
                       )
docs_vector = tfidf.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [33]:
# vocabulary
print(tfidf.vocabulary_)

{'sun': 3, 'shine': 2, 'weather': 5, 'sweet': 4, 'like': 0, 'run': 1}


In [34]:
# TF-IDF Matrix (normalized)
print(docs_vector.toarray())

[[0.   0.   0.71 0.71 0.   0.  ]
 [0.   0.   0.   0.   0.71 0.71]
 [0.48 0.48 0.37 0.37 0.37 0.37]]


---
p25

In [35]:
# TFIDF example
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and she likes RUNNING!'])
n_docs = 3

# Tf-Idf score of "is" in doc 1
tf = 1
df = 3
idf = np.log((n_docs+1) / (df + 1))
tfidf = tf * (idf + 1)
print('tf-idf of term "is" = %.2f' % tfidf)

# Tf-Idf score of "sun" in doc 1
tf = 1
df = 1
idf = np.log((n_docs+1) / (df+1))
tfidf = tf * (idf + 1)
print('tf-idf of term "sun" = %.2f' % tfidf)

tf-idf of term "is" = 1.00
tf-idf of term "sun" = 1.69


---
p27

# 3. Training a model for document classification

### Load the IMDb movie review data 

In [36]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


---
p28

### Preprocessing

In [37]:
# use 1000 texts for training and test
X_train = df.loc[0:999, 'review'].values
y_train = df.loc[0:999, 'sentiment'].values
X_test = df.loc[49000:, 'review'].values
y_test = df.loc[49000:, 'sentiment'].values

X_train.shape

(1000,)

In [38]:
X_train[0]

'In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich famil

---
p29

In [39]:
# vectorize to TF-IDF Matrix 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,
                        # Below two steps are needed, but it takes long time,
                        # so, We'll skip this processes
                        #tokenizer=tokenizer_porter,
                        #stop_words=stop,
                        max_df=0.1,   # ignore terms occured in more than 10% of docs (stop words)
                       )

X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [40]:
# automatic stop words
print(tfidf.stop_words_)

{'there', 'but', 'really', 'actually', 'would', 'if', 'why', 'how', 'actors', 'pretty', 'long', 'own', 'us', 'story', 'that', 'he', 'movie', 'is', 'go', 'acting', 'few', 'almost', 'this', 'little', 'good', 'off', 'never', 'watch', 'has', 'find', 'my', 'makes', 'character', 'so', 'got', 'his', 'have', 'not', 'great', 've', 'director', 'scene', 'while', 'around', 'thing', 'nothing', 'between', 'in', 'many', 'do', 'are', 'me', 'know', 'as', 'back', 'bit', 'movies', 'man', 'such', 'things', 'years', 'isn', 'seen', 'she', 'watching', 'big', 'those', 'about', 'up', 'being', 'work', 'we', 'first', 'what', 'from', 'like', 'they', 'every', 'them', 'where', 'could', 'might', 'than', 'though', 'seems', 'its', 'here', 'quite', 'to', 'when', 'ever', '10', 'll', 'whole', 'into', 'all', 'which', 'him', 'anything', 'don', 'be', 'far', 'love', 'your', 'only', 'doesn', 're', 'through', 'new', 'enough', 'must', 'an', 'their', 'will', 'much', 'better', 'scenes', 'should', 'two', 'come', 'it', 'and', 'part

---
p30

In [41]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(1000, 18452)

---
p31

In [42]:
# vectorize to TF-IDF Matrix - remove rare terms too
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,
                        # Below two steps are needed, but it takes long time,
                        # so, We'll skip this processes
                        #tokenizer=tokenizer_porter,
                        #stop_words=stop,
                        max_df=0.1,   # ignore terms occured in more than 10% of docs (stop words)
                        min_df=10     # ignore terms occured in less than 10 docs
                       )

X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [43]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(1000, 1827)

---
p32

In [44]:
# text 0 
print(X_train[0])

In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70's, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family 

In [45]:
# text 0 
print(X_train_vector[0])

[0. 0. 0. ... 0. 0. 0.]


---
p33

### Logistic Regression

In [46]:
# train using Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', verbose=1)
lr.fit(X_train_vector, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(verbose=1)

In [47]:
# train score
lr.score(X_train_vector, y_train)

0.964

In [48]:
# test score
lr.score(X_test_vector, y_test)

0.811

In [49]:
# sentiment prediction example 
tweets = ["this movie is garbage", 
          "I loved it very much", 
          "what a fantastic film!"]

tweets_tfidf = tfidf.transform(tweets)
lr.predict(tweets_tfidf)

array([0, 1, 1], dtype=int64)

---
p34

### Decision Tree

In [50]:
# train using Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train_vector, y_train)

DecisionTreeClassifier(max_depth=2)

In [51]:
# train score
tree.score(X_train_vector, y_train)

0.623

In [52]:
# test score
tree.score(X_test_vector, y_test)

0.626

---
p35

In [53]:
# finding most important terms
importances = tree.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(10):
    print("%2d. %-30s %f" % (f+1, 
                             [w for w, n in tfidf.vocabulary_.items() if n == indices[f]],
                             importances[indices[f]]))

 1. ['worst']                      0.578800
 2. ['waste']                      0.363875
 3. ['hair']                       0.057325
 4. ['feelings']                   0.000000
 5. ['fell']                       0.000000
 6. ['fellow']                     0.000000
 7. ['felt']                       0.000000
 8. ['female']                     0.000000
 9. ['fi']                         0.000000
10. ['fiction']                    0.000000


### Gaussian Naive Bayes

In [54]:
# train using Decision Tree
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train_vector, y_train)

GaussianNB()

In [55]:
# train score
nb.score(X_train_vector, y_train)

0.959

In [56]:
# test score
nb.score(X_test_vector, y_test)

0.736

### KNN
- Not recommended

In [57]:
# train using K-NN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vector, y_train)

KNeighborsClassifier()

In [58]:
# train score
knn.score(X_train_vector, y_train)

0.81

In [59]:
# test score
knn.score(X_test_vector, y_test)

0.671

### Multi-layer Neural Network

In [60]:
# train using Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(learning_rate_init=0.01, max_iter=100, verbose=1)
mlp.fit(X_train_vector, y_train)

Iteration 1, loss = 0.65572950
Iteration 2, loss = 0.41854411
Iteration 3, loss = 0.22518655
Iteration 4, loss = 0.11131272
Iteration 5, loss = 0.05041157
Iteration 6, loss = 0.02154352
Iteration 7, loss = 0.01044572
Iteration 8, loss = 0.00586089
Iteration 9, loss = 0.00397530
Iteration 10, loss = 0.00302565
Iteration 11, loss = 0.00249385
Iteration 12, loss = 0.00219331
Iteration 13, loss = 0.00198647
Iteration 14, loss = 0.00184381
Iteration 15, loss = 0.00174265
Iteration 16, loss = 0.00166643
Iteration 17, loss = 0.00160388
Iteration 18, loss = 0.00155267
Iteration 19, loss = 0.00150936
Iteration 20, loss = 0.00147263
Iteration 21, loss = 0.00143810
Iteration 22, loss = 0.00140793
Iteration 23, loss = 0.00138066
Iteration 24, loss = 0.00135524
Iteration 25, loss = 0.00133195
Iteration 26, loss = 0.00131071
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(learning_rate_init=0.01, max_iter=100, verbose=1)

In [61]:
# train score
mlp.score(X_train_vector, y_train)

1.0

In [62]:
# test score
mlp.score(X_test_vector, y_test)

0.772

---
p37

# Quiz : Naver movie review classification(Korean)
- Use movie review dataset "kor_movie.csv"
- class : 0, 1 (neg, pos)
- data size : 200,000 - use first 1,000 texts
- use 70% as training set
1. Preprocess text using Okt to make TFIDF vectors - ignore terms occured in more than 10% of texts
2. Build model using Logistic Regression, Decision Tree, and Neural Network. Check the accuracies
3. Find most important 20 terms using DT

### Read dataset. Use first 1000 texts only

In [63]:
import pandas as pd
import numpy as np

# read dataset
df_kor = pd.read_csv("kor_movie.csv", encoding='utf-8')
df_kor.head(3)

Unnamed: 0,review,sentiment
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0


In [64]:
# use 1000 reviews 
df_sample = df_kor.iloc[:1000]
np.bincount(df_sample.sentiment)

array([508, 492], dtype=int64)

### Get training and test set

In [65]:
# get X and y
X = df_sample["review"].values
y = df_sample['sentiment'].values

# number of data
X.shape

(1000,)

In [66]:
# Split Dataset into 70% train and 30% test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_train.shape

(700,)

### Vectorize using Okt

In [67]:
# function for tokenizing + stemming using Okt
from konlpy.tag import Okt
okt = Okt()

def tokenizer_kor(text):
    return okt.morphs(text, norm=True, stem=True)

In [68]:
# vectorize to TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        max_df=0.1)

X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [69]:
# check automatic stop words
print(tfidf.stop_words_)

{'영화'}


In [70]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(700, 3900)

In [71]:
# text 0 
print(X_train[0])

진짜 잘 만든 수작


In [72]:
# vector of text 0 
print(X_train_vector[0])

[0. 0. 0. ... 0. 0. 0.]


### Logistic Regression

In [73]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1.0, max_iter=100, n_jobs=-1, random_state=42,
                       penalty='l2')
lr.fit(X_train_vector, y_train)

LogisticRegression(n_jobs=-1, random_state=42)

In [74]:
lr.score(X_train_vector, y_train)

0.9957142857142857

In [75]:
lr.score(X_test_vector, y_test)

0.61

### Decision Tree

In [76]:
X_train_vector.shape

(700, 3900)

In [77]:
y_train.shape

(700,)

In [78]:
# train using Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train_vector, y_train)

DecisionTreeClassifier(max_depth=2)

In [79]:
tree.score(X_train_vector, y_train)

0.5357142857142857

In [80]:
tree.score(X_test_vector, y_test)

0.5233333333333333

In [81]:
# finding 20 most important terms
importances = tree.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(20):
     print("%2d. %-30s %f" % (f+1, 
                             [w for w, n in tfidf.vocabulary_.items() if n == indices[f]],
                             importances[indices[f]]))

 1. ['최고의']                        0.572225
 2. ['재미있게']                       0.427775
 3. ['힘알이']                        0.000000
 4. ['모르겠네여']                      0.000000
 5. ['모자라진']                       0.000000
 6. ['모아서']                        0.000000
 7. ['모아놓은']                       0.000000
 8. ['모습만']                        0.000000
 9. ['모습도']                        0.000000
10. ['모습내가']                       0.000000
11. ['모를정도로']                      0.000000
12. ['모를까']                        0.000000
13. ['모르는거야']                      0.000000
14. ['모르고']                        0.000000
15. ['모르겠어']                       0.000000
16. ['모르겠다']                       0.000000
17. ['모르겠고']                       0.000000
18. ['명치를']                        0.000000
19. ['모르게']                        0.000000
20. ['모든인물들이']                     0.000000


### Multi-layer Nueral Network

In [82]:
# train using Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(learning_rate_init=0.01, max_iter=100, verbose=1)
mlp.fit(X_train_vector, y_train)

Iteration 1, loss = 0.69032744
Iteration 2, loss = 0.57465051
Iteration 3, loss = 0.40961648
Iteration 4, loss = 0.23075964
Iteration 5, loss = 0.10696951
Iteration 6, loss = 0.04608996
Iteration 7, loss = 0.02241610
Iteration 8, loss = 0.01324288
Iteration 9, loss = 0.00926550
Iteration 10, loss = 0.00739109
Iteration 11, loss = 0.00614140
Iteration 12, loss = 0.00538447
Iteration 13, loss = 0.00500980
Iteration 14, loss = 0.00478611
Iteration 15, loss = 0.00465900
Iteration 16, loss = 0.00452993
Iteration 17, loss = 0.00446834
Iteration 18, loss = 0.00440069
Iteration 19, loss = 0.00435693
Iteration 20, loss = 0.00433646
Iteration 21, loss = 0.00428524
Iteration 22, loss = 0.00426507
Iteration 23, loss = 0.00425337
Iteration 24, loss = 0.00428405
Iteration 25, loss = 0.00419126
Iteration 26, loss = 0.00415373
Iteration 27, loss = 0.00412079
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(learning_rate_init=0.01, max_iter=100, verbose=1)

In [83]:
mlp.score(X_train_vector, y_train)

0.9985714285714286

In [84]:
mlp.score(X_test_vector, y_test)

0.6233333333333333