# 텍스트 분류 (Text Classification)

## 텍스트 전처리

### Tokenization (토큰화)

In [57]:
import nltk
# Download the Punkt tokenizer data needed by the tokenization cells below.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [58]:
# Sample paragraph (three sentences) used by the tokenization examples.
# NOTE: the backslash continuations are inside the string literal, so the
# indentation of the continued lines becomes part of the string itself.
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes'
text_sample

'The Matrix is everywhere its all around us, here even in this room.                You can see it out your window or on your television.                You feel it when you go to work, or go to church or pay your taxes'

In [59]:
from nltk import sent_tokenize

# Split the paragraph into a list of sentence strings.
sentences = sent_tokenize(text_sample)
sentences

['The Matrix is everywhere its all around us, here even in this room.',
 'You can see it out your window or on your television.',
 'You feel it when you go to work, or go to church or pay your taxes']

In [60]:
from nltk import word_tokenize

# Split the whole paragraph into one flat list of word and punctuation tokens.
words = word_tokenize(text_sample)
words

['The',
 'Matrix',
 'is',
 'everywhere',
 'its',
 'all',
 'around',
 'us',
 ',',
 'here',
 'even',
 'in',
 'this',
 'room',
 '.',
 'You',
 'can',
 'see',
 'it',
 'out',
 'your',
 'window',
 'or',
 'on',
 'your',
 'television',
 '.',
 'You',
 'feel',
 'it',
 'when',
 'you',
 'go',
 'to',
 'work',
 ',',
 'or',
 'go',
 'to',
 'church',
 'or',
 'pay',
 'your',
 'taxes']

In [61]:
# Tokenize each sentence on its own: one list of tokens per sentence.
li = [word_tokenize(sentence) for sentence in sentences]
li

[['The',
  'Matrix',
  'is',
  'everywhere',
  'its',
  'all',
  'around',
  'us',
  ',',
  'here',
  'even',
  'in',
  'this',
  'room',
  '.'],
 ['You',
  'can',
  'see',
  'it',
  'out',
  'your',
  'window',
  'or',
  'on',
  'your',
  'television',
  '.'],
 ['You',
  'feel',
  'it',
  'when',
  'you',
  'go',
  'to',
  'work',
  ',',
  'or',
  'go',
  'to',
  'church',
  'or',
  'pay',
  'your',
  'taxes']]

### Stop words

In [62]:
import nltk
# Download the stop-word corpus read through nltk.corpus.stopwords below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# Load the English stop-word list and preview its first ten entries.
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [64]:
# Number of entries in the English stop-word list.
len(stopwords)

179

In [65]:
# Remove stop words from the token list.
# A set gives constant-time membership tests instead of scanning the whole
# stop-word list once per token.
# NOTE: the comparison is case-sensitive, so capitalized tokens such as
# 'The' and 'You' do not match the lowercase stop-word entries and are kept.
stopword_set = set(stopwords)

st_li = [word for word in words if word not in stopword_set]
st_li

['The',
 'Matrix',
 'everywhere',
 'around',
 'us',
 ',',
 'even',
 'room',
 '.',
 'You',
 'see',
 'window',
 'television',
 '.',
 'You',
 'feel',
 'go',
 'work',
 ',',
 'go',
 'church',
 'pay',
 'taxes']

### Stemming & Lemmatization
- Stemming: 어간 추출 — 접사를 잘라내 어간만 남김 (plays -> play)
- Lemmatization: 표제어 추출 (is, was -> be)

In [132]:
from nltk import LancasterStemmer
from nltk import WordNetLemmatizer
#nltk.download('omw-1.4') or nltk.download('wordnet')

In [19]:
# Stem a few sample words with the Lancaster stemmer and print each result.
stemmer = LancasterStemmer()

for sample in ('plays', 'working', 'happier'):
    print(stemmer.stem(sample))

play
work
happy


In [20]:
# Lemmatize sample words; each pair is (word, part-of-speech tag given to lemmatize).
lemma = WordNetLemmatizer()

for sample, pos in (('plays', 'v'), ('is', 'v'), ('happier', 'a')):
    print(lemma.lemmatize(sample, pos))

play
be
happy


## 피처 벡터화 (Bag of Words)

In [53]:
from sklearn.datasets import fetch_20newsgroups

In [54]:
# Fetch the 20 Newsgroups dataset with subset='all' and a fixed random_state,
# then display the returned object.
news_data = fetch_20newsgroups(subset='all', random_state=0)
news_data

{'data': ['From: judy@cbnewsd.cb.att.com (judith.diehl)\nSubject: 1945 BLONDIE CARD BOARD JIG SAW PUZZLE FOR SALE\nOrganization: AT&T\nDistribution: usa\nKeywords: puzzle Blondie Dagwood\nLines: 20\n\n\n                       FOR SALE\n\n                 1945 King Feature Syndicate\n                 Jaymar Specialty Company\n                 200 Fifth Avenue New York, NY\n\n                 Cardboard puzzle - NO BOX\n                 Pieces worn from use\n                 NO MISSING PIECES\n                 Size: 13 3/4 inches by 21 1/2 inches\n                 60 Puzzle Pieces\n\n   Puzzle depicts Dagwood, Blondie, the kids, and dog Daisey with her\n   puppies on a picnic with Dagwood and Alexander trying to get\n   a fishing line out of a tree.\n\n                   $10.00 plus Shipping Charges\n                 I can be reached by EMAIL or CALL:\n                   JUDY DIEHL (219) 838-8234\n',
  "From: raunoh@otol.fi (Rauno Haapaniemi)\nSubject: REAL-3D\nNntp-Posting-Host: janus.ot

In [55]:
# List the fields available on the dataset object.
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [56]:
import pandas as pd
# Count the documents per target class id, ordered by class id.
pd.DataFrame(news_data['target']).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [57]:
# Show the raw text of the first document (headers included).
news_data['data'][0]

'From: judy@cbnewsd.cb.att.com (judith.diehl)\nSubject: 1945 BLONDIE CARD BOARD JIG SAW PUZZLE FOR SALE\nOrganization: AT&T\nDistribution: usa\nKeywords: puzzle Blondie Dagwood\nLines: 20\n\n\n                       FOR SALE\n\n                 1945 King Feature Syndicate\n                 Jaymar Specialty Company\n                 200 Fifth Avenue New York, NY\n\n                 Cardboard puzzle - NO BOX\n                 Pieces worn from use\n                 NO MISSING PIECES\n                 Size: 13 3/4 inches by 21 1/2 inches\n                 60 Puzzle Pieces\n\n   Puzzle depicts Dagwood, Blondie, the kids, and dog Daisey with her\n   puppies on a picnic with Dagwood and Alexander trying to get\n   a fishing line out of a tree.\n\n                   $10.00 plus Shipping Charges\n                 I can be reached by EMAIL or CALL:\n                   JUDY DIEHL (219) 838-8234\n'

In [62]:
# Load the 'train' and 'test' subsets with headers, footers and quotes removed,
# keeping the documents as X_* and the class ids as y_*.
removed_parts = ('headers','footers','quotes')

train_news = fetch_20newsgroups(subset='train', remove=removed_parts, random_state=0)
X_train, y_train = train_news.data, train_news.target

test_news = fetch_20newsgroups(subset='test', remove=removed_parts, random_state=0)
X_test, y_test = test_news.data, test_news.target

### CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training documents only, then encode both splits with it.
cnt_vect = CountVectorizer().fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train logistic regression on the count features and report test accuracy.
lr_clf = LogisticRegression(solver='liblinear').fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
accuracy_score(y_test, pred)

0.6167020711630377

### TfidfVectorizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training documents, then encode both splits with it.
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# Train logistic regression on the TF-IDF features and report test accuracy.
lr_clf = LogisticRegression().fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)

0.6736590546999469

In [None]:
# Applying stop_words raises the accuracy slightly.
tfidf_vect = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# Same classifier as before, trained on the stop-word-filtered features.
lr_clf = LogisticRegression().fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)

0.6909187466808284

In [24]:
#Pipeline

In [23]:
from sklearn.pipeline import Pipeline
# Chain vectorization and classification so fit/predict take the raw documents.
steps = [
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(random_state=0)),
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
accuracy_score(y_test, pred)

0.6909187466808284

### 희소행렬

#### coo 형식 (Coordinate: 좌표)
: 0이 아닌 데이터들의 row, col 위치 저장하기

In [102]:
from scipy import sparse

In [103]:
# numpy is imported here because this is the first cell, in file order, that uses np.
import numpy as np

# Dense 6x6 reference matrix that the COO triplets (data, row, col) encode.
# Row 1, column 3 holds 5 so that it agrees with the triplet values and with
# the array produced by coo_matrix(...).toarray().
dense = np.array([[0,0,1,0,0,5],
                  [1,4,0,5,2,5],
                  [0,6,0,3,0,0],
                  [2,0,0,0,0,0],
                  [0,0,0,7,0,8],
                  [1,0,0,0,0,0]])

In [104]:
# COO triplets: data[k] is the value stored at position (row[k], col[k]).
data = np.array([1,5,1,4,5,2,5,6,3,2,7,8,1])
row = np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col = np.array([2,5,0,1,3,4,5,1,3,0,3,5,0])

In [105]:
# Build the COO sparse matrix from the triplets and expand it to a dense array for inspection.
sparse.coo_matrix((data, (row, col))).toarray()

array([[0, 0, 1, 0, 0, 5],
       [1, 4, 0, 5, 2, 5],
       [0, 6, 0, 3, 0, 0],
       [2, 0, 0, 0, 0, 0],
       [0, 0, 0, 7, 0, 8],
       [1, 0, 0, 0, 0, 0]])

#### csr 형식 (Compressed Sparse Row)
: 행위치 배열 (행 위치의 시작 인덱스만 표기하는 방법)

In [109]:
# Dense 6x6 reference matrix that the CSR arrays (data, col, row_index) encode.
# Row 1, column 3 holds 5 so that it agrees with the stored values and with
# the array produced by csr_matrix(...).toarray().
dense = np.array([[0,0,1,0,0,5],
                  [1,4,0,5,2,5],
                  [0,6,0,3,0,0],
                  [2,0,0,0,0,0],
                  [0,0,0,7,0,8],
                  [1,0,0,0,0,0]])

In [114]:
# Non-zero values listed row by row, and the column index of each value.
data = np.array([1,5,1,4,5,2,5,6,3,2,7,8,1])
# Per-value row indices from the COO example, kept for comparison with row_index.
#row = np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col = np.array([2,5,0,1,3,4,5,1,3,0,3,5,0])

# row_index[i] is the offset in data/col where row i starts; the final entry
# (13) equals the total number of stored values.
row_index = np.array([0,2,7,9,10,12,13])

In [115]:
# Build the CSR matrix from (values, column indices, row start offsets) and expand it to a dense array.
sparse.csr_matrix((data, col, row_index)).toarray()

array([[0, 0, 1, 0, 0, 5],
       [1, 4, 0, 5, 2, 5],
       [0, 6, 0, 3, 0, 0],
       [2, 0, 0, 0, 0, 0],
       [0, 0, 0, 7, 0, 8],
       [1, 0, 0, 0, 0, 0]])

# 감성 분석 (Sentiment Analysis)

## 지도학습 기반 감성 분석

In [28]:
import pandas as pd

In [29]:
# Load the tab-separated labeled reviews; the first line is the header row.
# quoting=3 is csv.QUOTE_NONE, so double quotes are kept as literal characters.
review_df = pd.read_csv('ml_data/labeledTrainData.tsv', header=0, sep='\t', quoting=3)
review_df[:3]

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [30]:
import re

In [31]:
# Inspect the first review as loaded (still contains quotes and <br /> tags).
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [32]:
# Replace HTML line breaks with a space, then blank out every character that is not an ASCII letter.
non_letters = re.compile("[^a-zA-Z]")
without_breaks = review_df['review'].str.replace('<br />', " ")
review_df['review'] = without_breaks.apply(lambda text: non_letters.sub(" ", text))

In [33]:
# Inspect the same review again after the cleaning step.
review_df['review'][0]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

In [34]:
# Preview the first three rows of the review frame.
review_df.head(3) 

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,With all this stuff going down at the moment ...
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...


In [35]:
from sklearn.model_selection import train_test_split

# Features keep only the review text; labels keep only the sentiment column.
feature_df = review_df.drop(columns=['id', 'sentiment'])
class_df = review_df.drop(columns=['id', 'review'])

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df, class_df, test_size=0.3, random_state=156)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17500, 1), (7500, 1), (17500, 1), (7500, 1))

## 비지도학습 기반 감성 분석

### SentiWordNet

In [36]:
import nltk
#nltk.download('all')

In [37]:
from nltk.corpus import wordnet as wn

# Look up every WordNet synset for the term (the result mixes noun, verb and adjective senses).
term = 'present'
synsets = wn.synsets(term)
synsets

[Synset('present.n.01'),
 Synset('present.n.02'),
 Synset('present.n.03'),
 Synset('show.v.01'),
 Synset('present.v.02'),
 Synset('stage.v.01'),
 Synset('present.v.04'),
 Synset('present.v.05'),
 Synset('award.v.01'),
 Synset('give.v.08'),
 Synset('deliver.v.01'),
 Synset('introduce.v.01'),
 Synset('portray.v.04'),
 Synset('confront.v.03'),
 Synset('present.v.12'),
 Synset('salute.v.06'),
 Synset('present.a.01'),
 Synset('present.a.02')]

### Vader

In [47]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Compute VADER polarity scores for the first review.
senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0])
senti_scores
# neg: negative score, neu: neutral score, pos: positive score, compound: combination of the three; 0.1 or above is treated as positive, below as negative

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}

# 문서 코사인 유사도

- A · B = ‖A‖ ‖B‖ cosθ  (A · B: 두 벡터의 내적, ‖A‖: 벡터 A의 길이)
- similarity = cosθ = (A · B) / (‖A‖ ‖B‖)

In [30]:
import numpy as np

# Cosine similarity of two small vectors: dot product divided by the product of their lengths.
v1 = np.array([1, 2])
v2 = np.array([2, 5])

dot_product = v1 @ v2
magnitude = np.linalg.norm(v1) * np.linalg.norm(v2)
similarity = dot_product / magnitude

similarity

0.9965457582448796

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents that share many words, used to compare similarity.
doc_list = ['if you take the blue pill, the story ends' ,
            'if you take the red pill, you stay in Wonderland',
            'if you take the red pill, I show you how deep the rabbit hole goes']

# Fit TF-IDF on the three documents and encode them in one step.
tfidf_vect = TfidfVectorizer()
doc_tfidf_vect = tfidf_vect.fit_transform(doc_list)

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity of the first document against all three documents (itself included, hence the leading 1.0).
cosine_similarity(doc_tfidf_vect[0], doc_tfidf_vect)

array([[1.        , 0.40207758, 0.40425045]])

# 한글 텍스트 처리 (KoNLPy)

In [38]:
import warnings
# Suppress all warnings from this point on (no category filter is given).
warnings.filterwarnings('ignore')

In [39]:
from konlpy.tag import Twitter

# Morpheme analysis with the Twitter tagger.
# NOTE(review): KoNLPy appears to have renamed Twitter to Okt; presumably kept here for comparison - confirm.
twitter = Twitter()
twitter.morphs('우리나라 대한민국')

['우리나라', '대한민국']

In [40]:
from konlpy.tag import Okt

# Same morpheme analysis with the Okt tagger.
okt = Okt()
okt.morphs('우리나라 대한민국')

['우리나라', '대한민국']

In [41]:
# Load the tab-separated Naver movie review training file and preview the first two rows.
train_df = pd.read_csv('ml_data/naver_movie/ratings_train.txt', sep='\t')
train_df[:2]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [42]:
import re

digit_run = re.compile(r"\d+")

# Replace missing values with a single space.
train_df = train_df.fillna(' ')
# Replace every run of digits with a space (\d matches a digit in a regular expression).
train_df['document'] = train_df['document'].apply(lambda doc: digit_run.sub(" ", doc))
train_df.drop(columns='id', inplace=True)
train_df[:2]

Unnamed: 0,document,label
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [31]:
# Load the test set and apply the same cleaning: missing values and digit runs become spaces.
digit_run = re.compile(r"\d+")

test_df = pd.read_csv('ml_data/naver_movie/ratings_test.txt', sep='\t').fillna(' ')
test_df['document'] = test_df['document'].apply(lambda doc: digit_run.sub(" ", doc))
test_df.drop(columns='id', inplace=True)

In [35]:
def tw_tokenizer(text):
    """Tokenize ``text`` into morphemes and return them as a list.

    Uses the module-level ``okt`` tagger rather than the ``Twitter`` wrapper,
    which KoNLPy deprecated in favour of ``Okt``.

    Args:
        text: The document string to tokenize.

    Returns:
        The list of morpheme strings produced by ``okt.morphs``.
    """
    return okt.morphs(text)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# TF-IDF over morpheme tokens using unigrams and bigrams, with min_df=3 and
# max_df=0.9 as document-frequency cut-offs; fitted on the training documents only.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train_df['document'])
tfidf_matrix_train = tfidf_vect.transform(train_df['document'])
tfidf_matrix_test = tfidf_vect.transform(test_df['document'])

# Train logistic regression on the training matrix and report test accuracy.
lr = LogisticRegression().fit(tfidf_matrix_train, train_df['label'])
pred = lr.predict(tfidf_matrix_test)
accuracy_score(test_df['label'], pred)

0.85612