# 텍스트 마이닝 기본문법

## 문장 토큰화
- 파이썬 머신러닝 완벽 가이드 (p.492)

In [1]:
from nltk import sent_tokenize
import nltk

# Download the Punkt data set (period / newline related data) that the tokenizers below rely on.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Sample text; the trailing backslashes continue the same string literal
# across source lines, so the indentation of the following lines is part of the string.
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes.'

# Split the sample into sentences, then show the container type,
# the number of sentences and the sentences themselves (one item per line).
sentences = sent_tokenize(text=text_sample)
print(type(sentences), len(sentences), sentences, sep='\n')

<class 'list'>
3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


## 단어 토큰화
- 문장을 각각의 단어로 다시 토큰화함

In [3]:
from nltk import word_tokenize

# Break one sentence into word tokens and show the container type,
# the token count and the tokens themselves (one item per line).
sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(type(words), len(words), words, sep='\n')

<class 'list'>
15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


## 문서를 단어로 토큰화하는 함수
- 문서 -> 문장 -> 단어 순으로 분리하여, 문서를 문장별 단어 묶음으로 만든다

In [4]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text, language='english'):
    """Tokenize a multi-sentence text into words, sentence by sentence.

    Args:
        text: Input string that may contain several sentences.
        language: Tokenizer model name forwarded to both NLTK tokenizers.
            Defaults to 'english', so existing one-argument calls keep
            their behavior.

    Returns:
        A list with one entry per detected sentence; each entry is the
        list of word tokens of that sentence.
    """
    # Split into sentences first so the word tokens stay grouped per sentence.
    sentences = sent_tokenize(text, language=language)
    # Word-tokenize every sentence with the same language setting.
    return [word_tokenize(sentence, language=language) for sentence in sentences]

In [5]:
# English news excerpt used to exercise tokenize_text.
text_sample = 'Wednesday’s episode occurred when a reporter asked the Republican leader if he was planning to run for reelection in 2026. McConnell had to ask him to repeat the question several times, chuckled for a moment, and then paused, closing his mouth and staring straight ahead.'
word_tokens = tokenize_text(text_sample)

# First line: container type and number of sentence-level entries; second line: the nested tokens.
sentence_count = len(word_tokens)
print(type(word_tokens), sentence_count)
print(word_tokens)

<class 'list'> 2
[['Wednesday', '’', 's', 'episode', 'occurred', 'when', 'a', 'reporter', 'asked', 'the', 'Republican', 'leader', 'if', 'he', 'was', 'planning', 'to', 'run', 'for', 'reelection', 'in', '2026', '.'], ['McConnell', 'had', 'to', 'ask', 'him', 'to', 'repeat', 'the', 'question', 'several', 'times', ',', 'chuckled', 'for', 'a', 'moment', ',', 'and', 'then', 'paused', ',', 'closing', 'his', 'mouth', 'and', 'staring', 'straight', 'ahead', '.']]


In [6]:
# Korean news excerpt run through the same tokenize_text helper.
text_sample = 'LS일렉트릭이 사상 처음 상반기 매출 2조원 고지를 밟았다. 최대 실적을 올린 지난해와 비교해 매출, 영업이익이 괄목할 성장세를 기록, 2년연속 최대 실적 달성에 청신호가 커졌다. 미국을 중심으로 한 해외 사업이 고속성장을 주도했다.'
word_tokens = tokenize_text(text_sample)

# First line: container type and number of sentence-level entries; second line: the nested tokens.
sentence_count = len(word_tokens)
print(type(word_tokens), sentence_count)
print(word_tokens)

<class 'list'> 3
[['LS일렉트릭이', '사상', '처음', '상반기', '매출', '2조원', '고지를', '밟았다', '.'], ['최대', '실적을', '올린', '지난해와', '비교해', '매출', ',', '영업이익이', '괄목할', '성장세를', '기록', ',', '2년연속', '최대', '실적', '달성에', '청신호가', '커졌다', '.'], ['미국을', '중심으로', '한', '해외', '사업이', '고속성장을', '주도했다', '.']]


## Stopwords 제거
- 불용어 : 분석에 큰 의미가 없는 단어
- 파이썬 머신러닝 완벽 가이드 (p.495)

In [7]:
import nltk
# Download the stop-word corpus that nltk.corpus.stopwords reads in the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Load the English stop-word list once and reuse it for both the count and the listing
# (the list was previously fetched from the corpus twice).
english_stopwords = nltk.corpus.stopwords.words('english')
print('영어 불용어 갯수 :', len(english_stopwords))
print(english_stopwords)

영어 불용어 갯수 : 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',

In [9]:
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes.'
word_tokens = tokenize_text(text_sample)

# Start from the stock English stop words and append extra words; this is how
# a domain-specific analysis can extend the default list.
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['everywhere', 'us']
# Build the set once: membership tests on a set are O(1), while testing
# against the list would rescan it for every token.
stopword_set = set(stopwords)

# For each tokenized sentence, lower-case every token and keep only the
# tokens that are not stop words. The result keeps one list per sentence.
all_tokens = [
    [word for word in map(str.lower, sentence) if word not in stopword_set]
    for sentence in word_tokens
]

print(all_tokens)

[['matrix', 'around', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


## 어근 추출

In [10]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

# Each tuple is a family of inflected forms; one output line is printed per
# family, with the stems separated by single spaces.
word_families = [
    ('working', 'works', 'worked'),
    ('amusing', 'amuses', 'amused'),
    ('happier', 'happiest'),
    ('fancier', 'fanciest'),
]
for family in word_families:
    print(*[stemmer.stem(form) for form in family])

work work work
amus amus amus
happy happiest
fant fanciest


In [11]:
from nltk.stem import WordNetLemmatizer
import nltk
# The WordNet corpus is downloaded before the lemmatizer is used.
nltk.download('wordnet')

lemma = WordNetLemmatizer()

# Pairs of (inflected forms, part-of-speech tag); 'v' and 'a' are passed to
# lemmatize() unchanged. One output line is printed per pair.
lemma_examples = [
    (('amusing', 'amuses', 'amused'), 'v'),
    (('happier', 'happiest'), 'a'),
    (('fancier', 'fanciest'), 'a'),
]
for forms, pos in lemma_examples:
    print(*[lemma.lemmatize(form, pos) for form in forms])

[nltk_data] Downloading package wordnet to /root/nltk_data...


amuse amuse amuse
happy happy
fancy fancy


## 데이터 불러오기

In [12]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import pandas as pd
# Folder on the mounted Drive that holds the text-mining data files.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/text_mining/'

# Tab-separated file whose first row is the header; quoting=3 is the numeric
# value of csv.QUOTE_NONE, so quote characters stay part of the field text.
review_path = DATA_PATH + './labeledTrainData.tsv'
review_df = pd.read_csv(review_path, sep='\t', header=0, quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [14]:
# Show the raw text of the review stored under index label 0 (still contains <br /> tags).
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

## 텍스트 전처리

In [15]:
import re
# Replace every HTML line-break tag in the reviews with a single space.
# regex=False states the literal match explicitly, because the default of
# `regex` in Series.str.replace differs between pandas versions.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex=False)
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on f

In [16]:
# Turn every character that is not an ASCII letter into a space.
# The pattern is compiled once and applied to each review in turn.
non_letters = re.compile("[^a-zA-Z]")
review_df['review'] = review_df['review'].apply(lambda text: non_letters.sub(" ", text))
review_df['review'][0]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

## 훈련데이터, 테스트 데이터 분리

In [17]:
from sklearn.model_selection import train_test_split
# Dependent variable Y: the sentiment label (1 = positive, 0 = negative).
class_df = review_df['sentiment']
# Independent variables: drop the id and label columns so only the review text remains.
feature_df = review_df.drop(columns=['id', 'sentiment'])

# 70 / 30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size=0.3,
    random_state=156,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17500, 1), (7500, 1), (17500,), (7500,))

## 모델 학습

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Stage 1: bag-of-words counts over unigrams and bigrams, with the built-in English stop words removed.
count_stage = ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2)))
# Stage 2: logistic regression using the liblinear solver with C=10.
classifier_stage = ('lr_clf', LogisticRegression(solver='liblinear', C=10))

pipeline = Pipeline([count_stage, classifier_stage])

# Fit on the raw review strings; the vectorizer stage runs inside the pipeline.
pipeline.fit(X_train['review'], y_train)

In [19]:
# Hard class predictions, plus column 1 of predict_proba, used as the score for ROC-AUC.
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8861, ROC-AUC는 0.9503


## TF-IDF를 활용한 모델 학습

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Stage 1: TF-IDF weights over unigrams and bigrams, with the built-in English stop words removed.
# NOTE(review): the step is still labelled 'cnt_vect' although it holds a TfidfVectorizer;
# the label is kept as-is because renaming it would change the step's parameter prefix.
tfidf_stage = ('cnt_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)))
# Stage 2: logistic regression using the liblinear solver with C=10.
classifier_stage = ('lr_clf', LogisticRegression(solver='liblinear', C=10))

pipeline = Pipeline([tfidf_stage, classifier_stage])

# Fit on the raw review strings; the vectorizer stage runs inside the pipeline.
pipeline.fit(X_train['review'], y_train)

In [21]:
# Hard class predictions, plus column 1 of predict_proba, used as the score for ROC-AUC.
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8936, ROC-AUC는 0.9598


In [22]:
# Fit a TfidfVectorizer with default settings on the whole (cleaned) review column.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(review_df['review'])
# Display the matrix dimensions as the cell result.
X.shape

(25000, 73246)

In [23]:
# Print the stored entries of the first row: (row, column index) followed by the TF-IDF weight.
print(X[0])

  (0, 36529)	0.03974445613837398
  (0, 30273)	0.029526979255486613
  (0, 37217)	0.06590362078693349
  (0, 58569)	0.06757019202956438
  (0, 62320)	0.029251478175035667
  (0, 22137)	0.03185330362188128
  (0, 19924)	0.0276704029457669
  (0, 22262)	0.023374351702627583
  (0, 18459)	0.047793724857004616
  (0, 11758)	0.049576865336142084
  (0, 5517)	0.030517729801371844
  (0, 17121)	0.02651783613037876
  (0, 5208)	0.011724497192971445
  (0, 9203)	0.014599330248345857
  (0, 18346)	0.017544618988878206
  (0, 29861)	0.05388336330754227
  (0, 62417)	0.03510381181138195
  (0, 25678)	0.030709837573016825
  (0, 3772)	0.03282939992448652
  (0, 71002)	0.032850278942684885
  (0, 48796)	0.039996102193618065
  (0, 27026)	0.0413997879418719
  (0, 21498)	0.019784715573515087
  (0, 63836)	0.03609765306311357
  (0, 66844)	0.02832350596796642
  :	:
  (0, 42364)	0.12388113500668683
  (0, 71838)	0.066690516955207
  (0, 70699)	0.05283792624318449
  (0, 64904)	0.01372025884752695
  (0, 2147)	0.07722617055311431


In [24]:
# Fit a CountVectorizer with default settings on the same review column;
# this rebinds `vectorizer` and `X` from the TF-IDF cell.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(review_df['review'])
# Display the matrix dimensions as the cell result.
X.shape

(25000, 73246)

In [25]:
# Print the stored entries of the first row: (row, column index) followed by the raw term count.
print(X[0])

  (0, 71786)	4
  (0, 1558)	4
  (0, 65032)	11
  (0, 62272)	1
  (0, 26679)	3
  (0, 18635)	1
  (0, 3639)	2
  (0, 64811)	19
  (0, 42129)	1
  (0, 41901)	11
  (0, 69374)	2
  (0, 61392)	1
  (0, 37621)	1
  (0, 65592)	9
  (0, 29773)	3
  (0, 43124)	2
  (0, 70705)	1
  (0, 45213)	1
  (0, 18164)	1
  (0, 29351)	1
  (0, 2147)	10
  (0, 64904)	1
  (0, 70699)	2
  (0, 71838)	1
  (0, 42364)	2
  :	:
  (0, 66844)	1
  (0, 63836)	1
  (0, 21498)	1
  (0, 27026)	1
  (0, 48796)	1
  (0, 71002)	2
  (0, 3772)	1
  (0, 25678)	1
  (0, 62417)	1
  (0, 29861)	1
  (0, 18346)	1
  (0, 9203)	1
  (0, 5208)	1
  (0, 17121)	1
  (0, 5517)	1
  (0, 11758)	1
  (0, 18459)	1
  (0, 22262)	1
  (0, 19924)	1
  (0, 22137)	1
  (0, 62320)	1
  (0, 58569)	1
  (0, 37217)	1
  (0, 30273)	1
  (0, 36529)	1


## 캐글 Mercari Price Suggestion Challenge
- 파이썬 머신러닝 완벽 가이드 (p.566)
- 기존 정형데이터 + 텍스트 마이닝 --> 지도학습