# NLP Basic Assignment
## NLP 맛보기 - spam.csv를 가지고 유의미한 해석을 도출해주세요!

In [3]:
# 필요한 모듈 불러오기

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive so files under /content/drive are readable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다. 물론 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고, 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다 :)

In [4]:
# Load spam.csv from the mounted Drive folder into a DataFrame
spam = pd.read_csv('/content/drive/MyDrive/투빅스/spam.csv')

In [5]:
# Peek at the text (column v2) of row 5
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [6]:
# Encode the label column as integers: ham -> 0, spam -> 1
spam.v1 = spam.v1.replace(['ham','spam'],[0,1])
spam

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [7]:
# Summarize column dtypes and non-null counts
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   int64 
 1   v2      5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


## Tokenizing


In [9]:
import nltk

In [12]:
# 특수 문자 제거 및 소문자로 변경
import re

def cleanText(data):
  """Replace every non-letter character with a space and lowercase the result.

  Args:
    data: Message text to clean.

  Returns:
    A string of the same length made up only of the letters a-z and spaces.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', data)
  return letters_only.lower()

# Clean every message; this overwrites the raw text stored in spam.v2
spam.v2 = spam.v2.apply(cleanText)

In [13]:
# Example: tokenize a single cleaned message
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer data before calling word_tokenize
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead — confirm
nltk.download('punkt')
word_tokenize(spam.iloc[5]['v2'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['freemsg',
 'hey',
 'there',
 'darling',
 'it',
 's',
 'been',
 'week',
 's',
 'now',
 'and',
 'no',
 'word',
 'back',
 'i',
 'd',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'to',
 'send',
 'to',
 'rcv']

In [14]:
# English stop-word list
nltk.download('stopwords')
from nltk.corpus import stopwords

# Held in a set; it is used for `not in` membership tests when filtering tokens
stop_words = set(stopwords.words('english')) 
print(stop_words)

{'while', 'this', "haven't", 'nor', 'is', 'am', 'its', 'that', 'each', 'by', 'wasn', 'very', 'than', 'll', "shan't", "hadn't", "don't", "hasn't", 'about', 'yours', 'o', 'i', 'ain', 'such', 'with', 'didn', 'hers', 'no', 'had', 'him', 'out', 'into', 'because', 'down', 'same', "shouldn't", 'shan', 'your', 's', "that'll", "didn't", "weren't", 'all', 'mustn', 'now', "isn't", 'theirs', 'and', "it's", 'once', 'ours', 'our', "you'd", 'she', 'at', 'what', 'himself', 'whom', 'his', 've', 'has', 'don', 'or', 'them', 'which', 'under', 'to', 'myself', 'having', 'hadn', 'during', 'from', 'through', 'my', 't', 'wouldn', 'where', 'her', 'does', 'how', 'most', 'their', "you've", 'any', 'a', 'will', 'few', 'are', 'ourselves', 'be', 'for', 'if', 'he', 'needn', 'over', 'who', 'until', 'in', 'yourself', 'here', 'you', 'should', 'y', 'yourselves', 'why', 'me', "doesn't", 'do', 'we', 'there', 'only', 'd', 'did', 'these', 'ma', 'when', 'between', 'too', 'it', 'again', 'themselves', 'can', 'further', 'itself',

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
# Work on a copy so the cleaned text in `spam` is left as-is
wd_test = spam.copy()

# Tokenize each message and keep only the tokens that are not stop words
result = [
  [token for token in word_tokenize(message) if token not in stop_words]
  for message in wd_test.v2
]

wd_test.v2 = result
wd_test

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,1,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,"[b, going, esplanade, fr, home]"
5569,0,"[pity, mood, suggestions]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram, GloVe, FastText가 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [16]:
from itertools import chain

from sklearn.feature_extraction.text import CountVectorizer

# Token frequencies for ham (label 0) messages.
# lowercase=False: the text was already lowercased by cleanText.
# stop_words='english' applies scikit-learn's own list on top of the NLTK
# filtering done when wd_test was built.
# NOTE(review): the default token pattern appears to ignore one-character
# tokens such as 'u' — confirm this is intended.
ct_vector1 = CountVectorizer(max_features=500, stop_words='english', lowercase=False)

# Flatten the per-message token lists into one flat list of tokens.
# chain.from_iterable is linear; sum(lists, []) re-copies the accumulated
# list on every addition and is quadratic in the number of tokens.
corpus_ham = list(chain.from_iterable(wd_test[wd_test.v1 == 0]['v2']))

# Every token is passed as its own "document", so column sums are token counts
ct_ham = ct_vector1.fit_transform(corpus_ham)

In [19]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
ham_vocab = (ct_vector1.get_feature_names_out()
             if hasattr(ct_vector1, 'get_feature_names_out')
             else ct_vector1.get_feature_names())

# Column sums of the document-term matrix give one total per vocabulary word;
# flatten the 1 x n result into a plain 1-D array for the DataFrame column.
count_ham = pd.DataFrame({'word': ham_vocab,
                          'count': np.asarray(ct_ham.sum(axis=0)).ravel()})
count_ham.sort_values('count', ascending=False)  # most frequent first



Unnamed: 0,word,count
161,gt,318
247,lt,316
300,ok,287
157,got,245
453,ur,241
...,...,...
450,ugh,12
451,uncle,12
144,game,12
143,gal,12


In [18]:
from itertools import chain

# Token frequencies for spam (label 1) messages.
# lowercase=False: the text was already lowercased by cleanText.
# NOTE(review): the default token pattern appears to ignore one-character
# tokens such as 'u' — confirm this is intended.
ct_vector2 = CountVectorizer(max_features=500, stop_words='english', lowercase=False)

# Flatten the per-message token lists into one flat list of tokens.
# chain.from_iterable is linear; sum(lists, []) re-copies the accumulated
# list on every addition and is quadratic in the number of tokens.
corpus_spam = list(chain.from_iterable(wd_test[wd_test.v1 == 1]['v2']))

# Every token is passed as its own "document", so column sums are token counts
ct_spam = ct_vector2.fit_transform(corpus_spam)

In [21]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
spam_vocab = (ct_vector2.get_feature_names_out()
              if hasattr(ct_vector2, 'get_feature_names_out')
              else ct_vector2.get_feature_names())

# Column sums of the document-term matrix give one total per vocabulary word;
# flatten the 1 x n result into a plain 1-D array for the DataFrame column.
count_spam = pd.DataFrame({'word': spam_vocab,
                           'count': np.asarray(ct_spam.sum(axis=0)).ravel()})
count_spam.sort_values('count', ascending=False)  # most frequent first



Unnamed: 0,word,count
147,free,228
436,txt,170
446,ur,144
252,mobile,129
411,text,126
...,...,...
222,loan,4
438,txtin,4
212,largest,4
208,laid,4


In [23]:
# Word2Vec, skip-gram variant (sg=1), trained on spam messages only
import gensim
from gensim.models import Word2Vec

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
# keyword names that match the installed release so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
  w2v_dims = {'vector_size': 100, 'epochs': 200}
else:
  w2v_dims = {'size': 100, 'iter': 200}

# min_count=5 drops rare tokens; window=2 uses two context words per side
model = Word2Vec(wd_test[wd_test.v1 == 1]['v2'], min_count=5, window=2, sg=1, **w2v_dims)

In [24]:
model.wv.most_similar('free') # query with 'free' because it is the most frequent token in the spam count table

[('minutes', 0.4158509373664856),
 ('tomorrow', 0.38493382930755615),
 ('arcade', 0.3659352660179138),
 ('stoptxt', 0.36134833097457886),
 ('half', 0.35910874605178833),
 ('phones', 0.34854406118392944),
 ('tariffs', 0.3451172113418579),
 ('sipix', 0.34073418378829956),
 ('anytime', 0.3378395736217499),
 ('sonyericsson', 0.337372362613678)]

In [28]:
# Train on ham (non-spam) messages: skip-gram variant (sg=1)
import gensim

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
# keyword names that match the installed release so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
  w2v_dims = {'vector_size': 100, 'epochs': 200}
else:
  w2v_dims = {'size': 100, 'iter': 200}

model2 = Word2Vec(wd_test[wd_test.v1 == 0]['v2'], min_count=5, window=2, sg=1, **w2v_dims)

In [29]:
# Query with 'ok', a top ham token. In the ham count table 'gt' and 'lt' rank
# above it; presumably those are residue of '&gt;'/'&lt;' rather than real
# words — verify against the raw messages.
model2.wv.most_similar('ok')

[('sed', 0.46579670906066895),
 ('uni', 0.3881266117095947),
 ('ard', 0.38141757249832153),
 ('home', 0.38025519251823425),
 ('din', 0.37558305263519287),
 ('cuz', 0.37191152572631836),
 ('worry', 0.36797699332237244),
 ('coffee', 0.3549058437347412),
 ('mm', 0.3509550094604492),
 ('sexy', 0.35051658749580383)]

# CBOW

In [26]:
# CBOW variant (sg=0) on spam messages, same settings as the skip-gram run.
# NOTE: this rebinds `model`, so the skip-gram spam model is replaced.
import gensim

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
# keyword names that match the installed release so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
  w2v_dims = {'vector_size': 100, 'epochs': 200}
else:
  w2v_dims = {'size': 100, 'iter': 200}

model = Word2Vec(wd_test[wd_test.v1 == 1]['v2'], min_count=5, window=2, sg=0, **w2v_dims)

In [27]:
model.wv.most_similar('free') # query with 'free' because it is the most frequent token in the spam count table

[('minutes', 0.39282387495040894),
 ('plus', 0.3881177306175232),
 ('latest', 0.38793638348579407),
 ('st', 0.3876240849494934),
 ('phones', 0.3786889612674713),
 ('get', 0.3667924404144287),
 ('arcade', 0.35121119022369385),
 ('fun', 0.34342044591903687),
 ('mths', 0.33908140659332275),
 ('may', 0.33262723684310913)]

In [30]:
# CBOW variant (sg=0) on ham (non-spam) messages.
# NOTE: this rebinds `model2`, so any earlier ham model is replaced.
import gensim

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
# keyword names that match the installed release so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
  w2v_dims = {'vector_size': 100, 'epochs': 200}
else:
  w2v_dims = {'size': 100, 'iter': 200}

model2 = Word2Vec(wd_test[wd_test.v1 == 0]['v2'], min_count=5, window=2, sg=0, **w2v_dims)

In [31]:
# Nearest neighbours of 'ok' in the model currently bound to model2
model2.wv.most_similar('ok') 

[('hi', 0.3172110319137573),
 ('din', 0.2852504849433899),
 ('ard', 0.2762475907802582),
 ('home', 0.2752530574798584),
 ('anything', 0.2465279996395111),
 ('cuz', 0.24640731513500214),
 ('yar', 0.24214684963226318),
 ('going', 0.23302972316741943),
 ('plans', 0.22402477264404297),
 ('worry', 0.2228194922208786)]

## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

결과를 보면 전반적으로 skip-gram이 CBOW보다 우수한 것 같다.
skip-gram에서는 'half', 'tariffs'처럼 'free'와 함께 쓰이거나 의미가 비슷한 단어들이 'free'와 가까운 임베딩을 갖기 때문이다.
