## 데이터 : 스팸 분류 영어 데이터
* 토크나이저 및 임베딩 모델 선택 과정, 인사이트 해석을 주석으로 달아주세요


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!pip install gensim --upgrade



In [None]:
# Show which gensim release is active after the upgrade above; the Word2Vec
# call later in this notebook uses the gensim 4.x keyword `vector_size`.
import gensim
gensim.__version__

'4.3.2'

###  Step1. 데이터 확인

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (where the dataset CSV lives) can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the spam dataset on the mounted Google Drive.
path = '/content/drive/MyDrive/ToBigs/07. NLP basic/과제/spam.csv'

# Load the CSV and preview the first rows: `v1` holds the ham/spam tag and
# `v2` holds the raw message text.
df = pd.read_csv(path)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Step2. Tokenizing (불용어 처리, 특수 문자 제거 등의 전처리 포함)

In [None]:
# Binary target column: 1 where the tag equals 'spam', 0 for every other value.
df['label'] = (df['v1'] == 'spam').astype('int64')
df.head()

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Look at the raw text of the row whose index label is 3.
df.loc[3, 'v2']

'U dun say so early hor... U c already then say...'

In [None]:
# Data-quality checks before cleaning. A notebook cell only displays its last
# bare expression, so these two results are computed but not shown; the
# trailing comments record the values the author observed.
df.isnull().sum()  # 0 missing values
df.duplicated(keep='first').sum()  # 403 duplicated rows
print("before removal of duplicates", df.shape)
# drop_duplicates() keeps the first occurrence of each repeated row by default.
df = df.drop_duplicates()
print("after removal of duplicates", df.shape)

before removal of duplicates (5572, 3)
after removal of duplicates (5169, 3)


In [None]:
stop_words = stopwords.words('english')
print(stop_words[:10]) # checking stop_words

# Frozen set copy of the stop-word list: membership tests are O(1) instead of
# a linear scan of the list for every token of every message.
STOP_WORD_SET = frozenset(stop_words)

# Matches any single character that is not an ASCII letter; compiled once
# because it is applied to every message.
NON_ALPHA_RE = re.compile(r"[^a-zA-Z]")

porter = PorterStemmer() # PorterStemmer


def preprocessing(words):
    """Clean one raw message and return it as a space-joined string of stems.

    Steps, in order:
      1. Replace every non-letter character (digits, punctuation, symbols)
         with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop tokens that appear in the NLTK English stop-word list.
      4. Reduce each remaining token to its Porter stem.

    Args:
        words: Raw message text (a single string).

    Returns:
        The cleaned message as one string with tokens separated by single
        spaces, e.g. 'U dun say so early hor...' -> 'u dun say earli hor'.
    """
    letters_only = NON_ALPHA_RE.sub(" ", words)
    tokens = letters_only.lower().split()
    # Stop words are filtered before stemming, so the comparison is made on
    # the original (lower-cased) surface form, not on the stem.
    return " ".join(
        porter.stem(token) for token in tokens if token not in STOP_WORD_SET
    )

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [None]:
# Clean every raw message and keep the result in its own column.
df['preprocessed'] = df['v2'].map(preprocessing)
# Spot-check: raw text next to its cleaned form for the row labelled 3.
df.loc[3, 'v2'], df.loc[3, 'preprocessed']

('U dun say so early hor... U c already then say...',
 'u dun say earli hor u c alreadi say')

In [None]:
# The raw tag and raw text are no longer needed; keep only `label` and
# `preprocessed`.
df = df.drop(columns=['v2', 'v1'])

In [None]:
# Preview the frame after the raw columns were dropped.
df.head()

Unnamed: 0,label,preprocessed
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


### Step3. 임베딩 (One-hot encoding, Word2Vec, CBOW, Skip-gram, GloVe 등)

'비슷한 위치에서 등장하는 단어들은 비슷한 의미를 가진다'는 분포가설을 기반으로 한 Word2Vec 을 이용하여 임베딩을 구한다.

In [None]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

In [None]:
# Word2Vec expects each document as a list of tokens.
# simple_preprocess lower-cases and tokenizes the text; deacc=True strips
# accent marks (it is not what removes punctuation, which was already removed
# during preprocessing).
# NOTE(review): with its default length limits simple_preprocess also discards
# very short tokens, so single-letter SMS words such as 'u' and 'c' do not
# reach the model.
df['tokenized'] = df['preprocessed'].map(
    lambda text: simple_preprocess(text, deacc=True)
)
df.head()

Unnamed: 0,label,preprocessed,tokenized
0,0,go jurong point crazi avail bugi n great world...,"[go, jurong, point, crazi, avail, bugi, great,..."
1,0,ok lar joke wif u oni,"[ok, lar, joke, wif, oni]"
2,1,free entri wkli comp win fa cup final tkt st m...,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,0,u dun say earli hor u c alreadi say,"[dun, say, earli, hor, alreadi, say]"
4,0,nah think goe usf live around though,"[nah, think, goe, usf, live, around, though]"


In [None]:
sentences = df['tokenized'].tolist() # listing dataframe sentences

# model training
# Skip-gram (sg=1), 100-dimensional vectors, context window of 5, and
# min_count=1 so that every token keeps a vector.
# seed=42 matches the random_state=42 used by the scikit-learn cells of this
# notebook. A single worker thread is used because multi-threaded training is
# not deterministic even with a fixed seed.
# NOTE(review): reproducibility across interpreter restarts may additionally
# require a fixed PYTHONHASHSEED — confirm against the gensim documentation.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

더 많은 정보를 바탕으로 특정 단어를 예측하기 때문에 CBoW의 성능이 좋을 것으로 생각하기 쉽지만, 역전파 관점에서 보면 Skip-Gram에서 훨씬 더 많은 학습이 일어나기 때문에 Skip-Gram(코드상 `sg=1`)의 성능이 조금 더 좋다고 알려져 있어, Skip-Gram으로 모델을 학습하였다.

In [None]:
# Look up the learned embeddings of two tokens expected to be close in
# meaning, then print both for a visual comparison.
nah_vector, no_vector = model.wv['nah'], model.wv['no']
print("vectors for nah \n", nah_vector)
print("vectors for no \n", no_vector)

vectors for nah 
 [-0.09947477  0.09859527  0.08985845  0.01064183  0.07177369 -0.22011116
  0.02392751  0.2799557  -0.11294761 -0.08741838 -0.06204501 -0.23254025
 -0.07161842  0.03911138  0.04938085 -0.14987883 -0.01233083 -0.1749106
  0.01150913 -0.23898701  0.0875976   0.02254453  0.09191407 -0.11979992
 -0.07772668 -0.0267251  -0.07373431 -0.08549157 -0.03593607  0.02710004
  0.1189023   0.04454613  0.01955941 -0.03215    -0.06178718  0.16319498
  0.04152111 -0.07949487 -0.08888374 -0.28164506 -0.03542937 -0.10745866
 -0.07170666  0.04141929  0.11180147 -0.06177339 -0.04054942 -0.05983869
  0.06373281  0.13655037  0.05405421 -0.10687549 -0.03985167 -0.02230283
 -0.07795809  0.09416465  0.12006368 -0.02740352 -0.12703475  0.00452144
  0.05434053  0.05272863 -0.00258098 -0.03719848 -0.16159672  0.09904262
  0.09038809  0.15880379 -0.19628821  0.16467898 -0.07235643  0.04060009
  0.16864474 -0.05723217  0.09343157  0.05227699 -0.02033059 -0.0445514
 -0.17253163  0.08011387 -0.1018328

In [None]:
# Five nearest neighbours of 'no' in the embedding space, returned as
# (token, similarity) pairs.
similar_words = model.wv.most_similar('no', topn=5)
print(similar_words)

[('rose', 0.9939366579055786), ('card', 0.993934690952301), ('bother', 0.9937601089477539), ('invit', 0.9937533736228943), ('locat', 0.9937261343002319)]


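'no'와 비슷한 단어 5개를 출력했을 때 'nah'가 나오지 않았다. 모델 학습에 문제가 있는지 확인하기 위해 이번에는 'rose'로 다시 진행한다. (참고: 'no'는 NLTK 영어 불용어 목록에 포함되어 전처리 단계에서 제거되므로, 어휘에 남아 있는 'no'는 어간 추출 과정에서 만들어진 저빈도 토큰일 가능성이 있다.)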

In [None]:
# Same query for 'rose' (the top neighbour of 'no') as a sanity check on the
# embeddings; results are (token, similarity) pairs.
similar_words = model.wv.most_similar('rose', topn=5)
print(similar_words)

[('complet', 0.9983358979225159), ('saturday', 0.9982338547706604), ('card', 0.9982331991195679), ('crazi', 0.9982302188873291), ('eh', 0.9982057213783264)]


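데이터에 오타가 많고 전처리(불용어 제거, 어간 추출)의 영향도 있어 모델 성능이 좋지 않은 것으로 예상한다. 실제로 상위 유사 단어들의 유사도가 모두 0.99 이상으로 거의 같아, 임베딩이 단어 간 의미 차이를 충분히 구분하지 못하고 있음을 알 수 있다.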

### Step4. 유의미한 해석 도출 (유사도, Wordcloud, 이진 분류 모델, 그래프 해석 등)

label이 있기에, 이진 분류 모델로 spam 분류를 목표로 설정한다.

In [None]:
# List the columns currently available for the classification step.
df.columns

Index(['label', 'preprocessed', 'tokenized'], dtype='object')

In [None]:
import numpy as np

def sentence_vectorizer(sentence, model):
    """Represent a tokenized sentence as the mean of its word vectors.

    Args:
        sentence: Iterable of token strings.
        model: Trained embedding model exposing ``wv`` (supporting
            ``key_to_index``, ``vectors`` and indexing by a list of tokens)
            and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all in-vocabulary tokens, or an all-zero vector when
        none of the tokens is in the vocabulary.
    """
    # Only tokens present in the model vocabulary can be looked up.
    known_words = [word for word in sentence if word in model.wv.key_to_index]
    if not known_words:
        # Use the embedding dtype for the fallback so that every sentence
        # vector has the same dtype as the averaged ones.
        return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(model.wv[known_words], axis=0)

# One fixed-length feature vector per message: the mean of its token vectors.
df['sentence_vector'] = df['tokenized'].map(
    lambda tokens: sentence_vectorizer(tokens, model)
)
df.head()

Unnamed: 0,label,preprocessed,tokenized,sentence_vector
0,0,go jurong point crazi avail bugi n great world...,"[go, jurong, point, crazi, avail, bugi, great,...","[-0.12832457, 0.122255765, 0.10579072, 0.01364..."
1,0,ok lar joke wif u oni,"[ok, lar, joke, wif, oni]","[-0.13673833, 0.13181812, 0.11824419, 0.015101..."
2,1,free entri wkli comp win fa cup final tkt st m...,"[free, entri, wkli, comp, win, fa, cup, final,...","[-0.10452374, 0.08014709, 0.10801754, 0.042509..."
3,0,u dun say earli hor u c alreadi say,"[dun, say, earli, hor, alreadi, say]","[-0.17376195, 0.1699798, 0.12331865, -0.007251..."
4,0,nah think goe usf live around though,"[nah, think, goe, usf, live, around, though]","[-0.15059403, 0.16297822, 0.12765633, -6.94688..."


In [None]:
# Stack the per-message vectors into a 2-D feature matrix (one row per
# message) and take the 0/1 spam label as the target.
X = np.array(df['sentence_vector'].tolist())
y = df['label'].values

from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed. stratify=y keeps the spam/ham ratio the same
# in both parts, so the test score is not distorted by an unlucky draw of the
# rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Accuracy in percent on the training and the held-out data.
score_train = accuracy_score(y_train, y_pred_train)*100
score_test = accuracy_score(y_test, y_pred_test)*100

print(score_train, score_test)

# Accuracy alone can look high when one class dominates, so also report the
# threshold-independent ROC-AUC on the test split. Column 1 of predict_proba
# is the probability of label 1 (spam).
# NOTE(review): the labels are presumably ham-heavy — confirm with
# df['label'].mean().
auc_test = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
print("test ROC-AUC", auc_test)

100.0 96.80851063829788


### Step5. Week7_NLPBasic_Assignment.ipynb 제출