# Spam Mail 분류

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Colab-only cell: open the browser upload dialog and remember the name of the
# first uploaded file so that it can be read in the following cell.
from google.colab import files
uploaded = files.upload()
# Iterating the returned mapping yields its keys (the uploaded file names).
filename = list(uploaded)[0]

Saving spam.csv to spam.csv


In [3]:
# Read the uploaded CSV, decoding it as latin1, then preview the first rows.
spam = pd.read_csv(
    filename,
    encoding='latin1',
)
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## 데이터 전처리

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Keep only the class column (v1) and the message text column (v2).
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below never write into a slice of `spam` (which is what triggers
# pandas' SettingWithCopyWarning).
df = spam[['v1', 'v2']].copy()
# Binary target: 1 when v1 is 'spam', 0 otherwise (vectorized comparison).
df['label'] = (df['v1'] == 'spam').astype('int64')
df.head()

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Summarize the working frame: column dtypes, non-null counts, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
 2   label   5572 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


## 1. 텍스트 전처리

In [7]:
# Check for duplicates: number of distinct message texts in v2.
# A value smaller than the row count means some messages are repeated.
df['v2'].nunique()

5169

In [8]:
# Remove duplicated messages, keeping the first occurrence of each v2 text.
# The result is rebound to `df` instead of using inplace=True: this avoids
# mutating a frame that may have been derived from a slice and keeps the step
# chainable. Re-running the cell yields the same frame.
df = df.drop_duplicates(subset='v2', keep='first')

In [9]:
# Remove punctuation: drop every character that is not an ASCII letter or a
# space (this also removes digits and other symbols).
# regex=True must be passed explicitly: without it, recent pandas versions treat
# the pattern as a literal string and nothing would be removed.
df['content'] = df['v2'].str.replace('[^A-Za-z ]', '', regex=True)
df.head()

Unnamed: 0,v1,v2,label,content
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...


In [None]:
# Lower-case conversion -- not needed when the text is converted with a
# Vectorizer afterwards.
# Uses the vectorized .str accessor instead of apply/lambda; missing values
# pass through as NaN rather than raising AttributeError.
df['content'] = df['content'].str.lower()

In [10]:
# Ham / spam distribution: number of messages per class in v1.
df['v1'].value_counts()

ham     4516
spam     653
Name: v1, dtype: int64

## 2. 학습 및 테스트 데이터 셋으로 분리

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    test_size=0.2,
    random_state=2021,
    stratify=df['label'],
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135,), (1034,), (4135,), (1034,))

## 3. DTM 변환

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Count features with English stop words removed.
# The vectorizer is fitted on the training text only; the test text is encoded
# with that same fitted vectorizer.
cvect1 = CountVectorizer(stop_words='english')
X_train_cv1 = cvect1.fit_transform(X_train)
X_test_cv1 = cvect1.transform(X_test)
X_train_cv1.shape, X_test_cv1.shape

((4135, 7147), (1034, 7147))

In [14]:
# Count features with English stop words removed and ngram_range=(1, 2).
# The vectorizer is fitted on the training text only; the test text is encoded
# with that same fitted vectorizer.
cvect2 = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_cv2 = cvect2.fit_transform(X_train)
X_test_cv2 = cvect2.transform(X_test)
X_train_cv2.shape, X_test_cv2.shape

((4135, 29546), (1034, 29546))

In [15]:
# TF-IDF features with English stop words removed.
# The vectorizer is fitted on the training text only; the test text is encoded
# with that same fitted vectorizer.
tvect1 = TfidfVectorizer(stop_words='english')
X_train_tv1 = tvect1.fit_transform(X_train)
X_test_tv1 = tvect1.transform(X_test)
X_train_tv1.shape, X_test_tv1.shape

((4135, 7147), (1034, 7147))

In [16]:
# TF-IDF features with English stop words removed and ngram_range=(1, 2).
# The vectorizer is fitted on the training text only; the test text is encoded
# with that same fitted vectorizer.
tvect2 = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tv2 = tvect2.fit_transform(X_train)
X_test_tv2 = tvect2.transform(X_test)
X_train_tv2.shape, X_test_tv2.shape

((4135, 29546), (1034, 29546))

## 4. 로지스틱 회귀 모델로 학습

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic regression with default settings, trained on the X_train_cv1 features.
lr1 = LogisticRegression().fit(X_train_cv1, y_train)
# Show the fitted estimator as the cell output.
lr1

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Logistic regression with default settings, trained on the X_train_cv2 features.
lr2 = LogisticRegression().fit(X_train_cv2, y_train)
# Show the fitted estimator as the cell output.
lr2

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Logistic regression with default settings, trained on the X_train_tv1 features.
lr3 = LogisticRegression().fit(X_train_tv1, y_train)
# Show the fitted estimator as the cell output.
lr3

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Logistic regression with default settings, trained on the X_train_tv2 features.
lr4 = LogisticRegression().fit(X_train_tv2, y_train)
# Show the fitted estimator as the cell output.
lr4

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## 5. 예측/평가

In [22]:
# Predicted labels of lr1 for the held-out X_test_cv1 features.
pred1 = lr1.predict(X=X_test_cv1)

In [23]:
# Predicted labels of lr2 for the held-out X_test_cv2 features.
pred2 = lr2.predict(X=X_test_cv2)

In [24]:
# Predicted labels of lr3 for the held-out X_test_tv1 features.
pred3 = lr3.predict(X=X_test_tv1)

In [25]:
# Predicted labels of lr4 for the held-out X_test_tv2 features.
pred4 = lr4.predict(X=X_test_tv2)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
# Test-set accuracy of lr1 (features from cvect1).
accuracy_score(y_true=y_test, y_pred=pred1)

0.9777562862669246

In [28]:
# Test-set accuracy of lr2 (features from cvect2).
accuracy_score(y_true=y_test, y_pred=pred2)

0.9777562862669246

In [29]:
# Test-set accuracy of lr3 (features from tvect1).
accuracy_score(y_true=y_test, y_pred=pred3)

0.9593810444874274

In [30]:
# Test-set accuracy of lr4 (features from tvect2).
accuracy_score(y_true=y_test, y_pred=pred4)

0.9545454545454546