### SMS SPAM 분류 

In [1]:
import pandas as pd
# Location of the raw SMS spam CSV (adjacent literals are joined at compile time into one URL).
url = (
    'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/'
    '10.%20RNN%20Text%20Classification/dataset/spam.csv'
)

In [2]:
# Read the raw CSV from `url`, decoding it as Latin-1, and preview the first three rows.
df = pd.read_csv(
    url,
    encoding='latin1',
)
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [3]:
# Selection: keep only the label column (v1) and the message text column (v2);
# the remaining "Unnamed" columns are dropped.
df = df.loc[:, ['v1', 'v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Check for missing values: total number of null cells across the whole frame
df.isnull().sum().sum()

0

In [5]:
# Check for duplicated data: frame shape next to the number of distinct message texts
(df.shape, df['v2'].nunique())

((5572, 2), 5169)

In [6]:
# Drop rows whose message text (v2) repeats an earlier row.
# The result is reassigned rather than using inplace=True, so the cell produces a
# new frame instead of mutating the existing object in place.
df = df.drop_duplicates(subset=['v2'])
df.shape

(5169, 2)

In [7]:
# ['ham','spam'] ---> [0,1]
# Use an exact-match mapping instead of a regex replace, and cast explicitly so the
# label column is int64 regardless of how pandas infers dtypes after replacement.
# Any unexpected label becomes NaN and makes astype() fail loudly.
# The dtype guard keeps the cell safe to re-run once the labels are already encoded.
if not pd.api.types.is_integer_dtype(df['v1']):
    df['v1'] = df['v1'].map({'ham': 0, 'spam': 1}).astype('int64')
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Class balance: number of rows per encoded label
df['v1'].value_counts()

v1
0    4516
1     653
Name: count, dtype: int64

In [9]:
# x = df.v2.values
# y = df.v1.values

- 텍스트 전처리

In [10]:
# Remove punctuation and digits: every character outside A-Z / a-z is replaced by a space.
# NOTE(review): duplicates were dropped before this normalization, so messages that differ
# only in digits or punctuation remain as separate rows — confirm that is acceptable.
df['v2'] = df['v2'].str.replace('[^A-Za-z]', ' ', regex=True)

In [11]:
# Show the first cleaned message by position, so the preview does not depend on
# row label 0 having survived the earlier duplicate removal.
df.v2.iloc[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

- 데이터셋 분리

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the split is stratified on the label
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    test_size=0.2,
    stratify=df['v1'].values,
    random_state=2023,
)

- Pipeline으로 최적 하이퍼파라미터 찾기   
    - CVECT(CountVectorizer), RFC(RandomForestClassifier)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [14]:
# Search grid keyed by "<pipeline step>__<parameter>":
# n-gram range for the CVECT step and tree depth for the RFC step.
params = dict(
    CVECT__ngram_range=[(1, 1), (1, 2)],
    RFC__max_depth=[2, 5, 8],
)

In [15]:
cvect = CountVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT',cvect),('RFC',rfc)])
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)

CPU times: total: 7.3 s
Wall time: 7.47 s


In [16]:
# Display the parameter combination that scored best in the cross-validated search above.
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'RFC__max_depth': 8}

In [17]:
params = {
    'RFC__max_depth':[7,8,10,12]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)

CPU times: total: 5.62 s
Wall time: 5.76 s


In [18]:
# Display the best max_depth found by the most recent grid search.
grid_pipe.best_params_

{'RFC__max_depth': 12}

In [19]:
params = {
    'RFC__max_depth':[12,15,18]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)
grid_pipe.best_params_

CPU times: total: 6.08 s
Wall time: 6.29 s


{'RFC__max_depth': 18}

In [20]:
params = {
    'RFC__max_depth':[16,18,20]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)
grid_pipe.best_params_

CPU times: total: 7.09 s
Wall time: 7.84 s


{'RFC__max_depth': 20}

In [21]:
# Accuracy of the refit best pipeline on the held-out split.
# NOTE(review): tuning continues in later cells after this test-set check; repeated
# checks of this kind make the final test score optimistic.
grid_pipe.score(X_test, y_test)

0.9352030947775629

In [22]:
params = {
    'CVECT__ngram_range':[(1,1),(1,2)],
    'RFC__max_depth':[2,10,30]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)
grid_pipe.best_params_

CPU times: total: 13.4 s
Wall time: 14 s


{'CVECT__ngram_range': (1, 1), 'RFC__max_depth': 30}

In [23]:
params = {
    'RFC__max_depth':[30,50,80]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)
grid_pipe.best_params_

CPU times: total: 15.3 s
Wall time: 16.1 s


{'RFC__max_depth': 80}

In [24]:
# Accuracy of the refit best pipeline on the held-out split.
# NOTE(review): the same test split was already scored earlier in this notebook and
# tuning continues afterwards, so treat this figure as optimistic.
grid_pipe.score(X_test, y_test)


0.9632495164410058

In [25]:
params = {
    'RFC__max_depth':[120,150,180]
}
grid_pipe = GridSearchCV(pipeline,params,scoring='accuracy',cv=3)
%time grid_pipe.fit(X_train,y_train)
grid_pipe.best_params_

CPU times: total: 21.9 s
Wall time: 22.6 s


{'RFC__max_depth': 120}

In [26]:
# Accuracy of the refit best pipeline on the held-out split.
# NOTE(review): this split has been scored after several tuning rounds in this notebook,
# so the figure is not an unbiased estimate.
grid_pipe.score(X_test, y_test)

0.9680851063829787

In [27]:
from sklearn.linear_model import LogisticRegression

# Seeded logistic-regression classifier with a raised iteration cap.
# NOTE(review): `lrc` is not used by any later cell visible here (`lrc2` is the one fitted).
lrc = LogisticRegression(max_iter=500, random_state=2023)

In [28]:
# Unigram + bigram counts; the vocabulary is learned from the training split only
# and then applied to both splits.
cvect2 = CountVectorizer(ngram_range=(1, 2), stop_words='english')
cvect2.fit(X_train)
X_train_cv2, X_test_cv2 = (cvect2.transform(split) for split in (X_train, X_test))
(X_train_cv2.shape, X_test_cv2.shape)

((4135, 28822), (1034, 28822))

In [29]:
lrc2 = LogisticRegression(random_state=2023, max_iter=500)
%time lrc2.fit(X_train_cv2,y_train)

CPU times: total: 234 ms
Wall time: 235 ms


In [30]:
# Score the fitted logistic regression on the vectorized held-out split.
lrc2.score(X=X_test_cv2, y=y_test)

0.9680851063829787