In [1]:
from google.colab import drive
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [2]:
import numpy as np

file_path = "SMSSpamCollection.dat 파일 위치"

x_data, y_data = [], []
with open(file_path,'r',encoding='utf8') as inFile:
  lines = inFile.readlines()

lines = lines[:100]

for line in lines:
  line = line.strip().split('\t')
  sentence, label = line[1], line[0]
  x_data.append(sentence)
  y_data.append(label)

print("x_data의 개수 : " + str(len(x_data)))
print("y_data의 개수 : " + str(len(y_data)))

x_data의 개수 : 100
y_data의 개수 : 100


In [3]:
#불용어 제거

import nltk
nltk.download('stopwords')

print('영어 불용어 갯수:', len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:10])

영어 불용어 갯수: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def remove_stopwords(sentence):
  #nltk의 불용어 리스트
  stop_words = set(stopwords.words('english'))

  #문장을 단어로 토큰화
  words = word_tokenize(sentence)

  #불용어를 제거한 새로운 문장 생성
  filtered_sentence = [word for word in words if word.lower() not in stop_words]

  #문장으로 결합
  new_sentence = ' '.join(filtered_sentence)

  return new_sentence

for x in x_data:
  x = remove_stopwords(x)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

label2index_dict = {'spam':0, 'ham':1}

indexing_x_data, indexing_y_data = [], []

for label in y_data:
  indexing_y_data.append(label2index_dict[label])

tokenizer.fit_on_texts(x_data)

indexing_x_data = tokenizer.texts_to_sequences(x_data)

print("x_data indexing 하기 전 : " + str(x_data[0]))
print("x_data indexing 하기 후 : " + str(indexing_x_data[0]))
print("y_data indexing 하기 전 : " + str(y_data[0]))
print("y_data indexing 하기 후 : " + str(indexing_y_data[0]))

x_data indexing 하기 전 : Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
x_data indexing 하기 후 : [38, 93, 239, 240, 241, 242, 53, 11, 243, 72, 94, 244, 245, 126, 246, 247, 73, 74, 248, 127]
y_data indexing 하기 전 : ham
y_data indexing 하기 후 : 1


In [6]:
from sklearn.svm import SVC

max_length = 60
for index in range(len(indexing_x_data)):
  length = len(indexing_x_data[index])

  if(length > max_length):
    indexing_x_data[index] = indexing_x_data[index][:max_length]
  elif(length < max_length):
    indexing_x_data[index] = indexing_x_data[index] + [0]*(max_length-length)


number_of_train = int(len(indexing_x_data)*0.9)

train_x = indexing_x_data[:number_of_train]
train_y = indexing_y_data[:number_of_train]
test_x = indexing_x_data[number_of_train:]
test_y = indexing_y_data[number_of_train:]

print("train_x의 개수 : " + str(len(train_x)))
print("train_y의 개수 : " + str(len(train_y)))
print("test_x의 개수 : " + str(len(test_x)))
print("test_y의 개수 : " + str(len(test_y)))

svm = SVC(kernel='linear', C=1e10)
svm.fit(train_x, train_y)

train_x의 개수 : 90
train_y의 개수 : 90
test_x의 개수 : 10
test_y의 개수 : 10


In [7]:
predict = svm.predict(test_x)

correct_count = 0
for index in range(len(predict)):
  if(test_y[index] == predict[index]):
    correct_count += 1

accuracy = 100.0*correct_count/len(test_y)


print("Accuracy: " + str(accuracy))

index2label = {0:"spam", 1:"ham"}

test_x_word = tokenizer.sequences_to_texts(test_x)

for index in range(len(test_x_word)):
  print()
  print("문장 : ", test_x_word[index])
  print("정답 : ", index2label[test_y[index]])
  print("모델 출력 : ", index2label[predict[index]])

Accuracy: 80.0

문장 :  yeah do don‘t stand to close tho you‘ll catch something
정답 :  ham
모델 출력 :  spam

문장 :  sorry to be a pain is it ok if we meet another night i spent late afternoon in casualty and that means i haven't done any of y stuff42moro and that includes all my time sheets and that sorry
정답 :  ham
모델 출력 :  spam

문장 :  smile in pleasure smile in pain smile when trouble pours like rain smile when sum1 hurts u smile becoz someone still loves to see u smiling
정답 :  ham
모델 출력 :  ham

문장 :  please call our customer service representative on 0800 169 6031 between 10am 9pm as you have won a guaranteed ￡1000 cash or ￡5000 prize
정답 :  spam
모델 출력 :  spam

문장 :  havent planning to buy later i check already lido only got 530 show in e afternoon u finish work already
정답 :  ham
모델 출력 :  ham

문장 :  your free ringtone is waiting to be collected simply text the password mix to 85069 to verify get usher and britney fml po box 5249 mk17 92h 450ppw 16
정답 :  spam
모델 출력 :  spam

문장 :  watching tel

In [14]:
#TF-IDF 방법을 이용하여 성능 개선

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

# TF-IDF 벡터라이저 초기화
vectorizer = TfidfVectorizer()

# x_data에 대해 fit_transform 수행
tfidf_x_data = vectorizer.fit_transform(x_data)

#데이터 분할 (90% 학습, 10% 테스트)
train_x, test_x, train_y, test_y = train_test_split(tfidf_x_data,indexing_y_data, test_size=0.1, random_state=42)

print("train_x의 개수 : " + str(csr_matrix(train_x).getnnz()))
print("train_y의 개수 : " + str(len(train_y)))
print("test_x의 개수 : " + str(csr_matrix(test_x).getnnz()))
print("test_y의 개수 : " + str(len(test_y)))

# SVM 모델 학습
svm = SVC(kernel='linear', C=1e10)
svm.fit(train_x.toarray(), train_y)

train_x의 개수 : 1265
train_y의 개수 : 90
test_x의 개수 : 142
test_y의 개수 : 10


In [17]:
predict = svm.predict(test_x.toarray())

accuracy = 100.0 * sum(test_y == predict) / len(test_y)

print("Accuracy: " + str(accuracy))

for index in range(len(test_x_word)):
  print()
  print("문장 : ", test_x_word[index])
  print("정답 : ", index2label[test_y[index]])
  print("모델 출력 : ", index2label[predict[index]])

Accuracy: 100.0

문장 :  yeah do don‘t stand to close tho you‘ll catch something
정답 :  ham
모델 출력 :  ham

문장 :  sorry to be a pain is it ok if we meet another night i spent late afternoon in casualty and that means i haven't done any of y stuff42moro and that includes all my time sheets and that sorry
정답 :  ham
모델 출력 :  ham

문장 :  smile in pleasure smile in pain smile when trouble pours like rain smile when sum1 hurts u smile becoz someone still loves to see u smiling
정답 :  ham
모델 출력 :  ham

문장 :  please call our customer service representative on 0800 169 6031 between 10am 9pm as you have won a guaranteed ￡1000 cash or ￡5000 prize
정답 :  ham
모델 출력 :  ham

문장 :  havent planning to buy later i check already lido only got 530 show in e afternoon u finish work already
정답 :  ham
모델 출력 :  ham

문장 :  your free ringtone is waiting to be collected simply text the password mix to 85069 to verify get usher and britney fml po box 5249 mk17 92h 450ppw 16
정답 :  ham
모델 출력 :  ham

문장 :  watching telugu m