In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# 데이터 로드
# 데이터 형태는 Message 내용인 message Column과 Spam인지 아닌지를 나타내는 label Column으로 구성되어 있음
# Spam 문자는 spam, 일반 문자는 ham으로 구분
data = pd.read_csv('/home/moonguigon/work/sms_spam/sms_spam.csv')
X = data['message']
y = data['label']

In [17]:
# 데이터 전처리 및 벡터화
# tf-idf 결합을 통해 단어의 가중치를 나타냄(TfidfVectorizer())
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

In [18]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

In [19]:
# 모델 학습
model = MultinomialNB()
model.fit(X_train, y_train)

In [20]:
# 모델 평가
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.75
              precision    recall  f1-score   support

         ham       0.00      0.00      0.00         3
        spam       0.75      1.00      0.86         9

    accuracy                           0.75        12
   macro avg       0.38      0.50      0.43        12
weighted avg       0.56      0.75      0.64        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
def predict_spam(message):
    # 입력된 메시지를 TF-IDF 벡터로 변환
    message_tfidf = vectorizer.transform([message])
    # 예측
    prediction = model.predict(message_tfidf)
    # 결과 반환
    return "spam" if prediction[0] == 1 else "ham"

In [27]:
print(predict_spam("[Web발신]나는 Spam 문자입니다"))

ham
