In [10]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from keras_tuner import RandomSearch
from keras_tuner import HyperParameters


In [11]:
df = pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Подготовка данных
df['Category'] = df["Category"].map({"ham": 0, "spam": 1})

In [13]:
X = df['Message']
y = df['Category']

X_train, X_temp, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_test, test_size=0.5, random_state=13)

In [14]:
max_words = 5000

vectorizer = TfidfVectorizer(max_features=max_words)
X_train = vectorizer.fit_transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [15]:
def build_model(hp: HyperParameters):
    model = Sequential()
    model.add(layers.Input(shape=(max_words,)))

    model.add(layers.Dense(hp.Int("dense-1", 64, 256, 32), activation="relu"))
    model.add(layers.Dropout(hp.Float("dropout-1", 0.1, 0.5, 0.1)))
    model.add(layers.Dense(hp.Int("dense-2", 32, 128, 32), activation="relu"))
    model.add(layers.Dropout(hp.Float("dropout-2", 0.0, 0.3, 0.1)))

    model.add(layers.Dense(1, activation="sigmoid"))

    model.compile(
        optimizer=hp.Choice("optimizer", values=["adam", "rmsprop"]),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [16]:
tuner_search = RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=5,
    directory="output",
    project_name="spam_classification",
)

tuner_search.search(X_train, y_train, epochs=20, validation_data=(X_val, y_val))

best_model = tuner_search.get_best_models(num_models=1)[0]
best_model.summary()

Reloading Tuner from output\spam_classification\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


In [17]:
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9761 - loss: 0.0826  
Test Accuracy: 97.61%


In [19]:
with open('spam_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
best_model.save("spam_model.keras")