In [1]:
import random

import numpy as np
import pandas as pd
from keras import Model
from keras.src.layers import Bidirectional
from keras.src.layers import Embedding, LSTM, Dense
from keras import Sequential
from keras.src.utils import pad_sequences
from keras.src.legacy.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from typing import Tuple
from pathlib import Path

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

BODY_COL_NAME = 'body'
LABEL_COL_NAME = 'label'

In [2]:
def read_csv(path: str, body_column_name=BODY_COL_NAME, label_column_name=LABEL_COL_NAME) -> pd.DataFrame:
    df = pd.read_csv(path, quotechar='"', delimiter=',', usecols=[body_column_name, label_column_name])
    df = df.reindex(columns=[body_column_name, label_column_name])
    df.columns = [BODY_COL_NAME, LABEL_COL_NAME]
    return df

In [3]:
def clean_text(text):
    text = text.lower()

    for symbol in ['/', '-', '=', '+', 'fw:', 're:', '.']:
        text = text.replace(symbol, '')

    text = text.replace('  ', ' ')
    return text.strip()


In [4]:
def create_model() -> Model:
    max_words = 10000
    model = Sequential()
    model.add(Embedding(input_dim=max_words + 1, output_dim=128))
    model.add(Bidirectional(LSTM(units=64)))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model



In [5]:
def train_model(
        model: Model,
        X_train_pad: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test_pad: pd.DataFrame,
        y_test: pd.DataFrame,
        epochs=5,
):
    history = model.fit(
        X_train_pad,
        y_train,
        epochs=epochs,
        batch_size=32,
        validation_data=(X_test_pad, y_test)
    )

    print(history)

In [6]:
def validate_model(model: Model, X_test: pd.DataFrame, y_test: pd.DataFrame):
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {accuracy:.2f}')


In [7]:
def save_model(model: Model, name: str):
    model.save(f'{name}.h5')
    print("Модель сохранена!")

In [8]:
def load_model(name: str):
    from keras.src.saving import load_model

    loaded_model = load_model(f'{name}.h5')
    print("Модель загружена!")
    loaded_model.summary()  # Вывод информации о загруженной модели.

In [9]:
def train_test_prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    df[BODY_COL_NAME] = df[BODY_COL_NAME].apply(clean_text)
    df = df.loc[df[BODY_COL_NAME].str.len() > 0]
    X = df[BODY_COL_NAME]
    y = df[LABEL_COL_NAME].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

    # Параметры токенизации.
    max_words = 10000  # Максимальное количество слов для токенации.
    max_length = 10  # Максимальная длина последовательности.

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

    return X_train_pad, X_test_pad, y_train, y_test


In [11]:
data = [
    ('data/original/spam.csv', 'text', LABEL_COL_NAME),
    ('data/original/fishing.csv', BODY_COL_NAME, LABEL_COL_NAME),
    ('data/original/fraud.csv', 'Body', 'Label'),
]

for file_path, body_col_name, label_col_name in data:
    df = read_csv(file_path, body_column_name=body_col_name, label_column_name=label_col_name)
    print(file_path)
    print(df)
    X_train_pad, X_test_pad, y_train, y_test = train_test_prepare_data(df)
    model = create_model()
    train_model(model, X_train_pad, y_train, X_test_pad, y_test, epochs=3)
    validate_model(model, X_test_pad, y_test)
    model_name = Path(file_path).name.replace('.csv', '')
    save_model(model, f'models/{model_name}')



data/original/spam.csv
                                                    body  label
0      ounce feather bowl hummingbird opec moment ala...      1
1      wulvob get your medircations online qnb ikud v...      1
2       computer connection from cnn com wednesday es...      0
3      university degree obtain a prosperous future m...      1
4      thanks for all your answers guys i know i shou...      0
...                                                  ...    ...
83443  hi given a date how do i get the last date of ...      0
83444  now you can order software on cd or download i...      1
83445  dear valued member canadianpharmacy provides a...      1
83446  subscribe change profile contact us long term ...      0
83447  get the most out of life ! viagra has helped m...      1

[83448 rows x 2 columns]


None
Epoch 1/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - accuracy: 0.9030 - loss: 0.2270 - val_accuracy: 0.9484 - val_loss: 0.1340
Epoch 2/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9691 - loss: 0.0841 - val_accuracy: 0.9526 - val_loss: 0.1366
Epoch 3/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9805 - loss: 0.0519 - val_accuracy: 0.9488 - val_loss: 0.1468
<keras.src.callbacks.history.History object at 0x000001355E0D22E0>
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9473 - loss: 0.1468




Test Accuracy: 0.95
Модель сохранена!
data/original/fishing.csv
                                                    body  label
0      ( see attached file : hplno 525 . xls )\r\n- h...      0
1      - - - - - - - - - - - - - - - - - - - - - - fo...      0
2      estimated actuals\r\nmarch 30 , 2001\r\nno flo...      0
3      ( see attached file : hplno 530 . xls )\r\n- h...      0
4      ( see attached file : hplno 601 . xls )\r\n- h...      0
...                                                  ...    ...
29762  hello ,\r\nmy boyfriend began having problems ...      1
29763  love - potion for your darling is all you want...      1
29764  you have feelings of guilt and embarrassment  ...      1
29765  spur - m formula\r\nincrease sperm production ...      1
29766  hello , welcome to the medzonlin claiming e\r\...      1

[29767 rows x 2 columns]


None
Epoch 1/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.8586 - loss: 0.3042 - val_accuracy: 0.9431 - val_loss: 0.1491
Epoch 2/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9680 - loss: 0.0829 - val_accuracy: 0.9444 - val_loss: 0.1678
Epoch 3/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9843 - loss: 0.0448 - val_accuracy: 0.9360 - val_loss: 0.2337
<keras.src.callbacks.history.History object at 0x000001354D0EC130>
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9310 - loss: 0.2316




Test Accuracy: 0.94
Модель сохранена!
data/original/fraud.csv
                                                     body  label
0       Status John: I'm not really sure what happened...      0
1       re:summer inverses i suck-hope youve made more...      0
2       The WTI Bullet swap contracts Hi, Following th...      0
3       Fwd: NYTimes.com Article: Suspended Rabbi Quit...      0
4       daily charts and matrices as hot links 5/15 Th...      0
...                                                   ...    ...
447412  Review Board Books w/Rebecca C./BillB/DaveG/Mi...      0
447413  Audit Committee Materials meeting Two meetings...      0
447414  Credit Story Rick/Bill/David, Generally, we ha...      0
447415  Commodity Group Limit Issue In addition to the...      0
447416  Calley Hayes with Deutsch Bank Rick, Calley's ...      0

[447417 rows x 2 columns]


None
Epoch 1/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 6ms/step - accuracy: 0.9949 - loss: 0.0271 - val_accuracy: 0.9964 - val_loss: 0.0143
Epoch 2/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 5ms/step - accuracy: 0.9967 - loss: 0.0109 - val_accuracy: 0.9967 - val_loss: 0.0131
Epoch 3/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 5ms/step - accuracy: 0.9976 - loss: 0.0082 - val_accuracy: 0.9966 - val_loss: 0.0131
<keras.src.callbacks.history.History object at 0x00000135616C99D0>
[1m2792/2792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9966 - loss: 0.0131




Test Accuracy: 1.00
Модель сохранена!
