In [2]:
import pandas as pd
from keras import Model
from keras.src.layers import Bidirectional
from keras.src.layers import Embedding, LSTM, Dense
from keras import Sequential
from sklearn.model_selection import train_test_split
from typing import Tuple
from pathlib import Path
from app.utils import create_pad_sequences, clean_text, save_model, create_tokenizer, RANDOM_SEED, BODY_COL_NAME, \
    LABEL_COL_NAME, save_tokenizer
from keras.src.legacy.preprocessing.text import Tokenizer
from app.utils import max_words

# Relative prefix prepended to every data/, models/ and tokenizers/ path built in this notebook
# (presumably the repository root as seen from the notebook's directory — confirm).
ROOT_DIR = '../../'

In [3]:
def read_csv(path: str, body_column_name=BODY_COL_NAME, label_column_name=LABEL_COL_NAME) -> pd.DataFrame:
    """Load a text/label dataset from a CSV file into a two-column frame.

    Args:
        path: Location of the CSV file.
        body_column_name: Name of the text column as it appears in the file.
        label_column_name: Name of the label column as it appears in the file.

    Returns:
        A DataFrame holding only those two columns, ordered (text, label) and
        renamed to ``BODY_COL_NAME`` / ``LABEL_COL_NAME`` so that every dataset
        shares the same schema.
    """
    source_columns = [body_column_name, label_column_name]
    return (
        # Read only the two columns we actually need.
        pd.read_csv(path, quotechar='"', delimiter=',', usecols=source_columns)
        # Force (text, label) order regardless of the column order in the file.
        .reindex(columns=source_columns)
        # Rename to the common column names used by the rest of the pipeline.
        .set_axis([BODY_COL_NAME, LABEL_COL_NAME], axis=1)
    )

In [4]:
def create_model() -> Model:
    """Build and compile the binary text classifier.

    The network is an Embedding layer (128-dimensional vectors), a
    bidirectional LSTM with 64 units and a single sigmoid output unit. It is
    compiled with binary cross-entropy loss, the Adam optimizer and accuracy
    as the reported metric. The layer summary is printed as a side effect.

    Returns:
        The compiled, untrained model.
    """
    model = Sequential([
        # NOTE(review): the "+ 1" presumably reserves an extra index on top of
        # max_words (e.g. padding / OOV) — confirm against create_tokenizer.
        Embedding(input_dim=max_words + 1, output_dim=128),
        Bidirectional(LSTM(units=64)),
        Dense(units=1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the cell output.
    model.summary()
    return model



In [5]:
def train_model(
        model: Model,
        X_train_pad: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test_pad: pd.DataFrame,
        y_test: pd.DataFrame,
        epochs=5,
        batch_size=32,
):
    """Fit ``model`` on the training data, validating on the held-out data.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train_pad: Training inputs, passed to ``fit`` unchanged.
        y_train: Training labels, passed to ``fit`` unchanged.
        X_test_pad: Inputs reported on as ``validation_data`` after each epoch.
        y_test: Labels reported on as ``validation_data`` after each epoch.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update. Defaults to 32,
            the value that used to be hard-coded here.

    Returns:
        Whatever ``model.fit`` returns (for Keras models, the training history).
    """
    return model.fit(
        X_train_pad,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test_pad, y_test),
    )

In [6]:
def validate_model(model: Model, X_test: pd.DataFrame, y_test: pd.DataFrame):
    """Evaluate ``model`` on the given data and print its accuracy.

    Args:
        model: Object whose ``evaluate(X, y)`` yields exactly two values,
            loss first and accuracy second.
        X_test: Inputs to evaluate on.
        y_test: Expected labels for ``X_test``.

    Prints ``Test Accuracy: <value>`` rounded to two decimals; returns nothing.
    """
    # The loss is part of the result but is not reported here.
    _, test_accuracy = model.evaluate(X_test, y_test)
    message = f'Test Accuracy: {test_accuracy:.2f}'
    print(message)


In [7]:
def train_test_prepare_data(
        df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, Tokenizer]:
    """Clean, split and tokenize a (text, label) dataset for training.

    Steps: clean every text with ``clean_text``, drop rows whose cleaned text
    is empty, split 80/20 into train and test sets (seeded with
    ``RANDOM_SEED``), fit a tokenizer on the training texts only, and convert
    both splits with ``create_pad_sequences``.

    The input frame is not modified; cleaning is done on a separate Series.

    Args:
        df: Frame with a ``BODY_COL_NAME`` text column and a ``LABEL_COL_NAME``
            column whose values can be cast to ``int``.

    Returns:
        ``(X_train_pad, X_test_pad, y_train, y_test, tokenizer)``. The labels
        are split from an integer NumPy array (``.values.astype(int)``); the
        type of the padded inputs is whatever ``create_pad_sequences`` returns.
    """
    # Clean the text into a new Series instead of writing back into the
    # caller's frame, so calling this function never alters its argument.
    cleaned_body = df[BODY_COL_NAME].apply(clean_text)
    # Keep only the rows whose text is non-empty after cleaning.
    non_empty = cleaned_body.str.len() > 0
    X = cleaned_body[non_empty]
    y = df.loc[non_empty, LABEL_COL_NAME].values.astype(int)

    # Split into training and test sets, 80% / 20%.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
    # Fit the tokenizer on the training texts only, so the test split does not
    # influence the vocabulary.
    tokenizer = create_tokenizer(X_train)
    # Turn the texts of both splits into padded token sequences.
    X_train_pad = create_pad_sequences(tokenizer, X_train)
    X_test_pad = create_pad_sequences(tokenizer, X_test)

    return X_train_pad, X_test_pad, y_train, y_test, tokenizer


In [9]:
# Training runs, executed in the order listed. Each entry is
# (path to the dataset CSV, name of the text column, name of the label column).
data = [
    (f'{ROOT_DIR}data/original/spam.csv', 'text', LABEL_COL_NAME),
    (f'{ROOT_DIR}data/original/phishing.csv', BODY_COL_NAME, LABEL_COL_NAME),
    (f'{ROOT_DIR}data/original/fraud.csv', 'Body', 'Label'),
]

# Train, evaluate and save one model (plus its tokenizer) per dataset.
for file_path, body_col_name, label_col_name in data:
    df = read_csv(file_path, body_column_name=body_col_name, label_column_name=label_col_name)
    print(file_path)
    print(df)
    X_train_pad, X_test_pad, y_train, y_test, tokenizer = train_test_prepare_data(df)
    model = create_model()
    train_model(model, X_train_pad, y_train, X_test_pad, y_test, epochs=3)
    validate_model(model, X_test_pad, y_test)
    # Artifacts are named after the CSV file without its extension
    # (e.g. "spam.csv" -> "spam").
    model_name = Path(file_path).stem
    save_model(model, f'{ROOT_DIR}models/{model_name}')
    save_tokenizer(tokenizer, f'{ROOT_DIR}tokenizers/{model_name}')



../../data/original/spam.csv
                                                    body  label
0      ounce feather bowl hummingbird opec moment ala...      1
1      wulvob get your medircations online qnb ikud v...      1
2       computer connection from cnn com wednesday es...      0
3      university degree obtain a prosperous future m...      1
4      thanks for all your answers guys i know i shou...      0
...                                                  ...    ...
83443  hi given a date how do i get the last date of ...      0
83444  now you can order software on cd or download i...      1
83445  dear valued member canadianpharmacy provides a...      1
83446  subscribe change profile contact us long term ...      0
83447  get the most out of life ! viagra has helped m...      1

[83448 rows x 2 columns]


None
Epoch 1/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9032 - loss: 0.2287 - val_accuracy: 0.9522 - val_loss: 0.1272
Epoch 2/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9695 - loss: 0.0837 - val_accuracy: 0.9517 - val_loss: 0.1242
Epoch 3/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9790 - loss: 0.0556 - val_accuracy: 0.9528 - val_loss: 0.1409
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9499 - loss: 0.1460




Test Accuracy: 0.95
Модель сохранена!
../../data/original/phishing.csv
                                                    body  label
0      ( see attached file : hplno 525 . xls )\r\n- h...      0
1      - - - - - - - - - - - - - - - - - - - - - - fo...      0
2      estimated actuals\r\nmarch 30 , 2001\r\nno flo...      0
3      ( see attached file : hplno 530 . xls )\r\n- h...      0
4      ( see attached file : hplno 601 . xls )\r\n- h...      0
...                                                  ...    ...
29762  hello ,\r\nmy boyfriend began having problems ...      1
29763  love - potion for your darling is all you want...      1
29764  you have feelings of guilt and embarrassment  ...      1
29765  spur - m formula\r\nincrease sperm production ...      1
29766  hello , welcome to the medzonlin claiming e\r\...      1

[29767 rows x 2 columns]


None
Epoch 1/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8514 - loss: 0.3023 - val_accuracy: 0.9389 - val_loss: 0.1589
Epoch 2/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9710 - loss: 0.0815 - val_accuracy: 0.9362 - val_loss: 0.1741
Epoch 3/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9838 - loss: 0.0467 - val_accuracy: 0.9389 - val_loss: 0.1893
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9363 - loss: 0.2017




Test Accuracy: 0.94
Модель сохранена!
../../data/original/fraud.csv
                                                     body  label
0       Status John: I'm not really sure what happened...      0
1       re:summer inverses i suck-hope youve made more...      0
2       The WTI Bullet swap contracts Hi, Following th...      0
3       Fwd: NYTimes.com Article: Suspended Rabbi Quit...      0
4       daily charts and matrices as hot links 5/15 Th...      0
...                                                   ...    ...
447412  Review Board Books w/Rebecca C./BillB/DaveG/Mi...      0
447413  Audit Committee Materials meeting Two meetings...      0
447414  Credit Story Rick/Bill/David, Generally, we ha...      0
447415  Commodity Group Limit Issue In addition to the...      0
447416  Calley Hayes with Deutsch Bank Rick, Calley's ...      0

[447417 rows x 2 columns]


None
Epoch 1/3
[1m11164/11164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 5ms/step - accuracy: 0.9947 - loss: 0.0272 - val_accuracy: 0.9958 - val_loss: 0.0152
Epoch 2/3
[1m11164/11164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - accuracy: 0.9968 - loss: 0.0111 - val_accuracy: 0.9962 - val_loss: 0.0139
Epoch 3/3
[1m11164/11164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 4ms/step - accuracy: 0.9975 - loss: 0.0082 - val_accuracy: 0.9969 - val_loss: 0.0136
[1m2791/2791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9968 - loss: 0.0141




Test Accuracy: 1.00
Модель сохранена!
