In [9]:
import pandas as pd
from keras import Model
from keras.src.layers import Bidirectional
from keras.src.layers import Embedding, LSTM, Dense
from keras import Sequential
from sklearn.model_selection import train_test_split
from typing import Tuple
from pathlib import Path
from app.utils import create_pad_sequences, clean_text, save_model, create_tokenizer, RANDOM_SEED, BODY_COL_NAME, LABEL_COL_NAME, save_tokenizer
from keras.src.legacy.preprocessing.text import Tokenizer


# Relative prefix used to build every data, model and tokenizer path in this notebook.
# NOTE(review): presumably points at the repository root relative to the notebook's
# working directory — confirm before running from a different location.
ROOT_DIR = '../../'

In [10]:
def read_csv(path: str, body_column_name=BODY_COL_NAME, label_column_name=LABEL_COL_NAME) -> pd.DataFrame:
    """Load the text and label columns of a CSV file under canonical names.

    Only ``body_column_name`` and ``label_column_name`` are read from ``path``.
    The returned frame always has the body column first and the label column
    second, renamed to ``BODY_COL_NAME`` and ``LABEL_COL_NAME``.

    Args:
        path: Location of the comma-separated, double-quote-quoted CSV file.
        body_column_name: Name of the text column as it appears in the file.
        label_column_name: Name of the label column as it appears in the file.

    Returns:
        A two-column DataFrame ``[BODY_COL_NAME, LABEL_COL_NAME]``.
    """
    wanted = [body_column_name, label_column_name]
    frame = pd.read_csv(path, quotechar='"', delimiter=',', usecols=wanted)
    # Force body-then-label order regardless of how the file lays the columns out.
    frame = frame.reindex(columns=wanted)
    return frame.set_axis([BODY_COL_NAME, LABEL_COL_NAME], axis='columns')

In [11]:
def create_model(max_words: int = 10000, embedding_dim: int = 128, lstm_units: int = 64) -> Model:
    """Build and compile a bidirectional-LSTM binary text classifier.

    The network is Embedding -> Bidirectional(LSTM) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and the
    ``accuracy`` metric. The model summary is printed as a side effect.

    Args:
        max_words: Largest token index expected in the input; the embedding
            table is sized ``max_words + 1``.
            NOTE(review): this should agree with the vocabulary limit used by
            the tokenizer, which is configured outside this cell — confirm.
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in each direction of the LSTM.

    Returns:
        The compiled Keras model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_words + 1, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=lstm_units)))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the cell output.
    model.summary()
    return model



In [12]:
def train_model(
        model: Model,
        X_train_pad: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test_pad: pd.DataFrame,
        y_test: pd.DataFrame,
        epochs=5,
        batch_size=32,
):
    """Fit ``model`` on the training data, reporting validation metrics each epoch.

    Args:
        model: A compiled Keras model.
        X_train_pad: Padded training sequences.
        y_train: Training labels.
        X_test_pad: Padded sequences passed to ``fit`` as validation inputs.
        y_test: Labels passed to ``fit`` as validation targets.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update (defaults to the
            previously hard-coded value of 32).

    Returns:
        Whatever ``model.fit`` returns (the training history).

    Note:
        The parameters are annotated as ``pd.DataFrame``, but in this notebook
        the labels are produced via ``.values.astype(int)``, i.e. they are
        arrays rather than DataFrames; any input accepted by ``model.fit`` works.
    """
    return model.fit(
        X_train_pad,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test_pad, y_test),
    )

In [13]:
def validate_model(model: Model, X_test: pd.DataFrame, y_test: pd.DataFrame) -> Tuple[float, float]:
    """Evaluate ``model`` on held-out data, print its accuracy and return the metrics.

    Args:
        model: A trained model whose ``evaluate`` yields exactly two values,
            loss then accuracy (true for models compiled with a single
            ``accuracy`` metric).
        X_test: Inputs to evaluate on.
        y_test: Expected labels for ``X_test``.

    Returns:
        ``(loss, accuracy)`` as reported by ``model.evaluate``, so callers can
        record or compare the numbers instead of parsing the printed text.
    """
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {accuracy:.2f}')
    return loss, accuracy


In [14]:
def train_test_prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, Tokenizer]:
    """Clean the text, split into train/test sets and convert both to padded sequences.

    Steps:
        1. Apply ``clean_text`` to every body and drop rows whose cleaned body is empty.
        2. Split 80/20 with ``RANDOM_SEED`` for reproducibility.
        3. Build the tokenizer from the training texts only, then use it to
           encode both the training and the test texts.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Frame with ``BODY_COL_NAME`` (text) and ``LABEL_COL_NAME`` (label) columns.

    Returns:
        ``(X_train_pad, X_test_pad, y_train, y_test, tokenizer)``.
        NOTE(review): the labels are integer arrays (from ``.values.astype(int)``),
        not DataFrames as the annotation suggests; the type of the padded
        inputs depends on ``create_pad_sequences``, defined outside this notebook.
    """
    # Work on a copy so the caller's DataFrame is not silently rewritten
    # (the original assigned the cleaned text back into the argument).
    cleaned = df.copy()
    cleaned[BODY_COL_NAME] = cleaned[BODY_COL_NAME].apply(clean_text)
    cleaned = cleaned.loc[cleaned[BODY_COL_NAME].str.len() > 0]
    X = cleaned[BODY_COL_NAME]
    y = cleaned[LABEL_COL_NAME].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
    # The tokenizer only ever sees training text, so no test vocabulary leaks into it.
    tokenizer = create_tokenizer(X_train)
    X_train_pad = create_pad_sequences(tokenizer, X_train)
    X_test_pad = create_pad_sequences(tokenizer, X_test)

    return X_train_pad, X_test_pad, y_train, y_test, tokenizer


In [15]:
# One entry per dataset: (csv path, name of the text column, name of the label column).
data = [
    (f'{ROOT_DIR}data/original/spam.csv', 'text', LABEL_COL_NAME),
    (f'{ROOT_DIR}data/original/fishing.csv', BODY_COL_NAME, LABEL_COL_NAME),
    (f'{ROOT_DIR}data/original/fraud.csv', 'Body', 'Label'),
]

# Train, evaluate and persist an independent model + tokenizer for each dataset.
for file_path, body_col_name, label_col_name in data:
    df = read_csv(file_path, body_column_name=body_col_name, label_column_name=label_col_name)
    print(file_path)
    print(df)
    X_train_pad, X_test_pad, y_train, y_test, tokenizer = train_test_prepare_data(df)
    model = create_model()
    # NOTE: the test split doubles as the validation set during fitting, so the
    # accuracy printed by validate_model repeats the last epoch's val_accuracy.
    train_model(model, X_train_pad, y_train, X_test_pad, y_test, epochs=3)
    validate_model(model, X_test_pad, y_test)
    # Artifacts are named after the dataset file without its extension, e.g. "spam".
    model_name = Path(file_path).stem
    save_model(model, f'{ROOT_DIR}models/{model_name}')
    save_tokenizer(tokenizer, f'{ROOT_DIR}tokenizers/{model_name}')



../../data/original/spam.csv
                                                    body  label
0      ounce feather bowl hummingbird opec moment ala...      1
1      wulvob get your medircations online qnb ikud v...      1
2       computer connection from cnn com wednesday es...      0
3      university degree obtain a prosperous future m...      1
4      thanks for all your answers guys i know i shou...      0
...                                                  ...    ...
83443  hi given a date how do i get the last date of ...      0
83444  now you can order software on cd or download i...      1
83445  dear valued member canadianpharmacy provides a...      1
83446  subscribe change profile contact us long term ...      0
83447  get the most out of life ! viagra has helped m...      1

[83448 rows x 2 columns]


None
Epoch 1/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9028 - loss: 0.2270 - val_accuracy: 0.9501 - val_loss: 0.1310
Epoch 2/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9686 - loss: 0.0836 - val_accuracy: 0.9512 - val_loss: 0.1291
Epoch 3/3
[1m2087/2087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9810 - loss: 0.0510 - val_accuracy: 0.9512 - val_loss: 0.1481
You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9513 - loss: 0.1438




Test Accuracy: 0.95
Модель сохранена!
../../data/original/fishing.csv
                                                    body  label
0      ( see attached file : hplno 525 . xls )\r\n- h...      0
1      - - - - - - - - - - - - - - - - - - - - - - fo...      0
2      estimated actuals\r\nmarch 30 , 2001\r\nno flo...      0
3      ( see attached file : hplno 530 . xls )\r\n- h...      0
4      ( see attached file : hplno 601 . xls )\r\n- h...      0
...                                                  ...    ...
29762  hello ,\r\nmy boyfriend began having problems ...      1
29763  love - potion for your darling is all you want...      1
29764  you have feelings of guilt and embarrassment  ...      1
29765  spur - m formula\r\nincrease sperm production ...      1
29766  hello , welcome to the medzonlin claiming e\r\...      1

[29767 rows x 2 columns]


None
Epoch 1/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8479 - loss: 0.3111 - val_accuracy: 0.9402 - val_loss: 0.1527
Epoch 2/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9689 - loss: 0.0858 - val_accuracy: 0.9424 - val_loss: 0.1577
Epoch 3/3
[1m745/745[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9823 - loss: 0.0520 - val_accuracy: 0.9407 - val_loss: 0.1905
You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9401 - loss: 0.1847




Test Accuracy: 0.94
Модель сохранена!
../../data/original/fraud.csv
                                                     body  label
0       Status John: I'm not really sure what happened...      0
1       re:summer inverses i suck-hope youve made more...      0
2       The WTI Bullet swap contracts Hi, Following th...      0
3       Fwd: NYTimes.com Article: Suspended Rabbi Quit...      0
4       daily charts and matrices as hot links 5/15 Th...      0
...                                                   ...    ...
447412  Review Board Books w/Rebecca C./BillB/DaveG/Mi...      0
447413  Audit Committee Materials meeting Two meetings...      0
447414  Credit Story Rick/Bill/David, Generally, we ha...      0
447415  Commodity Group Limit Issue In addition to the...      0
447416  Calley Hayes with Deutsch Bank Rick, Calley's ...      0

[447417 rows x 2 columns]


None
Epoch 1/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 5ms/step - accuracy: 0.9948 - loss: 0.0277 - val_accuracy: 0.9960 - val_loss: 0.0152
Epoch 2/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 4ms/step - accuracy: 0.9968 - loss: 0.0112 - val_accuracy: 0.9967 - val_loss: 0.0134
Epoch 3/3
[1m11165/11165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 4ms/step - accuracy: 0.9975 - loss: 0.0084 - val_accuracy: 0.9970 - val_loss: 0.0124
You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
[1m2792/2792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9968 - loss: 0.0120




Test Accuracy: 1.00
Модель сохранена!
