In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Bidirectional
from sklearn.metrics import roc_auc_score

# Load your dataset
data = pd.read_csv("train.csv")  # Replace with your actual file path

# Preprocess the target column
encoder = LabelEncoder()
data['category'] = encoder.fit_transform(data['category'])  # Adjust to 'category' or 'sub-category' as needed
num_classes = len(encoder.classes_)

# Handle missing or non-string values in 'crimeaditionalinfo' column
data['crimeaditionalinfo'] = data['crimeaditionalinfo'].fillna("").astype(str)

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['crimeaditionalinfo'])
X = tokenizer.texts_to_sequences(data['crimeaditionalinfo'])
X = pad_sequences(X, maxlen=100)  # Set maxlen based on the average text length in your dataset

# Convert labels to categorical (one-hot encoded) format for multi-class classification
y = to_categorical(data['category'], num_classes=num_classes)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a function to build different models
def build_model(model_type):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
    if model_type == 'SimpleRNN':
        model.add(SimpleRNN(64))
    elif model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'GRU':
        model.add(GRU(64))
    elif model_type == 'Bi-LSTM':
        model.add(Bidirectional(LSTM(64)))
    model.add(Dense(num_classes, activation='softmax'))  # Use softmax for multi-class classification
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate each model
results = []
for model_type in ['SimpleRNN', 'LSTM', 'GRU', 'Bi-LSTM']:
    print(f"Training {model_type} model...")
    model = build_model(model_type)
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)
    
    # Predict and evaluate
    y_pred = model.predict(X_test)
    
    # Calculate AUC score for multi-class classification using the 'ovr' strategy
    try:
        auc_score = roc_auc_score(y_test, y_pred, multi_class='ovr')
    except ValueError:
        auc_score = roc_auc_score(y_test, y_pred)

    results.append({'Model': model_type, 'AUC_Score': auc_score})

# Convert results to DataFrame and display
results_df = pd.DataFrame(results).sort_values(by='AUC_Score', ascending=False)
print(results_df)


Training SimpleRNN model...
Epoch 1/5




[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.6792 - loss: 1.0720 - val_accuracy: 0.7115 - val_loss: 0.8790
Epoch 2/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7387 - loss: 0.7981 - val_accuracy: 0.6791 - val_loss: 0.9387
Epoch 3/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7619 - loss: 0.7325 - val_accuracy: 0.6813 - val_loss: 0.9492
Epoch 4/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7736 - loss: 0.6964 - val_accuracy: 0.6597 - val_loss: 1.0502
Epoch 5/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.7725 - loss: 0.7015 - val_accuracy: 0.6726 - val_loss: 1.0064
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Training LSTM model...
Epoch 1/5




[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 16ms/step - accuracy: 0.6995 - loss: 1.0070 - val_accuracy: 0.7440 - val_loss: 0.7632
Epoch 2/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 16ms/step - accuracy: 0.7589 - loss: 0.7083 - val_accuracy: 0.7518 - val_loss: 0.7202
Epoch 3/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 16ms/step - accuracy: 0.7923 - loss: 0.6106 - val_accuracy: 0.7516 - val_loss: 0.7184
Epoch 4/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 16ms/step - accuracy: 0.8235 - loss: 0.5210 - val_accuracy: 0.7440 - val_loss: 0.7649
Epoch 5/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 16ms/step - accuracy: 0.8542 - loss: 0.4336 - val_accuracy: 0.7366 - val_loss: 0.8270
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Training GRU model...
Epoch 1/5




[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 20ms/step - accuracy: 0.6999 - loss: 0.9985 - val_accuracy: 0.7520 - val_loss: 0.7374
Epoch 2/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 19ms/step - accuracy: 0.7741 - loss: 0.6753 - val_accuracy: 0.7440 - val_loss: 0.7137
Epoch 3/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 19ms/step - accuracy: 0.8051 - loss: 0.5775 - val_accuracy: 0.7502 - val_loss: 0.7377
Epoch 4/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 19ms/step - accuracy: 0.8395 - loss: 0.4730 - val_accuracy: 0.7362 - val_loss: 0.8033
Epoch 5/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 19ms/step - accuracy: 0.8777 - loss: 0.3731 - val_accuracy: 0.7226 - val_loss: 0.9417
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
Training Bi-LSTM model...
Epoch 1/5




[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 20ms/step - accuracy: 0.7156 - loss: 0.9357 - val_accuracy: 0.7424 - val_loss: 0.7695
Epoch 2/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 20ms/step - accuracy: 0.7605 - loss: 0.7185 - val_accuracy: 0.7494 - val_loss: 0.7397
Epoch 3/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 21ms/step - accuracy: 0.7857 - loss: 0.6310 - val_accuracy: 0.7465 - val_loss: 0.7416
Epoch 4/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 30ms/step - accuracy: 0.8120 - loss: 0.5537 - val_accuracy: 0.7479 - val_loss: 0.7632
Epoch 5/5
[1m1874/1874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 20ms/step - accuracy: 0.8437 - loss: 0.4683 - val_accuracy: 0.7418 - val_loss: 0.8346
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step
       Model  AUC_Score
2        GRU   0.859584
3    Bi-LSTM   0.856443
1       LSTM   0.831739
0  SimpleRNN  

: 