In [70]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump
import os

In [73]:
# Load dataset
df = pd.read_csv("Incident_dataset.csv")
y = df["label"]

In [74]:
# Load SBERT (MiniLM)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate SBERT embeddings
print("Generating SBERT embeddings...")
sbert_embeddings = sbert_model.encode(df["message"].tolist(), convert_to_numpy=True)
print("SBERT embedding shape:", sbert_embeddings.shape)

# Generate TF-IDF features
print("Generating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=200)
tfidf_features = tfidf_vectorizer.fit_transform(df["message"]).toarray()
print("TF-IDF shape:", tfidf_features.shape)

# Save TF-IDF vectorizer
os.makedirs("models", exist_ok=True)
dump(tfidf_vectorizer, "models/tfidf_vectorizer.joblib")
print("TF-IDF vectorizer saved to models/tfidf_vectorizer.joblib")

# Combine SBERT and TF-IDF
X = np.concatenate((sbert_embeddings, tfidf_features), axis=1)
print("Final feature shape:", X.shape)

# Optionally save X and y for later use
dump(X, "models/features_X.joblib")
dump(y, "models/labels_y.joblib")
print("Features and labels saved.")


Generating SBERT embeddings...
SBERT embedding shape: (15000, 384)
Generating TF-IDF features...
TF-IDF shape: (15000, 200)
TF-IDF vectorizer saved to models/tfidf_vectorizer.joblib
Final feature shape: (15000, 584)
Features and labels saved.


In [75]:
# Encode labels
y_raw = df["label"].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)
joblib.dump(label_encoder, "models/label_encoder.joblib")

['models/label_encoder.joblib']

In [76]:
# Step 3: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
# Step 4: Train Random Forest
print("Training Random Forest...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
joblib.dump(rf, "models/rf_classifier.joblib")

Training Random Forest...


['models/rf_classifier.joblib']

In [78]:
# Step 5: Train Logistic Regression
print("Training Logistic Regression...")
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
joblib.dump(lr, "models/lr_classifier.joblib")

Training Logistic Regression...


['models/lr_classifier.joblib']

In [79]:
# Step 6: Train XGBoost
print("Training XGBoost...")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
joblib.dump(xgb, "models/xgb_classifier.joblib")

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



['models/xgb_classifier.joblib']

In [80]:
# Step 7: Train ANN
print("Training ANN...")
y_cat = to_categorical(y)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

ann = Sequential([
    Dense(256, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')
])

Training ANN...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [81]:
ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
ann.fit(X_train, y_train_cat, epochs=30, batch_size=64, validation_data=(X_test, y_test_cat))
ann.save("models/ann_model.h5")

print("Training complete.")

Epoch 1/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8383 - loss: 0.9765 - val_accuracy: 1.0000 - val_loss: 0.0011
Epoch 2/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0036 - val_accuracy: 1.0000 - val_loss: 2.0940e-04
Epoch 3/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 1.0000 - val_loss: 7.6578e-05
Epoch 4/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 6.1369e-04 - val_accuracy: 1.0000 - val_loss: 3.6473e-05
Epoch 5/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 3.4732e-04 - val_accuracy: 1.0000 - val_loss: 2.0330e-05
Epoch 6/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 2.8832e-04 - val_accuracy: 1.0000 - val_loss: 1.2124



Training complete.
