# Classification Models: SVM, Random Forest, and Neural Network

This notebook trains three classification models to predict RiskRating from the ASSI-A-Responses dataset.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by any later
# cell are hidden as well -- confirm that is the intent.
warnings.filterwarnings("ignore")

# Confirm the environment is ready and record which TensorFlow build is in use.
print("Libraries imported successfully!")
print("TensorFlow version:", tf.__version__)


## 1. Load and Explore Data


In [None]:
# Read the labelled responses into a DataFrame and print a quick overview:
# shape, a sample of rows, column names, dtypes, the total number of missing
# cells, and how the RiskRating target is distributed.
df = pd.read_csv('ASSI-A-Responses Labeled.csv')

print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")
print("\nColumn names:", df.columns.tolist())
print("\nData types:", df.dtypes, sep="\n")

# Total count of missing cells across the whole frame (a single number).
total_missing = df.isna().sum().sum()
print("\nMissing values:", total_missing, sep="\n")

print("\nTarget variable distribution:", df['RiskRating'].value_counts(), sep="\n")


## 2. Data Preprocessing


In [None]:
# Build the feature matrix and the label vector.
# StudentNumber is dropped because it is not a feature; RiskRating is the target.
X = df.drop(columns=['StudentNumber', 'RiskRating'])
y = df['RiskRating']

# Replace the categorical Gender column with integer codes.
label_encoder_gender = LabelEncoder().fit(X['Gender'])
X['Gender'] = label_encoder_gender.transform(X['Gender'])

# Encode the target labels as integers; the fitted encoder is kept so the
# original class names can be recovered later via `classes_`.
label_encoder_target = LabelEncoder().fit(y)
y_encoded = label_encoder_target.transform(y)

print("Features shape:", X.shape)
print("Target shape:", y_encoded.shape)
print("\nFeature columns:", X.columns.tolist())
print("\nTarget classes:", label_encoder_target.classes_)

# Per-class sample counts, shown with both the integer code and the class name.
print("\nEncoded target distribution:")
unique, counts = np.unique(y_encoded, return_counts=True)
for u, c in zip(unique, counts):
    class_name = label_encoder_target.classes_[u]
    print(f"  Class {u} ({class_name}): {c} samples")


## 3. Train/Validation/Test Split (80/10/10)


In [None]:
# Stage 1: hold out 20% of the rows (stratified on the encoded target);
# the remaining 80% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Stage 2: cut the held-out 20% in half, again stratified, which yields
# validation and test sets of roughly 10% of the original data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report the size of each split and its share of the full dataset.
for split_name, split_X in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape[0]} samples ({split_X.shape[0]/len(X)*100:.1f}%)")

print(f"\nTotal samples: {sum(part.shape[0] for part in (X_train, X_val, X_test))}")


In [None]:
# Standardise the features for the SVM and the Neural Network.
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to the validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

print("Features scaled successfully!")
print("Scaled training set shape:", X_train_scaled.shape)


## 4. Model 1: Support Vector Machine (SVM)


In [None]:
# Fit an RBF-kernel SVM on the scaled training features, then predict the
# class of every row in the train, validation and test splits.
print("Training SVM model...")
# NOTE(review): probability=True is requested, but no visible cell calls
# predict_proba on this model -- confirm whether it is still needed.
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train_scaled, y_train)

# One prediction vector per split, in train / validation / test order.
y_train_pred_svm, y_val_pred_svm, y_test_pred_svm = (
    svm_model.predict(features)
    for features in (X_train_scaled, X_val_scaled, X_test_scaled)
)

print("SVM model trained successfully!")


In [None]:
# SVM Evaluation Report
# For each split, print the accuracy, the per-class classification report
# (labelled with the original class names) and the confusion matrix.
banner = "=" * 60
print(banner)
print("SVM MODEL EVALUATION REPORT")
print(banner)

svm_splits = (
    ("Training", y_train, y_train_pred_svm),
    ("Validation", y_val, y_val_pred_svm),
    ("Test", y_test, y_test_pred_svm),
)

for split_name, y_true, y_pred in svm_splits:
    print(f"\n--- {split_name} Set Performance ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred,
                                target_names=label_encoder_target.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print(banner)


## 5. Model 2: Random Forest


In [None]:
# Fit a 100-tree Random Forest and predict on all three splits.
# Unlike the SVM cell, this model is trained on the unscaled feature frames.
print("Training Random Forest model...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# One prediction vector per split, in train / validation / test order.
y_train_pred_rf, y_val_pred_rf, y_test_pred_rf = (
    rf_model.predict(features) for features in (X_train, X_val, X_test)
)

print("Random Forest model trained successfully!")


In [None]:
# Random Forest Evaluation Report
# For each split, print the accuracy, the per-class classification report
# (labelled with the original class names) and the confusion matrix.
banner = "=" * 60
print(banner)
print("RANDOM FOREST MODEL EVALUATION REPORT")
print(banner)

rf_splits = (
    ("Training", y_train, y_train_pred_rf),
    ("Validation", y_val, y_val_pred_rf),
    ("Test", y_test, y_test_pred_rf),
)

for split_name, y_true, y_pred in rf_splits:
    print(f"\n--- {split_name} Set Performance ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred,
                                target_names=label_encoder_target.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print(banner)


In [None]:
# Pair every feature column with the importance score reported by the fitted
# Random Forest, ordered from most to least important.
importance_by_feature = {
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='importance', ascending=False
)

print("Top 10 Most Important Features (Random Forest):")
print(feature_importance.iloc[:10])


## 6. Model 3: Neural Network (TensorFlow/Keras)


In [None]:
# Network dimensions: input width comes from the scaled training matrix and
# output width from the number of distinct labels in the training split.
n_classes = np.unique(y_train).size
n_features = X_train_scaled.shape[1]

print("Number of features:", n_features)
print("Number of classes:", n_classes)

# One-hot encode the integer labels of every split for the neural network.
y_train_categorical, y_val_categorical, y_test_categorical = (
    keras.utils.to_categorical(labels, n_classes)
    for labels in (y_train, y_val, y_test)
)

print("Training labels shape:", y_train_categorical.shape)


In [None]:
# Build Neural Network model
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation is repeatable after a kernel restart, in line with the
# random_state=42 used for the SVM and Random Forest models.
# NOTE(review): bit-for-bit identical training may additionally depend on
# hardware / op-level determinism settings.
np.random.seed(42)
tf.random.set_seed(42)

# Fully connected classifier: three ReLU hidden layers (128 -> 64 -> 32), each
# followed by dropout, and a softmax output with one unit per class.
# The input width is declared with an explicit Input object rather than an
# `input_shape=` argument on the first Dense layer.
nn_model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(n_classes, activation='softmax')
])

# Compile the model
# categorical_crossentropy matches the one-hot encoded label arrays.
nn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Display model architecture
print("Neural Network Architecture:")
nn_model.summary()


In [None]:
# Train the Neural Network
# The validation split is monitored every epoch. Training stops once the
# validation loss has not improved for 10 consecutive epochs (50 epochs remains
# the upper bound), and restore_best_weights asks Keras to roll the model back
# to the best weights seen, so the evaluated model is not simply whatever the
# final epoch produced.
# NOTE(review): re-running this cell without re-running the build cell
# continues training from the current weights.
print("Training Neural Network model...")
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = nn_model.fit(
    X_train_scaled, y_train_categorical,
    validation_data=(X_val_scaled, y_val_categorical),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

print("\nNeural Network model trained successfully!")


In [None]:
# Neural Network Predictions
# Step 1: class-probability matrices for each split (train, validation, test).
y_train_pred_nn_proba = nn_model.predict(X_train_scaled)
y_val_pred_nn_proba = nn_model.predict(X_val_scaled)
y_test_pred_nn_proba = nn_model.predict(X_test_scaled)

# Step 2: the predicted class is the column with the highest probability.
y_train_pred_nn = y_train_pred_nn_proba.argmax(axis=1)
y_val_pred_nn = y_val_pred_nn_proba.argmax(axis=1)
y_test_pred_nn = y_test_pred_nn_proba.argmax(axis=1)

print("Neural Network predictions generated!")


In [None]:
# Neural Network Evaluation Report
# For each split, print the accuracy, the per-class classification report
# (labelled with the original class names) and the confusion matrix.
banner = "=" * 60
print(banner)
print("NEURAL NETWORK MODEL EVALUATION REPORT")
print(banner)

nn_splits = (
    ("Training", y_train, y_train_pred_nn),
    ("Validation", y_val, y_val_pred_nn),
    ("Test", y_test, y_test_pred_nn),
)

for split_name, y_true, y_pred in nn_splits:
    print(f"\n--- {split_name} Set Performance ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred,
                                target_names=label_encoder_target.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print(banner)


In [None]:
# Plot training history for Neural Network
# Two side-by-side panels: accuracy on the left, loss on the right, each with
# the training curve and the validation curve per epoch.
import matplotlib.pyplot as plt

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (ax_accuracy, 'accuracy', 'Accuracy'),
    (ax_loss, 'loss', 'Loss'),
)
for ax, metric, label in panels:
    # Keras stores the validation series under the 'val_' prefixed key.
    ax.plot(history.history[metric], label=f'Training {label}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    ax.set_title(f'Model {label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend()

fig.tight_layout()
plt.show()


## 7. Model Comparison Summary


In [None]:
# Compare all models: choose the winner on the validation split, then report
# its accuracy on the test split. Picking the "best" model by test accuracy
# would turn the test set into a selection set and bias the reported score.
print("=" * 60)
print("MODEL COMPARISON - TEST SET PERFORMANCE")
print("=" * 60)

# Test-set predictions per model.
models = {
    'SVM': y_test_pred_svm,
    'Random Forest': y_test_pred_rf,
    'Neural Network': y_test_pred_nn
}

# Validation-set predictions per model, used only for model selection.
val_predictions = {
    'SVM': y_val_pred_svm,
    'Random Forest': y_val_pred_rf,
    'Neural Network': y_val_pred_nn
}

comparison_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Validation Accuracy': [accuracy_score(y_val, val_predictions[name]) for name in models],
    'Test Accuracy': [accuracy_score(y_test, pred) for pred in models.values()]
})

print("\nAccuracy Comparison (validation and test):")
print(comparison_df.to_string(index=False))

# idxmax returns the first row with the highest validation accuracy, so ties
# resolve to the model listed first.
best_row = comparison_df.loc[comparison_df['Validation Accuracy'].idxmax()]

print("\n" + "=" * 60)
print("Best Model (selected on validation accuracy):", best_row['Model'])
print("Validation Accuracy:", f"{best_row['Validation Accuracy']:.4f}")
print("Test Accuracy:", f"{best_row['Test Accuracy']:.4f}")
print("=" * 60)
