In [1]:
# Import necessary libraries for data handling and modeling
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Read the student records CSV into a DataFrame
file_path = 'data_program_recommendation - Copy (tryy1).csv'  # Change this if your file is elsewhere
df = pd.read_csv(file_path)

# Preview the first five rows to see the column layout
print("Dataset preview:", df.head(), sep="\n")

# Summarise column dtypes and non-null counts (reveals missing values)
print("\nDataset info:")
df.info()



Dataset preview:
  student id  dost_exam_result  filipino grade  English grade  \
0  student 1                 0              89             90   
1  student 2                 0              89             79   
2  student 3                 0              84             90   
3  student 4                 0              87             90   
4  student 5                 0              82             90   

   mathematics grade  science grade  araling panlipunan grade  \
0                 78             86                        87   
1                 89             81                        79   
2                 79             90                        88   
3                 79             87                        79   
4                 82             75                        80   

   Edukasyon sa pagpapakatao grade  \
0                               83   
1                               82   
2                               90   
3                               83   
4          

In [2]:
# Columns fed to the model: the exam result, each subject grade, and the average grade
feature_columns = [
    'dost_exam_result',
    'filipino grade',
    'English grade',
    'mathematics grade',
    'science grade',
    'araling panlipunan grade',
    'Edukasyon sa pagpapakatao grade',
    'Edukasyong panglipunan at pangkabuhayan grade',
    'MAPEH grade',
    'Average grade',
]

# Keep every row, restricted to the feature columns
X = df.loc[:, feature_columns]

# Count the missing entries per feature column
print("\nMissing values in features:")
print(X.isna().sum())



Missing values in features:
dost_exam_result                                 0
filipino grade                                   0
English grade                                    0
mathematics grade                                0
science grade                                    0
araling panlipunan grade                         0
Edukasyon sa pagpapakatao grade                  0
Edukasyong panglipunan at pangkabuhayan grade    0
MAPEH grade                                      0
Average grade                                    0
dtype: int64


In [3]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# Isolation Forest is unsupervised, so only the feature matrix X is split.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")



Training set size: 544 samples
Testing set size: 136 samples


In [4]:
# The 80/20 split was already performed in the preceding cell; repeating the
# identical train_test_split call here would only recompute the same result.
# Instead, verify that the existing split is sound before training on it.

# Every row of X must land in exactly one of the two subsets
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"
# No row label may appear in both subsets (would leak test rows into training)
assert X_train.index.intersection(X_test.index).empty, "train and test sets overlap"

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")



Training set size: 544 samples
Testing set size: 136 samples


In [5]:
# Build the Isolation Forest and fit it on the training features in one step.
# Tunable parameters: n_estimators is the number of trees, contamination is
# the expected fraction of anomalies.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
).fit(X_train)

print("\nIsolation Forest model trained successfully.")



Isolation Forest model trained successfully.


In [6]:
# Score the held-out rows; predict() yields 1 for normal points and -1 for anomalies
test_predictions = iso_forest.predict(X_test)

# Recode to a 0/1 flag where 1 marks an anomaly and 0 a normal point
test_predictions_binary = (test_predictions == -1).astype(int)

print("\nSample predictions on test data (1=anomaly, 0=normal):")
print(test_predictions_binary[:20])



Sample predictions on test data (1=anomaly, 0=normal):
[0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0]


In [7]:
# Evaluate against ground-truth labels only when the dataset provides them
if 'anomaly_label' in df.columns:
    y = df['anomaly_label']

    # Select the labels by the test rows' index rather than re-running
    # train_test_split on y: the lookup stays aligned with X_test (and so with
    # test_predictions_binary) even if the split's test_size or random_state
    # is changed later, and it avoids assigning to `_`, which IPython uses
    # for the last cell output.
    # NOTE(review): assumes 'anomaly_label' is coded 1=anomaly, 0=normal, the
    # same convention as test_predictions_binary — confirm against the data.
    y_test = y.loc[X_test.index]

    print("\nClassification Report on Test Data:")
    print(classification_report(y_test, test_predictions_binary))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, test_predictions_binary))
else:
    print("\nNo ground truth anomaly labels found in dataset. Skipping evaluation.")



No ground truth anomaly labels found in dataset. Skipping evaluation.


In [8]:
# Attach the 0/1 anomaly flag to a fresh copy of the test features for inspection
X_test_with_pred = X_test.assign(anomaly=test_predictions_binary)

# List only the rows the model flagged as anomalous
print("\nAnomalies detected in test set:")
print(X_test_with_pred.loc[X_test_with_pred['anomaly'].eq(1)])



Anomalies detected in test set:
     dost_exam_result  filipino grade  English grade  mathematics grade  \
101                 1              89             75                 85   
208                 0              88             75                 76   
55                  0              88             87                 87   
76                  0              83             88                 81   
90                  0              84             86                 75   
164                 1              88             85                 96   
109                 1              90             75                 82   
65                  0              82             89                 82   
145                 1              97             88                 78   
108                 1              82             93                 97   
133                 1              99             95                 89   
148                 1              84             96               