In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, top_k_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('new_data.csv')
print(df.head())
print(df.columns)

In [None]:
print(df.shape)

In [None]:
print(df.dtypes)

In [None]:
print(df.isnull().sum().sum())

In [None]:
df['Courses'].unique()

In [None]:
# Count values in the 'Courses' column
course_counts = df['Courses'].value_counts()

# Print counts and percentages
print("Course distribution:\n")
print(course_counts)
print("\nPercentage distribution:\n")
print(course_counts / len(df) * 100)

# Visualize the distribution
plt.figure(figsize=(12, 6))
course_counts.plot(kind='bar')
plt.title('Distribution of Course Labels')
plt.xlabel('Course')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [35]:
# Separate features and target
X = df.drop(columns=['Courses'])
y = df['Courses']

# Encode target labels (e.g., strings to integers)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Random Forest

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Initialize and train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# feature importance
importances = pd.Series(clf.feature_importances_, index=X.columns)
importances.sort_values().tail(10).plot(kind='barh', title='Top 10 Important Features')
plt.show()


# XGBoost

In [None]:
import xgboost as xgb

# Train XGBoost
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Predict and report
y_pred = xgb_model.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


# Tensorflow neural networks

In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [44]:
# One-hot encode target
y_categorical = to_categorical(y_encoded)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

# Build model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(le.classes_), activation='softmax')
])

# Compile
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.1)

# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Neural Net Test Accuracy: {test_acc:.4f}")

# Predict & Report
y_pred_nn = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)

print("Neural Net Classification Report:")
print(classification_report(y_true, y_pred_nn, target_names=le.classes_))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.2762 - loss: 3.1335 - val_accuracy: 0.9541 - val_loss: 0.8919
Epoch 2/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8659 - loss: 0.8229 - val_accuracy: 0.9965 - val_loss: 0.1401
Epoch 3/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9391 - loss: 0.2850 - val_accuracy: 0.9965 - val_loss: 0.0568
Epoch 4/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9689 - loss: 0.1667 - val_accuracy: 0.9965 - val_loss: 0.0313
Epoch 5/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9780 - loss: 0.1024 - val_accuracy: 0.9965 - val_loss: 0.0254
Epoch 6/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9858 - loss: 0.0731 - val_accuracy: 0.9965 - val_loss: 0.0184
Epoch 7/20
[1m160/160[0m [32m━━━━━━━

In [45]:
# Get predicted probabilities
y_pred_probs = model.predict(X_test)

# Get Top-3 Accuracy
y_true = np.argmax(y_test, axis=1)
top3_acc = top_k_accuracy_score(y_true, y_pred_probs, k=3)
print(f"Neural Net Top-3 Accuracy: {top3_acc:.4f}")

# Get predicted classes (for classification report)
y_pred_nn = np.argmax(y_pred_probs, axis=1)

print("Neural Net Classification Report:")
print(classification_report(y_true, y_pred_nn, target_names=le.classes_))

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Neural Net Top-3 Accuracy: 1.0000
Neural Net Classification Report:
                                                     precision    recall  f1-score   support

                 Animation, Graphics and Multimedia       1.00      1.00      1.00        20
                   B.Arch- Bachelor of Architecture       1.00      1.00      1.00        20
                        B.Com- Bachelor of Commerce       1.00      0.90      0.95        20
                                              B.Ed.       1.00      1.00      1.00        20
                              B.Sc- Applied Geology       1.00      1.00      1.00        20
                                      B.Sc- Nursing       1.00      1.00      1.00        20
                                    B.Sc. Chemistry       1.00      1.00      1.00        21
                                  B.Sc. Mathematics       1.00      1.00      1.00        21
                    