In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define the file path
file_path = "/kaggle/input/undergrad/Undergraduate.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Artificial Neural Network (ANN) Classifier
ann_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
ann_classifier.fit(X_train_scaled, y_train)
ann_predictions = ann_classifier.predict(X_test_scaled)

# Machine Learning Classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

for name, classifier in classifiers.items():
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_test_scaled)
    
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, target_names=le.classes_)  # Set target names using label encoder classes_
    
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Classification Report:\n{report}\n{'='*40}")

    



Classifier: Random Forest
Accuracy: 0.88
Classification Report:
                    precision    recall  f1-score   support

Private University       0.85      0.94      0.89        49
 Public University       0.92      0.81      0.86        42

          accuracy                           0.88        91
         macro avg       0.89      0.87      0.88        91
      weighted avg       0.88      0.88      0.88        91

Classifier: SVM
Accuracy: 0.75
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.73      0.76        49
 Public University       0.71      0.76      0.74        42

          accuracy                           0.75        91
         macro avg       0.75      0.75      0.75        91
      weighted avg       0.75      0.75      0.75        91



In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Map numerical labels back to class names
class_names = le.classes_

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for RandomForestClassifier
param_grid_rf = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

# Create a pipeline for preprocessing and RandomForestClassifier
pipeline_rf = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Get the best parameters
best_params_rf = grid_search_rf.best_params_
print(f"Best Parameters for RandomForestClassifier: {best_params_rf}")

# Evaluate the RandomForestClassifier with the best parameters
best_rf_classifier = grid_search_rf.best_estimator_
best_rf_predictions = best_rf_classifier.predict(X_test)
best_rf_accuracy = accuracy_score(y_test, best_rf_predictions)
best_rf_report = classification_report(y_test, best_rf_predictions, target_names=class_names)

print("Random Forest Classifier:")
print(f"Accuracy: {best_rf_accuracy:.2f}")
print(f"Classification Report:\n{best_rf_report}\n{'='*40}")



Best Parameters for RandomForestClassifier: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 100}
Random Forest Classifier:
Accuracy: 0.90
Classification Report:
                    precision    recall  f1-score   support

Private University       0.88      0.94      0.91        49
 Public University       0.92      0.86      0.89        42

          accuracy                           0.90        91
         macro avg       0.90      0.90      0.90        91
      weighted avg       0.90      0.90      0.90        91



In [14]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Print the total number of samples for each class
class_counts = balanced_df['Name of your current institution? '].value_counts()
print("Total number of samples after downsampling:")
for class_name, count in class_counts.items():
    print(f"{class_name}: {count}")



Total number of samples after downsampling:
Public University: 226
Private University: 226


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# Hyperparameter tuning for SVM
param_grid_svm = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto']
}

# Hyperparameter tuning for MLPClassifier
param_grid_mlp = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100, 50), (100, 50, 20)],
    'mlpclassifier__max_iter': [200, 500, 1000],
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01]
}

# Hyperparameter tuning for DecisionTreeClassifier
param_grid_dt = {
    'decisiontreeclassifier__max_depth': [None, 5, 10, 20],
    'decisiontreeclassifier__min_samples_split': [2, 5, 10],
    'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning for LogisticRegression
param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__max_iter': [50, 100, 200]
}

# Hyperparameter tuning for GaussianNB (No hyperparameters to tune)

# Hyperparameter tuning for KNeighborsClassifier
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

# Hyperparameter tuning for AdaBoostClassifier
param_grid_adaboost = {
    'adaboostclassifier__n_estimators': [50, 100, 200],
    'adaboostclassifier__learning_rate': [0.1, 0.5, 1]
}

# Hyperparameter tuning for GradientBoostingClassifier
param_grid_gb = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
    'gradientboostingclassifier__max_depth': [3, 5, 10]
}

# Create pipelines for each classifier

pipeline_svm = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    SVC(random_state=42)
)

pipeline_mlp = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MLPClassifier(random_state=42)
)

pipeline_dt = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    DecisionTreeClassifier(random_state=42)
)

pipeline_lr = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LogisticRegression(random_state=42)
)

pipeline_nb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    GaussianNB()
)

pipeline_knn = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    KNeighborsClassifier()
)

pipeline_adaboost = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    AdaBoostClassifier(random_state=42)
)

pipeline_gb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    GradientBoostingClassifier(random_state=42)
)

# Create dictionaries for classifiers and their respective hyperparameter grids
classifiers = {
    'SVM': (pipeline_svm, param_grid_svm),
    'MLPClassifier': (pipeline_mlp, param_grid_mlp),
    'Decision Tree': (pipeline_dt, param_grid_dt),
    'Logistic Regression': (pipeline_lr, param_grid_lr),
    'Naive Bayes': (pipeline_nb, {}),
    'K-Nearest Neighbors': (pipeline_knn, param_grid_knn),
    'AdaBoost': (pipeline_adaboost, param_grid_adaboost),
    'Gradient Boosting': (pipeline_gb, param_grid_gb)
}

# Loop through classifiers
for name, (pipeline, param_grid) in classifiers.items():
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print(f"Best Parameters for {name}: {best_params}")

    # Evaluate the classifier with the best parameters
    best_classifier = grid_search.best_estimator_
    best_predictions = best_classifier.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_predictions)
    # Convert numeric labels back to class names
    class_names = le.classes_
    y_test_names = le.inverse_transform(y_test)
    best_predictions_names = le.inverse_transform(best_predictions)

    best_report = classification_report(y_test_names, best_predictions_names, target_names=class_names)

    print(f"{name} Classifier:")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {best_accuracy:.2f}")
    print(f"Classification Report:\n{best_report}\n{'='*40}")


Best Parameters for SVM: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
SVM Classifier:
Best Parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy: 0.81
Classification Report:
                    precision    recall  f1-score   support

Private University       0.86      0.78      0.82        49
 Public University       0.77      0.86      0.81        42

          accuracy                           0.81        91
         macro avg       0.81      0.82      0.81        91
      weighted avg       0.82      0.81      0.81        91





Best Parameters for MLPClassifier: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
MLPClassifier Classifier:
Best Parameters: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
Accuracy: 0.80
Classification Report:
                    precision    recall  f1-score   support

Private University       0.79      0.86      0.82        49
 Public University       0.82      0.74      0.78        42

          accuracy                           0.80        91
         macro avg       0.80      0.80      0.80        91
      weighted avg       0.80      0.80      0.80        91

Best Parameters for Decision Tree: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 'decisiontreeclassifier__min_samples_split': 10}
Decision Tree Classifier:
Best Parameters: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 

In [17]:

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

# ... (previous code remains unchanged)

# Hyperparameter tuning for XGBClassifier
param_grid_xgb = {
    'xgbclassifier__n_estimators': [50, 100, 200],
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.5],
    'xgbclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for LGBMClassifier
param_grid_lgbm = {
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__learning_rate': [0.01, 0.1, 0.5],
    'lgbmclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for CatBoostClassifier
param_grid_catboost = {
    'catboostclassifier__n_estimators': [50, 100, 200],
    'catboostclassifier__learning_rate': [0.01, 0.1, 0.5],
    'catboostclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for ExtraTreesClassifier
param_grid_extra_trees = {
    'extratreesclassifier__n_estimators': [50, 100, 200],
    'extratreesclassifier__max_depth': [None, 5, 10, 20],
    'extratreesclassifier__min_samples_split': [2, 5, 10],
    'extratreesclassifier__min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning for BaggingClassifier
param_grid_bagging = {
    'baggingclassifier__n_estimators': [50, 100, 200],
    'baggingclassifier__max_samples': [0.5, 0.7, 1.0],
    'baggingclassifier__max_features': [0.5, 0.7, 1.0]
}

# Create pipelines for each classifier

pipeline_xgb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBClassifier(random_state=42)
)

pipeline_lgbm = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LGBMClassifier(random_state=42)
)

pipeline_catboost = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    CatBoostClassifier(random_state=42)
)

pipeline_extra_trees = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    ExtraTreesClassifier(random_state=42)
)

pipeline_bagging = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    BaggingClassifier(random_state=42)
)

# Create dictionaries for additional classifiers and their respective hyperparameter grids
additional_classifiers = {
    'XGBClassifier': (pipeline_xgb, param_grid_xgb),
    'LGBMClassifier': (pipeline_lgbm, param_grid_lgbm),
    'CatBoostClassifier': (pipeline_catboost, param_grid_catboost),
    'ExtraTreesClassifier': (pipeline_extra_trees, param_grid_extra_trees),
    'BaggingClassifier': (pipeline_bagging, param_grid_bagging)
}

# Combine the classifiers
all_classifiers = {**classifiers, **additional_classifiers}

# Loop through all classifiers
for name, (pipeline, param_grid) in all_classifiers.items():
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print(f"Best Parameters for {name}: {best_params}")

    # Evaluate the classifier with the best parameters
    best_classifier = grid_search.best_estimator_
    best_predictions = best_classifier.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_predictions)
    # Convert numeric labels back to class names
    class_names = le.classes_
    y_test_names = le.inverse_transform(y_test)
    best_predictions_names = le.inverse_transform(best_predictions)

    best_report = classification_report(y_test_names, best_predictions_names, target_names=class_names)

    print(f"{name} Classifier:")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {best_accuracy:.2f}")
    print(f"Classification Report:\n{best_report}\n{'='*40}")


Best Parameters for SVM: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
SVM Classifier:
Best Parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy: 0.81
Classification Report:
                    precision    recall  f1-score   support

Private University       0.86      0.78      0.82        49
 Public University       0.77      0.86      0.81        42

          accuracy                           0.81        91
         macro avg       0.81      0.82      0.81        91
      weighted avg       0.82      0.81      0.81        91





Best Parameters for MLPClassifier: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
MLPClassifier Classifier:
Best Parameters: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
Accuracy: 0.80
Classification Report:
                    precision    recall  f1-score   support

Private University       0.79      0.86      0.82        49
 Public University       0.82      0.74      0.78        42

          accuracy                           0.80        91
         macro avg       0.80      0.80      0.80        91
      weighted avg       0.80      0.80      0.80        91

Best Parameters for Decision Tree: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 'decisiontreeclassifier__min_samples_split': 10}
Decision Tree Classifier:
Best Parameters: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for MLPClassifier
param_grid_mlp = {
    'mlpclassifier__hidden_layer_sizes': [(128,), (64,), (32,)],
    'mlpclassifier__activation': ['relu'],
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01],
    'mlpclassifier__batch_size': [32, 64],
    'mlpclassifier__max_iter': [500],
    'mlpclassifier__early_stopping': [True],
    'mlpclassifier__n_iter_no_change': [10],
}

# Create a pipeline for preprocessing and MLPClassifier
pipeline_mlp = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MLPClassifier(random_state=42)
)

grid_search_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp, cv=5, scoring='accuracy')
grid_search_mlp.fit(X_train, y_train)

# Get the best parameters
best_params_mlp = grid_search_mlp.best_params_
print(f"Best Parameters for MLPClassifier: {best_params_mlp}")

# Evaluate the MLPClassifier with the best parameters
best_mlp_classifier = grid_search_mlp.best_estimator_
best_mlp_predictions = best_mlp_classifier.predict(X_test)
best_mlp_accuracy = accuracy_score(y_test, best_mlp_predictions)
# Convert numeric labels back to class names
class_names = le.classes_
y_test_names = le.inverse_transform(y_test)
best_mlp_predictions_names = le.inverse_transform(best_mlp_predictions)
best_mlp_report = classification_report(y_test_names, best_mlp_predictions_names, target_names=class_names)

print("MLP Classifier:")
print(f"Best Parameters: {best_params_mlp}")
print(f"Accuracy: {best_mlp_accuracy:.2f}")
print(f"Classification Report:\n{best_mlp_report}\n{'='*40}")


Best Parameters for MLPClassifier: {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.001, 'mlpclassifier__batch_size': 32, 'mlpclassifier__early_stopping': True, 'mlpclassifier__hidden_layer_sizes': (128,), 'mlpclassifier__max_iter': 500, 'mlpclassifier__n_iter_no_change': 10}
MLP Classifier:
Best Parameters: {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.001, 'mlpclassifier__batch_size': 32, 'mlpclassifier__early_stopping': True, 'mlpclassifier__hidden_layer_sizes': (128,), 'mlpclassifier__max_iter': 500, 'mlpclassifier__n_iter_no_change': 10}
Accuracy: 0.77
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.80      0.79        49
 Public University       0.76      0.74      0.75        42

          accuracy                           0.77        91
         macro avg       0.77      0.77      0.77        91
      weighted avg       0.77      0.77      0.77        91



In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a more complex ANN model with Dropout and BatchNormalization
ann_model = Sequential()
ann_model.add(Dense(units=128, activation='relu', input_dim=X_train_scaled.shape[1]))
ann_model.add(Dropout(0.5))  # Increase dropout rate
ann_model.add(Dense(units=64, activation='relu'))
ann_model.add(BatchNormalization())
ann_model.add(Dense(units=32, activation='relu'))
ann_model.add(Dropout(0.3))  # Add another dropout layer
ann_model.add(Dense(units=1, activation='sigmoid'))

# Compile the model with a lower learning rate
ann_model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN model and get history
history = ann_model.fit(X_train_scaled, y_train, epochs=70, batch_size=16, validation_data=(X_test_scaled, y_test), verbose=1)

# Predictions using the ANN
y_pred_ann = (ann_model.predict(X_test_scaled) > 0.5).astype("int32").flatten()

# Evaluate the ANN
accuracy = accuracy_score(y_test, y_pred_ann)
print('For ANN, the accuracy is= ', accuracy)


Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
For ANN, the accuracy is=  0.7802197802197802


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a more complex ANN model with Dropout and BatchNormalization
ann_model = Sequential()
ann_model.add(Dense(units=512, activation='relu', input_dim=X_train_scaled.shape[1]))
ann_model.add(BatchNormalization())
ann_model.add(Dropout(0.5))
ann_model.add(Dense(units=256, activation='relu'))
ann_model.add(BatchNormalization())
ann_model.add(Dropout(0.4))
ann_model.add(Dense(units=128, activation='relu'))
ann_model.add(Dropout(0.3))
ann_model.add(Dense(units=64, activation='relu'))
ann_model.add(Dropout(0.2))
ann_model.add(Dense(units=1, activation='sigmoid'))

# Implement learning rate scheduling
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0001,
    decay_steps=1000,
    decay_rate=0.9
)
ann_model.compile(optimizer=Adam(learning_rate=lr_schedule), loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN model and get history
history = ann_model.fit(X_train_scaled, y_train, epochs=70, batch_size=16, validation_data=(X_test_scaled, y_test))

# Predictions using the ANN
y_pred_ann = (ann_model.predict(X_test_scaled) > 0.5).astype("int32").flatten()

# Evaluate the ANN
print('For ANN:')
print('Accuracy:', accuracy_score(y_test, y_pred_ann))
print('Classification Report:')
print(classification_report(y_test, y_pred_ann))


Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
For ANN:
Accuracy: 0.7802197802197802
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        49

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Define the file path
file_path = "/kaggle/input/undergrad/Undergraduate.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Convert the target column to numeric, handling errors by setting them to NaN
balanced_df['What was your SSC GPA?'] = pd.to_numeric(balanced_df['What was your SSC GPA?'], errors='coerce')

# Drop rows with NaN values in the target column
balanced_df = balanced_df.dropna(subset=['What was your SSC GPA?'])

# Separate features (X) and target variable (y)
target_column = 'What was your SSC GPA?'
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features (optional but recommended for linear regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Initialize and train the decision tree regressor model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train_scaled, y_train)

# Initialize and train the random forest regressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)

# Initialize and train the gradient boosting regressor model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train_scaled, y_train)

# Initialize and train the support vector regressor model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)

# Initialize and train the k-nearest neighbors regressor model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train_scaled, y_train)

# Initialize and train the lasso regression model
lasso_model = Lasso()
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set for each model
linear_pred = linear_model.predict(X_test_scaled)
dt_pred = dt_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)
gb_pred = gb_model.predict(X_test_scaled)
svr_pred = svr_model.predict(X_test_scaled)
knn_pred = knn_model.predict(X_test_scaled)
lasso_pred = lasso_model.predict(X_test_scaled)

# Evaluate each model
linear_mse = mean_squared_error(y_test, linear_pred)
linear_r2 = r2_score(y_test, linear_pred)

dt_mse = mean_squared_error(y_test, dt_pred)
dt_r2 = r2_score(y_test, dt_pred)

rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)

svr_mse = mean_squared_error(y_test, svr_pred)
svr_r2 = r2_score(y_test, svr_pred)

knn_mse = mean_squared_error(y_test, knn_pred)
knn_r2 = r2_score(y_test, knn_pred)

lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

# Print the results
print(f'Linear Regression - Mean Squared Error: {linear_mse}, R-squared: {linear_r2}')
print(f'Decision Tree Regressor - Mean Squared Error: {dt_mse}, R-squared: {dt_r2}')
print(f'Random Forest Regressor - Mean Squared Error: {rf_mse}, R-squared: {rf_r2}')
print(f'Gradient Boosting Regressor - Mean Squared Error: {gb_mse}, R-squared: {gb_r2}')
print(f'Support Vector Regressor - Mean Squared Error: {svr_mse}, R-squared: {svr_r2}')
print(f'K-Nearest Neighbors Regressor - Mean Squared Error: {knn_mse}, R-squared: {knn_r2}')
print(f'Lasso Regression - Mean Squared Error: {lasso_mse}, R-squared: {lasso_r2}')


Linear Regression - Mean Squared Error: 0.056248060161887964, R-squared: -0.08982924603798303
Decision Tree Regressor - Mean Squared Error: 0.04907582417582417, R-squared: 0.04913576919279339
Random Forest Regressor - Mean Squared Error: 0.03486187879120865, R-squared: 0.3245367934620125
Gradient Boosting Regressor - Mean Squared Error: 0.028810718909370165, R-squared: 0.4417804991595775
Support Vector Regressor - Mean Squared Error: 0.033990545675827044, R-squared: 0.34141923010873765
K-Nearest Neighbors Regressor - Mean Squared Error: 0.0535215824175824, R-squared: -0.03700262097991214
Lasso Regression - Mean Squared Error: 0.051768541878544856, R-squared: -0.003036741206731808


In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Downsample both Public University and Private University to 226 instances each
public_university = df[df['Name of your current institution? '] == 'Public University'].sample(226, random_state=42)
private_university = df[df['Name of your current institution? '] == 'Private University'].sample(226, random_state=42, replace=True)
balanced_df = pd.concat([public_university, private_university])

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)  # Assuming y contains class labels 0 and 1

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Data preprocessing
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# ELM implementation
class ELMClassifierManual:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.input_weights = np.random.rand(input_size, hidden_size)
        self.bias = np.random.rand(hidden_size)
        self.output_weights = None

    def train(self, X, y):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        self.output_weights = np.linalg.pinv(hidden_layer_output).dot(y)

    def predict(self, X):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        predictions = hidden_layer_output.dot(self.output_weights)
        return np.round(predictions).astype(int)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

# Initialize ELMClassifier
elm_classifier = ELMClassifierManual(input_size=X_train_scaled.shape[1], hidden_size=100)

# Train ELMClassifier
elm_classifier.train(X_train_scaled, y_train)

# Predictions
elm_predictions = elm_classifier.predict(X_test_scaled)

# Evaluate ELMClassifier
elm_accuracy = accuracy_score(y_test, elm_predictions)
elm_report = classification_report(y_test, elm_predictions)

print("ELM Classifier (Manual Implementation):")
print(f"Accuracy: {elm_accuracy:.2f}")
print(f"Classification Report:\n{elm_report}\n{'='*40}")


ELM Classifier (Manual Implementation):
Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78        49
           1       0.76      0.83      0.80        42
           2       0.00      0.00      0.00         0

    accuracy                           0.78        91
   macro avg       0.53      0.52      0.53        91
weighted avg       0.80      0.78      0.79        91



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
from sklearn.ensemble import VotingClassifier

# Define the best classifiers based on the results of hyperparameter tuning
best_svm = classifiers['SVM'][0]
best_svm_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('svc__')]
for key in best_svm_keys:
    setattr(best_svm.named_steps['svc'], key, grid_search.best_params_['svc__' + key])

best_mlp = classifiers['MLPClassifier'][0]
best_mlp_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('mlpclassifier__')]
for key in best_mlp_keys:
    setattr(best_mlp.named_steps['mlpclassifier'], key, grid_search.best_params_['mlpclassifier__' + key])

best_dt = classifiers['Decision Tree'][0]
best_dt_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('decisiontreeclassifier__')]
for key in best_dt_keys:
    setattr(best_dt.named_steps['decisiontreeclassifier'], key, grid_search.best_params_['decisiontreeclassifier__' + key])

best_lr = classifiers['Logistic Regression'][0]
best_lr_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('logisticregression__')]
for key in best_lr_keys:
    setattr(best_lr.named_steps['logisticregression'], key, grid_search.best_params_['logisticregression__' + key])

best_knn = classifiers['K-Nearest Neighbors'][0]
best_knn_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('kneighborsclassifier__')]
for key in best_knn_keys:
    setattr(best_knn.named_steps['kneighborsclassifier'], key, grid_search.best_params_['kneighborsclassifier__' + key])

best_adaboost = classifiers['AdaBoost'][0]
best_adaboost_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('adaboostclassifier__')]
for key in best_adaboost_keys:
    setattr(best_adaboost.named_steps['adaboostclassifier'], key, grid_search.best_params_['adaboostclassifier__' + key])

best_gb = classifiers['Gradient Boosting'][0]
best_gb_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('gradientboostingclassifier__')]
for key in best_gb_keys:
    setattr(best_gb.named_steps['gradientboostingclassifier'], key, grid_search.best_params_['gradientboostingclassifier__' + key])

# Create an ensemble classifier using a VotingClassifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('SVM', best_svm),
        ('MLPClassifier', best_mlp),
        ('Decision Tree', best_dt),
        ('Logistic Regression', best_lr),
        ('K-Nearest Neighbors', best_knn),
        ('AdaBoost', best_adaboost),
        ('Gradient Boosting', best_gb)
    ],
    voting='hard'  # You can change to 'soft' if classifiers provide probability estimates
)

# Train the ensemble classifier on the training data
ensemble_classifier.fit(X_train, y_train)


# Evaluate the ensemble classifier on the test data
ensemble_predictions = ensemble_classifier.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)

# Convert numeric labels back to class names
ensemble_predictions_names = le.inverse_transform(ensemble_predictions)
ensemble_report = classification_report(y_test_names, ensemble_predictions_names, target_names=class_names)

print("Ensemble Classifier:")
print(f"Accuracy: {ensemble_accuracy:.2f}")
print(f"Classification Report:\n{ensemble_report}")





Ensemble Classifier:
Accuracy: 0.80
Classification Report:
                    precision    recall  f1-score   support

Private University       0.82      0.82      0.82        49
 Public University       0.79      0.79      0.79        42

          accuracy                           0.80        91
         macro avg       0.80      0.80      0.80        91
      weighted avg       0.80      0.80      0.80        91

