In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Assuming the last column is the target variable
target_column = df.columns[-1]

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)  # Assuming y contains class labels 0 and 1

# Exclude instances where the true class is -1
valid_indices = (y != -1)
X_encoded = X_encoded[valid_indices]
y = y[valid_indices]

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Data preprocessing
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# ELM implementation
class ELMClassifierManual:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.input_weights = np.random.rand(input_size, hidden_size)
        self.bias = np.random.rand(hidden_size)
        self.output_weights = None

    def train(self, X, y):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        self.output_weights = np.linalg.pinv(hidden_layer_output).dot(y)

    def predict(self, X):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        predictions = hidden_layer_output.dot(self.output_weights)
        return np.round(predictions).astype(int)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

# Initialize ELMClassifier
elm_classifier = ELMClassifierManual(input_size=X_train_scaled.shape[1], hidden_size=100)

# Train ELMClassifier
elm_classifier.train(X_train_scaled, y_train)

# Predictions
elm_predictions = elm_classifier.predict(X_test_scaled)

# Exclude instances where the true class is -1 during evaluation
valid_test_indices = (y_test != -1)
y_test_valid = y_test[valid_test_indices]
elm_predictions_valid = elm_predictions[valid_test_indices]

# Evaluate ELMClassifier
elm_accuracy = accuracy_score(y_test_valid, elm_predictions_valid)
elm_report = classification_report(y_test_valid, elm_predictions_valid)

print("ELM Classifier (Manual Implementation):")
print(f"Accuracy: {elm_accuracy:.2f}")
print(f"Classification Report:\n{elm_report}\n{'='*40}")


ELM Classifier (Manual Implementation):
Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.69      0.73        39
           1       0.86      0.90      0.88        80

    accuracy                           0.83       119
   macro avg       0.81      0.80      0.80       119
weighted avg       0.83      0.83      0.83       119



In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Assuming the last column is the target variable
target_column = df.columns[-1]

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# Hyperparameter tuning for SVM
param_grid_svm = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto']
}

# Hyperparameter tuning for MLPClassifier
param_grid_mlp = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100, 50), (100, 50, 20)],
    'mlpclassifier__max_iter': [200, 500, 1000],
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01]
}

# Hyperparameter tuning for DecisionTreeClassifier
param_grid_dt = {
    'decisiontreeclassifier__max_depth': [None, 5, 10, 20],
    'decisiontreeclassifier__min_samples_split': [2, 5, 10],
    'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning for LogisticRegression
param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__max_iter': [50, 100, 200]
}

# Hyperparameter tuning for GaussianNB (No hyperparameters to tune)

# Hyperparameter tuning for KNeighborsClassifier
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

# Hyperparameter tuning for AdaBoostClassifier
param_grid_adaboost = {
    'adaboostclassifier__n_estimators': [50, 100, 200],
    'adaboostclassifier__learning_rate': [0.1, 0.5, 1]
}

# Hyperparameter tuning for GradientBoostingClassifier
param_grid_gb = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
    'gradientboostingclassifier__max_depth': [3, 5, 10]
}

# Create pipelines for each classifier

pipeline_svm = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    SVC(random_state=42)
)

pipeline_mlp = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MLPClassifier(random_state=42)
)

pipeline_dt = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    DecisionTreeClassifier(random_state=42)
)

pipeline_lr = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LogisticRegression(random_state=42)
)

pipeline_nb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    GaussianNB()
)

pipeline_knn = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    KNeighborsClassifier()
)

pipeline_adaboost = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    AdaBoostClassifier(random_state=42)
)

pipeline_gb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    GradientBoostingClassifier(random_state=42)
)

# Create dictionaries for classifiers and their respective hyperparameter grids
classifiers = {
    'SVM': (pipeline_svm, param_grid_svm),
    'MLPClassifier': (pipeline_mlp, param_grid_mlp),
    'Decision Tree': (pipeline_dt, param_grid_dt),
    'Logistic Regression': (pipeline_lr, param_grid_lr),
    'Naive Bayes': (pipeline_nb, {}),
    'K-Nearest Neighbors': (pipeline_knn, param_grid_knn),
    'AdaBoost': (pipeline_adaboost, param_grid_adaboost),
    'Gradient Boosting': (pipeline_gb, param_grid_gb)
}

# Loop through classifiers
for name, (pipeline, param_grid) in classifiers.items():
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print(f"Best Parameters for {name}: {best_params}")

    # Evaluate the classifier with the best parameters
    best_classifier = grid_search.best_estimator_
    best_predictions = best_classifier.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_predictions)
    # Convert numeric labels back to class names
    class_names = le.classes_
    y_test_names = le.inverse_transform(y_test)
    best_predictions_names = le.inverse_transform(best_predictions)

    best_report = classification_report(y_test_names, best_predictions_names, target_names=class_names)

    print(f"{name} Classifier:")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {best_accuracy:.2f}")
    print(f"Classification Report:\n{best_report}\n{'='*40}")

Best Parameters for SVM: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
SVM Classifier:
Best Parameters: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy: 0.84
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.72      0.75        39
 Public University       0.87      0.90      0.88        80

          accuracy                           0.84       119
         macro avg       0.82      0.81      0.82       119
      weighted avg       0.84      0.84      0.84       119





Best Parameters for MLPClassifier: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
MLPClassifier Classifier:
Best Parameters: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__max_iter': 200}
Accuracy: 0.82
Classification Report:
                    precision    recall  f1-score   support

Private University       0.70      0.77      0.73        39
 Public University       0.88      0.84      0.86        80

          accuracy                           0.82       119
         macro avg       0.79      0.80      0.80       119
      weighted avg       0.82      0.82      0.82       119

Best Parameters for Decision Tree: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 'decisiontreeclassifier__min_samples_split': 10}
Decision Tree Classifier:
Best Parameters: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 

In [58]:
from sklearn.ensemble import VotingClassifier

# Define the best classifiers based on the results of hyperparameter tuning
best_svm = classifiers['SVM'][0]
best_svm_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('svc__')]
for key in best_svm_keys:
    setattr(best_svm.named_steps['svc'], key, grid_search.best_params_['svc__' + key])

best_mlp = classifiers['MLPClassifier'][0]
best_mlp_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('mlpclassifier__')]
for key in best_mlp_keys:
    setattr(best_mlp.named_steps['mlpclassifier'], key, grid_search.best_params_['mlpclassifier__' + key])

best_dt = classifiers['Decision Tree'][0]
best_dt_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('decisiontreeclassifier__')]
for key in best_dt_keys:
    setattr(best_dt.named_steps['decisiontreeclassifier'], key, grid_search.best_params_['decisiontreeclassifier__' + key])

best_lr = classifiers['Logistic Regression'][0]
best_lr_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('logisticregression__')]
for key in best_lr_keys:
    setattr(best_lr.named_steps['logisticregression'], key, grid_search.best_params_['logisticregression__' + key])

best_knn = classifiers['K-Nearest Neighbors'][0]
best_knn_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('kneighborsclassifier__')]
for key in best_knn_keys:
    setattr(best_knn.named_steps['kneighborsclassifier'], key, grid_search.best_params_['kneighborsclassifier__' + key])

best_adaboost = classifiers['AdaBoost'][0]
best_adaboost_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('adaboostclassifier__')]
for key in best_adaboost_keys:
    setattr(best_adaboost.named_steps['adaboostclassifier'], key, grid_search.best_params_['adaboostclassifier__' + key])

best_gb = classifiers['Gradient Boosting'][0]
best_gb_keys = [key.split('__')[1] for key in grid_search.best_params_.keys() if key.startswith('gradientboostingclassifier__')]
for key in best_gb_keys:
    setattr(best_gb.named_steps['gradientboostingclassifier'], key, grid_search.best_params_['gradientboostingclassifier__' + key])

# Create an ensemble classifier using a VotingClassifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('SVM', best_svm),
        ('MLPClassifier', best_mlp),
        ('Decision Tree', best_dt),
        ('Logistic Regression', best_lr),
        ('K-Nearest Neighbors', best_knn),
        ('AdaBoost', best_adaboost),
        ('Gradient Boosting', best_gb)
    ],
    voting='hard'  # You can change to 'soft' if classifiers provide probability estimates
)

# Train the ensemble classifier on the training data
ensemble_classifier.fit(X_train, y_train)


# Evaluate the ensemble classifier on the test data
ensemble_predictions = ensemble_classifier.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)

# Convert numeric labels back to class names
ensemble_predictions_names = le.inverse_transform(ensemble_predictions)
ensemble_report = classification_report(y_test_names, ensemble_predictions_names, target_names=class_names)

print("Ensemble Classifier:")
print(f"Accuracy: {ensemble_accuracy:.2f}")
print(f"Classification Report:\n{ensemble_report}")





Ensemble Classifier:
Accuracy: 0.83
Classification Report:
                    precision    recall  f1-score   support

Private University       0.74      0.74      0.74        39
 Public University       0.88      0.88      0.88        80

          accuracy                           0.83       119
         macro avg       0.81      0.81      0.81       119
      weighted avg       0.83      0.83      0.83       119

