In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Define the file path
file_path = "/kaggle/input/undergrad/Undergraduate.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Assuming the last column is the target variable
target_column = df.columns[-1]

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Print count of samples for each class before oversampling
print("Count of samples for each class before oversampling:")
print(df['Name of your current institution? '].value_counts())

# Separate features (X) and target variable (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Oversample Private University to match the number of instances of Public University
oversample_ratio = df[df['Name of your current institution? '] == 'Public University'].shape[0] // df[df['Name of your current institution? '] == 'Private University'].shape[0]
ros = RandomOverSampler(sampling_strategy=oversample_ratio, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_encoded, y)

# Print count of samples for each class after oversampling
print("\nCount of samples for each class after oversampling:")
print(pd.Series(le.inverse_transform(y_resampled)).value_counts())

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Artificial Neural Network (ANN) Classifier
ann_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
ann_classifier.fit(X_train_scaled, y_train)
ann_predictions = ann_classifier.predict(X_test_scaled)

# Machine Learning Classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

for name, classifier in classifiers.items():
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_test_scaled)
    
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, target_names=le.classes_)  # Set target names using label encoder classes_
    
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Classification Report:\n{report}\n{'='*40}")

    



Count of samples for each class before oversampling:
Public University     369
Private University    226
Name: Name of your current institution? , dtype: int64

Count of samples for each class after oversampling:
Public University     369
Private University    369
dtype: int64
Classifier: Random Forest
Accuracy: 0.77
Classification Report:
                    precision    recall  f1-score   support

Private University       0.75      0.77      0.76        69
 Public University       0.79      0.77      0.78        79

          accuracy                           0.77       148
         macro avg       0.77      0.77      0.77       148
      weighted avg       0.77      0.77      0.77       148

Classifier: SVM
Accuracy: 0.79
Classification Report:
                    precision    recall  f1-score   support

Private University       0.79      0.75      0.77        69
 Public University       0.79      0.82      0.81        79

          accuracy                           0.79       148

In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Remove leading and trailing spaces from the target column
target_column = 'Name of your current institution? '
df[target_column] = df[target_column].str.strip()

# Oversample the minority class to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Map numerical labels back to class names
class_names = le.classes_

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for RandomForestClassifier
param_grid_rf = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

# Create a pipeline for preprocessing and RandomForestClassifier
pipeline_rf = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Get the best parameters
best_params_rf = grid_search_rf.best_params_
print(f"Best Parameters for RandomForestClassifier: {best_params_rf}")

# Evaluate the RandomForestClassifier with the best parameters
best_rf_classifier = grid_search_rf.best_estimator_
best_rf_predictions = best_rf_classifier.predict(X_test)
best_rf_accuracy = accuracy_score(y_test, best_rf_predictions)
best_rf_report = classification_report(y_test, best_rf_predictions, target_names=class_names)

print("Random Forest Classifier:")
print(f"Accuracy: {best_rf_accuracy:.2f}")
print(f"Classification Report:\n{best_rf_report}\n{'='*40}")



Best Parameters for RandomForestClassifier: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 50}
Random Forest Classifier:
Accuracy: 0.79
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.77      0.77        69
 Public University       0.80      0.81      0.81        79

          accuracy                           0.79       148
         macro avg       0.79      0.79      0.79       148
      weighted avg       0.79      0.79      0.79       148



In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Assuming the last column is the target variable
target_column = 'Name of your current institution? '

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Oversample both Public University and Private University to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for classifiers
param_grids = {
    'SVM': {
        'svc__C': [0.1, 1, 10],
        'svc__kernel': ['linear', 'rbf', 'poly'],
        'svc__gamma': ['scale', 'auto']
    },
    'MLPClassifier': {
        'mlpclassifier__hidden_layer_sizes': [(50,), (100, 50), (100, 50, 20)],
        'mlpclassifier__max_iter': [200, 500, 1000],
        'mlpclassifier__alpha': [0.0001, 0.001, 0.01]
    },
    'Decision Tree': {
        'decisiontreeclassifier__max_depth': [None, 5, 10, 20],
        'decisiontreeclassifier__min_samples_split': [2, 5, 10],
        'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]
    },
    'Logistic Regression': {
        'logisticregression__C': [0.1, 1, 10],
        'logisticregression__max_iter': [50, 100, 200]
    },
    'GaussianNB': {},  # No hyperparameters to tune
    'K-Nearest Neighbors': {
        'kneighborsclassifier__n_neighbors': [3, 5, 7],
        'kneighborsclassifier__weights': ['uniform', 'distance']
    },
    'AdaBoost': {
        'adaboostclassifier__n_estimators': [50, 100, 200],
        'adaboostclassifier__learning_rate': [0.1, 0.5, 1]
    },
    'Gradient Boosting': {
        'gradientboostingclassifier__n_estimators': [50, 100, 200],
        'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
        'gradientboostingclassifier__max_depth': [3, 5, 10]
    }
}

# Create pipelines for each classifier
pipelines = {
    'SVM': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), SVC(random_state=42)),
    'MLPClassifier': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), MLPClassifier(random_state=42)),
    'Decision Tree': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), DecisionTreeClassifier(random_state=42)),
    'Logistic Regression': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), LogisticRegression(random_state=42)),
    'GaussianNB': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), GaussianNB()),
    'K-Nearest Neighbors': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), KNeighborsClassifier()),
    'AdaBoost': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), AdaBoostClassifier(random_state=42)),
    'Gradient Boosting': make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), GradientBoostingClassifier(random_state=42))
}

# Loop through classifiers
for name, pipeline in pipelines.items():
    param_grid = param_grids.get(name, {})
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Evaluate the classifier with the best parameters
    best_classifier = grid_search.best_estimator_
    best_predictions = best_classifier.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_predictions)
    class_names = le.classes_
    y_test_names = le.inverse_transform(y_test)
    best_predictions_names = le.inverse_transform(best_predictions)

    best_report = classification_report(y_test_names, best_predictions_names, target_names=class_names)

    print(f"{name} Classifier:")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {best_accuracy:.2f}")
    print(f"Classification Report:\n{best_report}\n{'='*40}")



SVM Classifier:
Best Parameters: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy: 0.78
Classification Report:
                    precision    recall  f1-score   support

Private University       0.79      0.71      0.75        69
 Public University       0.77      0.84      0.80        79

          accuracy                           0.78       148
         macro avg       0.78      0.77      0.77       148
      weighted avg       0.78      0.78      0.78       148





MLPClassifier Classifier:
Best Parameters: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (100, 50), 'mlpclassifier__max_iter': 500}
Accuracy: 0.76
Classification Report:
                    precision    recall  f1-score   support

Private University       0.73      0.75      0.74        69
 Public University       0.78      0.76      0.77        79

          accuracy                           0.76       148
         macro avg       0.76      0.76      0.76       148
      weighted avg       0.76      0.76      0.76       148

Decision Tree Classifier:
Best Parameters: {'decisiontreeclassifier__max_depth': 5, 'decisiontreeclassifier__min_samples_leaf': 4, 'decisiontreeclassifier__min_samples_split': 2}
Accuracy: 0.78
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.74      0.76        69
 Public University       0.78      0.82      0.80        79

          accuracy                          

In [70]:

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

# ... (previous code remains unchanged)

# Hyperparameter tuning for XGBClassifier
param_grid_xgb = {
    'xgbclassifier__n_estimators': [50, 100, 200],
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.5],
    'xgbclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for LGBMClassifier
param_grid_lgbm = {
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__learning_rate': [0.01, 0.1, 0.5],
    'lgbmclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for CatBoostClassifier
param_grid_catboost = {
    'catboostclassifier__n_estimators': [50, 100, 200],
    'catboostclassifier__learning_rate': [0.01, 0.1, 0.5],
    'catboostclassifier__max_depth': [3, 5, 10]
}

# Hyperparameter tuning for ExtraTreesClassifier
param_grid_extra_trees = {
    'extratreesclassifier__n_estimators': [50, 100, 200],
    'extratreesclassifier__max_depth': [None, 5, 10, 20],
    'extratreesclassifier__min_samples_split': [2, 5, 10],
    'extratreesclassifier__min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning for BaggingClassifier
param_grid_bagging = {
    'baggingclassifier__n_estimators': [50, 100, 200],
    'baggingclassifier__max_samples': [0.5, 0.7, 1.0],
    'baggingclassifier__max_features': [0.5, 0.7, 1.0]
}

# Create pipelines for each classifier

pipeline_xgb = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBClassifier(random_state=42)
)

pipeline_lgbm = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LGBMClassifier(random_state=42)
)

pipeline_catboost = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    CatBoostClassifier(random_state=42)
)

pipeline_extra_trees = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    ExtraTreesClassifier(random_state=42)
)

pipeline_bagging = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    BaggingClassifier(random_state=42)
)

# Create dictionaries for additional classifiers and their respective hyperparameter grids
additional_classifiers = {
    'XGBClassifier': (pipeline_xgb, param_grid_xgb),
    'LGBMClassifier': (pipeline_lgbm, param_grid_lgbm),
    'CatBoostClassifier': (pipeline_catboost, param_grid_catboost),
    'ExtraTreesClassifier': (pipeline_extra_trees, param_grid_extra_trees),
    'BaggingClassifier': (pipeline_bagging, param_grid_bagging)
}

# Combine the classifiers
all_classifiers = {**classifiers, **additional_classifiers}

# Loop through all classifiers
for name, (pipeline, param_grid) in all_classifiers.items():
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print(f"Best Parameters for {name}: {best_params}")

    # Evaluate the classifier with the best parameters
    best_classifier = grid_search.best_estimator_
    best_predictions = best_classifier.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_predictions)
    # Convert numeric labels back to class names
    class_names = le.classes_
    y_test_names = le.inverse_transform(y_test)
    best_predictions_names = le.inverse_transform(best_predictions)

    best_report = classification_report(y_test_names, best_predictions_names, target_names=class_names)

    print(f"{name} Classifier:")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {best_accuracy:.2f}")
    print(f"Classification Report:\n{best_report}\n{'='*40}")


Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
SVM Classifier:
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
Accuracy: 0.78
Classification Report:
                    precision    recall  f1-score   support

Private University       0.79      0.71      0.75        69
 Public University       0.77      0.84      0.80        79

          accuracy                           0.78       148
         macro avg       0.78      0.77      0.77       148
      weighted avg       0.78      0.78      0.78       148





Best Parameters for MLPClassifier: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 50, 20), 'max_iter': 200}
MLPClassifier Classifier:
Best Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 50, 20), 'max_iter': 200}
Accuracy: 0.76
Classification Report:
                    precision    recall  f1-score   support

Private University       0.75      0.74      0.74        69
 Public University       0.78      0.78      0.78        79

          accuracy                           0.76       148
         macro avg       0.76      0.76      0.76       148
      weighted avg       0.76      0.76      0.76       148

Best Parameters for Decision Tree: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Decision Tree Classifier:
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.78
Classification Report:
                    precision    recall  f1-score   support

Private University       0.78      0.74      0.76        69
 Public Univ

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters for Logistic Regression: {'C': 10, 'max_iter': 50}
Logistic Regression Classifier:
Best Parameters: {'C': 10, 'max_iter': 50}
Accuracy: 0.79
Classification Report:
                    precision    recall  f1-score   support

Private University       0.80      0.74      0.77        69
 Public University       0.79      0.84      0.81        79

          accuracy                           0.79       148
         macro avg       0.79      0.79      0.79       148
      weighted avg       0.79      0.79      0.79       148

Best Parameters for Naive Bayes: {}
Naive Bayes Classifier:
Best Parameters: {}
Accuracy: 0.72
Classification Report:
                    precision    recall  f1-score   support

Private University       0.76      0.59      0.67        69
 Public University       0.70      0.84      0.76        79

          accuracy                           0.72       148
         macro avg       0.73      0.71      0.71       148
      weighted avg       0.73      0.

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Assuming the last column is the target variable
target_column = 'Name of your current institution? '

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Oversample both Public University and Private University to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a more complex ANN model with Dropout and BatchNormalization
ann_model = Sequential()
ann_model.add(Dense(units=128, activation='relu', input_dim=X_train_scaled.shape[1]))
ann_model.add(Dropout(0.5))  # Increase dropout rate
ann_model.add(Dense(units=64, activation='relu'))
ann_model.add(BatchNormalization())
ann_model.add(Dense(units=32, activation='relu'))
ann_model.add(Dropout(0.3))  # Add another dropout layer
ann_model.add(Dense(units=1, activation='sigmoid'))

# Compile the model with a lower learning rate
ann_model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN model and get history
history = ann_model.fit(X_train_scaled, y_train, epochs=70, batch_size=16, validation_data=(X_test_scaled, y_test), verbose=1)

# Predictions using the ANN
y_pred_ann = (ann_model.predict(X_test_scaled) > 0.5).astype("int32").flatten()

# Evaluate the ANN
accuracy = accuracy_score(y_test, y_pred_ann)
print('For ANN, the accuracy is= ', accuracy)



Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
For ANN, the accuracy is=  0.777027027027027


In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Oversample both Public University and Private University to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Assuming the last column is the target variable
target_column = balanced_df.columns[-1]

# Remove leading and trailing spaces from the target column
balanced_df[target_column] = balanced_df[target_column].str.strip()

# Separate features (X) and target variable (y)
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a more complex ANN model with Dropout and BatchNormalization
ann_model = Sequential()
ann_model.add(Dense(units=512, activation='relu', input_dim=X_train_scaled.shape[1]))
ann_model.add(BatchNormalization())
ann_model.add(Dropout(0.5))
ann_model.add(Dense(units=256, activation='relu'))
ann_model.add(BatchNormalization())
ann_model.add(Dropout(0.4))
ann_model.add(Dense(units=128, activation='relu'))
ann_model.add(Dropout(0.3))
ann_model.add(Dense(units=64, activation='relu'))
ann_model.add(Dropout(0.2))
ann_model.add(Dense(units=1, activation='sigmoid'))

# Implement learning rate scheduling
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0001,
    decay_steps=1000,
    decay_rate=0.9
)
ann_model.compile(optimizer=Adam(learning_rate=lr_schedule), loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN model and get history
history = ann_model.fit(X_train_scaled, y_train, epochs=70, batch_size=16, validation_data=(X_test_scaled, y_test))

# Predictions using the ANN
y_pred_ann = (ann_model.predict(X_test_scaled) > 0.5).astype("int32").flatten()

# Evaluate the ANN
print('For ANN:')
print('Accuracy:', accuracy_score(y_test, y_pred_ann))
print('Classification Report:')
print(classification_report(y_test, y_pred_ann))



Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
For ANN:
Accuracy: 0.9060773480662984
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       151

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Define the file path
file_path = "/kaggle/input/undergrad/Undergraduate.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)


# Convert the target column to numeric, handling errors by setting them to NaN
balanced_df['What was your SSC GPA?'] = pd.to_numeric(balanced_df['What was your SSC GPA?'], errors='coerce')

# Drop rows with NaN values in the target column
balanced_df = balanced_df.dropna(subset=['What was your SSC GPA?'])

# Separate features (X) and target variable (y)
target_column = 'What was your SSC GPA?'
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features (optional but recommended for linear regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Initialize and train the decision tree regressor model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train_scaled, y_train)

# Initialize and train the random forest regressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)

# Initialize and train the gradient boosting regressor model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train_scaled, y_train)

# Initialize and train the support vector regressor model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)

# Initialize and train the k-nearest neighbors regressor model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train_scaled, y_train)

# Initialize and train the lasso regression model
lasso_model = Lasso()
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set for each model
linear_pred = linear_model.predict(X_test_scaled)
dt_pred = dt_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)
gb_pred = gb_model.predict(X_test_scaled)
svr_pred = svr_model.predict(X_test_scaled)
knn_pred = knn_model.predict(X_test_scaled)
lasso_pred = lasso_model.predict(X_test_scaled)

# Evaluate each model
linear_mse = mean_squared_error(y_test, linear_pred)
linear_r2 = r2_score(y_test, linear_pred)

dt_mse = mean_squared_error(y_test, dt_pred)
dt_r2 = r2_score(y_test, dt_pred)

rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)

svr_mse = mean_squared_error(y_test, svr_pred)
svr_r2 = r2_score(y_test, svr_pred)

knn_mse = mean_squared_error(y_test, knn_pred)
knn_r2 = r2_score(y_test, knn_pred)

lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

# Print the results
print(f'Linear Regression - Mean Squared Error: {linear_mse}, R-squared: {linear_r2}')
print(f'Decision Tree Regressor - Mean Squared Error: {dt_mse}, R-squared: {dt_r2}')
print(f'Random Forest Regressor - Mean Squared Error: {rf_mse}, R-squared: {rf_r2}')
print(f'Gradient Boosting Regressor - Mean Squared Error: {gb_mse}, R-squared: {gb_r2}')
print(f'Support Vector Regressor - Mean Squared Error: {svr_mse}, R-squared: {svr_r2}')
print(f'K-Nearest Neighbors Regressor - Mean Squared Error: {knn_mse}, R-squared: {knn_r2}')
print(f'Lasso Regression - Mean Squared Error: {lasso_mse}, R-squared: {lasso_r2}')



Linear Regression - Mean Squared Error: 0.029818181362078706, R-squared: 0.44099083243354176
Decision Tree Regressor - Mean Squared Error: 0.020192610497237574, R-squared: 0.6214439020278467
Random Forest Regressor - Mean Squared Error: 0.010336774810338824, R-squared: 0.8062138058695268
Gradient Boosting Regressor - Mean Squared Error: 0.016326693365423757, R-squared: 0.6939192515971035
Support Vector Regressor - Mean Squared Error: 0.017477589250600334, R-squared: 0.672343108407284
K-Nearest Neighbors Regressor - Mean Squared Error: 0.021597116022099447, R-squared: 0.5951132732493252
Lasso Regression - Mean Squared Error: 0.05334118635725406, R-squared: -1.0336142513001079e-06


In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Define the file path
file_path = "/kaggle/input/undergrad/Undergraduate.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Oversample both Public University and Private University to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Convert the target column to numeric, handling errors by setting them to NaN
balanced_df['What was your SSC GPA?'] = pd.to_numeric(balanced_df['What was your SSC GPA?'], errors='coerce')

# Drop rows with NaN values in the target column
balanced_df = balanced_df.dropna(subset=['What was your SSC GPA?'])

# Separate features (X) and target variable (y)
target_column = 'What was your SSC GPA?'
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features (optional but recommended for linear regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Initialize and train the decision tree regressor model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train_scaled, y_train)

# Initialize and train the random forest regressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)

# Initialize and train the gradient boosting regressor model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train_scaled, y_train)

# Initialize and train the support vector regressor model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)

# Initialize and train the k-nearest neighbors regressor model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train_scaled, y_train)

# Initialize and train the lasso regression model
lasso_model = Lasso()
lasso_model.fit(X_train_scaled, y_train)

# Make predictions on the test set for each model
linear_pred = linear_model.predict(X_test_scaled)
dt_pred = dt_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)
gb_pred = gb_model.predict(X_test_scaled)
svr_pred = svr_model.predict(X_test_scaled)
knn_pred = knn_model.predict(X_test_scaled)
lasso_pred = lasso_model.predict(X_test_scaled)

# Evaluate each model
linear_mse = mean_squared_error(y_test, linear_pred)
linear_r2 = r2_score(y_test, linear_pred)

dt_mse = mean_squared_error(y_test, dt_pred)
dt_r2 = r2_score(y_test, dt_pred)

rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)

svr_mse = mean_squared_error(y_test, svr_pred)
svr_r2 = r2_score(y_test, svr_pred)

knn_mse = mean_squared_error(y_test, knn_pred)
knn_r2 = r2_score(y_test, knn_pred)

lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

# Print the results
print(f'Linear Regression - Mean Squared Error: {linear_mse}, R-squared: {linear_r2}')
print(f'Decision Tree Regressor - Mean Squared Error: {dt_mse}, R-squared: {dt_r2}')
print(f'Random Forest Regressor - Mean Squared Error: {rf_mse}, R-squared: {rf_r2}')
print(f'Gradient Boosting Regressor - Mean Squared Error: {gb_mse}, R-squared: {gb_r2}')
print(f'Support Vector Regressor - Mean Squared Error: {svr_mse}, R-squared: {svr_r2}')
print(f'K-Nearest Neighbors Regressor - Mean Squared Error: {knn_mse}, R-squared: {knn_r2}')
print(f'Lasso Regression - Mean Squared Error: {lasso_mse}, R-squared: {lasso_r2}')


Linear Regression - Mean Squared Error: 0.029818181362078706, R-squared: 0.44099083243354176
Decision Tree Regressor - Mean Squared Error: 0.020312776243093928, R-squared: 0.6191911236727392
Random Forest Regressor - Mean Squared Error: 0.010078814467764484, R-squared: 0.8110498552119267
Gradient Boosting Regressor - Mean Squared Error: 0.01636828662950495, R-squared: 0.6931394919045835
Support Vector Regressor - Mean Squared Error: 0.017477589250600334, R-squared: 0.672343108407284
K-Nearest Neighbors Regressor - Mean Squared Error: 0.021597116022099447, R-squared: 0.5951132732493252
Lasso Regression - Mean Squared Error: 0.05334118635725406, R-squared: -1.0336142513001079e-06


In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("/kaggle/input/undergrad/Undergraduate.csv")

# Assuming the last column is the target variable
target_column = df.columns[-1]

# Remove leading and trailing spaces from the target column
df[target_column] = df[target_column].str.strip()

# Oversample both Public University and Private University to balance the classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=[target_column]), df[target_column])
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Separate features (X) and target variable (y) from the balanced dataset
X = balanced_df.drop(columns=[target_column])
y = balanced_df[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)  # Assuming y contains class labels 0 and 1

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Data preprocessing
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# ELM implementation
class ELMClassifierManual:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.input_weights = np.random.rand(input_size, hidden_size)
        self.bias = np.random.rand(hidden_size)
        self.output_weights = None

    def train(self, X, y):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        self.output_weights = np.linalg.pinv(hidden_layer_output).dot(y)

    def predict(self, X):
        hidden_layer_output = self.sigmoid(X.dot(self.input_weights) + self.bias)
        predictions = hidden_layer_output.dot(self.output_weights)
        return np.round(predictions).astype(int)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

# Initialize ELMClassifier
elm_classifier = ELMClassifierManual(input_size=X_train_scaled.shape[1], hidden_size=100)

# Train ELMClassifier
elm_classifier.train(X_train_scaled, y_train)

# Predictions
elm_predictions = elm_classifier.predict(X_test_scaled)

# Exclude instances where the true class is -1 during evaluation
valid_test_indices = (y_test != -1)
y_test_valid = y_test[valid_test_indices]
elm_predictions_valid = elm_predictions[valid_test_indices]

# Evaluate ELMClassifier
elm_accuracy = accuracy_score(y_test_valid, elm_predictions_valid)
elm_report = classification_report(y_test_valid, elm_predictions_valid)

print("ELM Classifier (Manual Implementation):")
print(f"Accuracy: {elm_accuracy:.2f}")
print(f"Classification Report:\n{elm_report}\n{'='*40}")




ELM Classifier (Manual Implementation):
Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72        69
           1       0.75      0.82      0.78        79

    accuracy                           0.76       148
   macro avg       0.76      0.75      0.75       148
weighted avg       0.76      0.76      0.76       148

