In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score

# Load the dataset
df = pd.read_csv("Data4.csv")

# Check for missing values
if df.isnull().sum().any():
    print("Warning: Missing values detected. Consider imputation or removal.")

# Split the dataset into features (X) and target (y)
X = df.drop(columns=["StudentID", "Scholarship"])
y = df["Scholarship"]

# Encode categorical features
X_encoded = pd.get_dummies(X)

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Apply oversampling
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_scaled, y_encoded)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=cv, scoring='accuracy',verbose=3)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_

# Evaluate the model
y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)

print("Training classification report:")
print(classification_report(y_train, y_train_pred))

print("Testing classification report:")
print(classification_report(y_test, y_test_pred))

# ROC AUC score
y_test_prob = best_rf_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)
print("ROC AUC score:", roc_auc)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=1.000 total time=   9.9s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=1.000 total time=  10.2s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=1.000 total time=   8.9s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=1.000 total time=   8.2s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=1.000 total time=   8.5s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=1.000 total time=  16.3s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=1.000 total time=  16.5s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=1.000 t

KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Load the dataset
df = pd.read_csv("./Data4.csv")

# Split the dataset into features (X) and target (y)
X = df.drop(columns=["StudentID", "Scholarship"])
y = df["Scholarship"]

# Encode categorical features
X_encoded = pd.get_dummies(X)

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode target labels
y_encoded = label_encoder.fit_transform(y)

# Apply oversampling
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_scaled, y_encoded)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Convert target to one-hot encoding
y_train_onehot = tf.keras.utils.to_categorical(y_train)
y_test_onehot = tf.keras.utils.to_categorical(y_test)

# Build the TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(13, activation='softmax')  # Assuming 2 classes for Scholarship
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train_onehot,
                    validation_split=0.2,
                    epochs=50,
                    batch_size=32,
                    verbose=1)

# Evaluate the model
train_loss, train_acc = model.evaluate(X_train, y_train_onehot, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test_onehot, verbose=0)

print(f"Training accuracy: {train_acc}")
print(f"Testing accuracy: {test_acc}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training accuracy: 0.99374920129776
Testing accuracy: 0.9934834837913513


In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [18]:
# Save the entire model to a HDF5 file
model.save('my_model.h5')


  saving_api.save_model(


In [10]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Function to preprocess input data
def preprocess_input(input_dict):

    myList = []
    x = input_dict
    required_list = ['GPA', '10th Percentage', '12th Percentage', 'Family Income', 'Extracurricular Activities_high', 'Extracurricular Activities_low', 'Extracurricular Activities_medium', 'Essay Quality_excellent', 'Essay Quality_fair', 'Essay Quality_good', 'Essay Quality_poor', 'Letters of Recommendation_moderate', 'Letters of Recommendation_strong', 'Letters of Recommendation_weak', 'Financial Need_high', 'Financial Need_low', 'Financial Need_medium', 'Major_Arts', 'Major_Business', 'Major_Engineering', 'Major_Medicine', 'Major_Science', 'State of Residence_Delhi', 'State of Residence_Karnataka', 'State of Residence_Kerala', 'State of Residence_Maharashtra', 'State of Residence_Tamil Nadu', 'State of Residence_Uttar Pradesh', 'Leadership Experience_no', 'Leadership Experience_yes', 'Volunteer Work_no', 'Volunteer Work_yes', 'Work Experience_no', 'Work Experience_yes', 'Family Background_high', 'Family Background_low', 'Family Background_medium']
    for cols in required_list:
      if cols=='GPA':
        myList.append(x['GPA'])
      elif cols=='10th Percentage':
        myList.append(x['10th Percentage'])
      elif cols=='12th Percentage':
        myList.append(x['12th Percentage'])
      elif cols=='Family Income':
        myList.append(x['Family Income'])
      elif cols.split("_")[0] in x:
        #print(cols)
        #print(cols.split("_")[1])
        if cols.split("_")[1]==x[cols.split("_")[0]]:
          myList.append(True)
        else:
          myList.append(False)



    input_df_scaled = scaler.transform([myList])

    return input_df_scaled

# Trained scaler from the training data


# Preprocess the input data
input_data = {
    'GPA': '10',
    '10th Percentage': '90',
    '12th Percentage': '96',
    'Family Income': '100000',
    'Extracurricular Activities': 'high',
    'Essay Quality': 'poor',
    'Letters of Recommendation': 'weak',
    'Financial Need': 'low',
    'Leadership Experience': 'no',
    'Volunteer Work': 'no',
    'Work Experience': 'yes',
    'Family Background': 'medium',
    'Major':'Engineering',
    'State of Residence': 'Karnataka'

}

input_data_scaled = preprocess_input(input_data)

# Load the trained TensorFlow model
model = tf.keras.models.load_model('my_model.h5')  # Replace 'your_model_path' with the path to your trained model

# Make predictions
predictions = model.predict(input_data_scaled)

# Convert predictions to class labels
predicted_class = np.argmax(predictions, axis=1)[0]  # Assuming binary classification

# Inverse transform using LabelEncoder
predicted_class_label = label_encoder.inverse_transform([predicted_class])[0]

print(f"Predicted Scholarship: {predicted_class_label}")

# Get the test set true labels for confusion matrix
# Assuming X_test and y_test are your test set and true labels
# y_test_labels = label_encoder.inverse_transform(y_test)

# For demonstration, let's assume y_test_labels contains the true labels for the test set

# Create confusion matrix
# conf_matrix = confusion_matrix(y_test_labels, model.predict_classes(X_test))  # Replace with your actual test set and true labels
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Not Scholarship', 'Scholarship'], yticklabels=['Not Scholarship', 'Scholarship'])
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.title('Confusion Matrix')
# plt.show()




Predicted Scholarship: Post-Graduate Indira Gandhi Scholarship - 36,200 per annum


In [27]:
import pickle
with open('scaler_tensorflow.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [32]:
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [41]:

myList = []
input_data = {
    'GPA': '9.77',
    '10th Percentage': '61',
    '12th Percentage': '86',
    'Family Income': '85643',
    'Extracurricular Activities': 'low',
    'Essay Quality': 'poor',
    'Letters of Recommendation': 'moderate',
    'Financial Need': 'medium',
    'Leadership Experience': 'yes',
    'Volunteer Work': 'yes',
    'Work Experience': 'yes',
    'Family Background': 'low',
    'Major':'Engineering',
    'State of Residence': 'Kerela'

}
x = input_data
required_list = ['GPA', '10th Percentage', '12th Percentage', 'Family Income', 'Extracurricular Activities_high', 'Extracurricular Activities_low', 'Extracurricular Activities_medium', 'Essay Quality_excellent', 'Essay Quality_fair', 'Essay Quality_good', 'Essay Quality_poor', 'Letters of Recommendation_moderate', 'Letters of Recommendation_strong', 'Letters of Recommendation_weak', 'Financial Need_high', 'Financial Need_low', 'Financial Need_medium', 'Major_Arts', 'Major_Business', 'Major_Engineering', 'Major_Medicine', 'Major_Science', 'State of Residence_Delhi', 'State of Residence_Karnataka', 'State of Residence_Kerala', 'State of Residence_Maharashtra', 'State of Residence_Tamil Nadu', 'State of Residence_Uttar Pradesh', 'Leadership Experience_no', 'Leadership Experience_yes', 'Volunteer Work_no', 'Volunteer Work_yes', 'Work Experience_no', 'Work Experience_yes', 'Family Background_high', 'Family Background_low', 'Family Background_medium']
for cols in required_list:
  if cols=='GPA':
    myList.append(x['GPA'])
  elif cols=='10th Percentage':
    myList.append(x['10th Percentage'])
  elif cols=='12th Percentage':
    myList.append(x['12th Percentage'])
  elif cols=='Family Income':
    myList.append(x['Family Income'])
  elif cols.split("_")[0] in x:
    #print(cols)
    #print(cols.split("_")[1])
    if cols.split("_")[1]==x[cols.split("_")[0]]:
      myList.append(True)
    else:
      myList.append(False)
with open('scaler_tensorflow.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('label_encoder_tensorflow.pkl', 'rb') as f:
    label_encode = pickle.load(f)

input_df_scaled = scaler.transform([myList])



In [42]:
predictions = model.predict(input_data_scaled)
predicted_class_label = label_encode.inverse_transform([predicted_class])[0]
print(predicted_class_label)

Post-Graduate Indira Gandhi Scholarship - 36,200 per annum


In [35]:
!python --version

Python 3.10.12


In [40]:
import sklearn
print(sklearn.__version__)

1.2.2


In [43]:
import tensorflow
print(tensorflow.__version__)

2.15.0
