<a href="https://colab.research.google.com/github/KruthiSriSai/ScholarshopPredictor/blob/main/ScholarshipPro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker # Installs the faker package

import pandas as pd
import numpy as np
from faker import Faker # Now faker can be imported
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib



In [None]:
# Initialize the Faker instance
fake = Faker()

# Set the random seed for reproducibility
np.random.seed(42)

# List of sports, levels, achievements, academic awards, and leadership roles
sports_list = ['Basketball', 'Soccer', 'Tennis', 'Swimming', 'Athletics']
sport_levels = ['School', 'District', 'State', 'National']
sport_achievements = ['Participation', 'Winner', 'Runner-up']
academic_awards_list = ['Dean\'s List', 'Science Fair Winner', 'Math Olympiad Medalist', 'Valedictorian', 'Essay Contest Winner']
leadership_roles_list = ['Class President', 'Club Leader', 'Sports Captain', 'Debate Team Captain', 'Volunteer Coordinator']

# Number of synthetic records to generate
num_students = 10000
# Generate synthetic data
def generate_student_data(num_records):
    data = {
        'Student_ID': [fake.unique.random_int(min=10000, max=25000) for _ in range(num_records)],
        'GPA': np.random.uniform(2.0, 4.0, num_records),  # GPA between 2.0 and 4.0
        'SAT_Score': np.random.randint(500, 1600, num_records),  # SAT score between 500 and 1600
        'Academic_Awards': np.random.randint(0, 5, num_records),  # Number of academic awards (0 to 4)
        'Academic_Award_Type': [np.random.choice(academic_awards_list, size=np.random.randint(0, 5), replace=False).tolist() for _ in range(num_records)],  # Type of academic awards
        'Leadership_Roles': np.random.randint(0, 5, num_records),  # Number of leadership roles (0 to 6)
        'Leadership_Role_Type': [np.random.choice(leadership_roles_list, size=np.random.randint(0, 3), replace=False).tolist() for _ in range(num_records)],  # Type of leadership roles
        'Volunteering_Hours': np.random.randint(0, 200, num_records),  # Volunteering hours (0 to 200)
        'Sports_Involvement': np.random.choice([0, 1], num_records),  # 1 = involved in sports, 0 = not involved
        'Sport_Type': [np.random.choice(sports_list) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Type of sport
        'Sport_Level': [np.random.choice(sport_levels) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Level of sport
        'Sport_Achievement': [np.random.choice(sport_achievements) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Achievement in sport
        'Household_Income': np.random.randint(20000, 150000, num_records),  # Income between $20k and $150k
        'Number_of_Dependents': np.random.randint(1, 6, num_records),  # Number of dependents (1 to 5)
        'Received_Financial_Aid': np.random.choice([0, 1], num_records),  # 1 = received aid, 0 = no aid
        'Recommendation_Letters': np.random.randint(1, 6, num_records),  # Quality of recommendation letters (1 to 5)
        'Scholarship_Eligibility': np.where(  # Generate target based on some conditions
            (np.random.uniform(2.0, 4.0, num_records) > 3.5) &
            (np.random.randint(900, 1600, num_records) > 1200) &
            ((np.random.randint(20000, 150000, num_records) < 60000) | (np.random.choice([0, 1], num_records) == 1)) &
            (np.random.randint(1, 6, num_records) > 3),
            1, 0)
    }

    # Convert the dictionary to a Pandas DataFrame
    df = pd.DataFrame(data)

    # Ensure consistency: If a student is not involved in sports, set related fields to 'None'
    df.loc[df['Sports_Involvement'] == 0, ['Sport_Type', 'Sport_Level', 'Sport_Achievement']] = 'None'

    return df

# Generate the synthetic dataset
synthetic_data = generate_student_data(num_students)

# Display the first few rows of the dataset
print(synthetic_data.head())

# Save the dataset to a CSV file (optional)
synthetic_data.to_csv('synthetic_student_scholarship_data_with_awards_and_roles.csv', index=False)


   Student_ID       GPA  SAT_Score  Academic_Awards  \
0       10675  2.749080       1363                4   
1       15102  3.901429       1047                1   
2       20728  3.463988       1560                4   
3       10687  3.197317       1507                4   
4       14759  2.312037       1537                3   

                                 Academic_Award_Type  Leadership_Roles  \
0  [Science Fair Winner, Dean's List, Math Olympi...                 3   
1                       [Dean's List, Valedictorian]                 4   
2                                                 []                 2   
3                                                 []                 4   
4                                                 []                 3   

             Leadership_Role_Type  Volunteering_Hours  Sports_Involvement  \
0                              []                 105                   1   
1         [Volunteer Coordinator]                 111                 

In [None]:
length = len(synthetic_data)
print(length)

10000


In [None]:
# Define feature columns
features = ['GPA', 'SAT_Score', 'Academic_Awards', 'Leadership_Roles', 'Volunteering_Hours',
            'Household_Income', 'Number_of_Dependents', 'Received_Financial_Aid', 'Recommendation_Letters']

# Separate features and target
X = synthetic_data[features]
y = synthetic_data['Scholarship_Eligibility']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (GPA, SAT_Score, etc.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Initialize and train the RandomForestClassifier
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X_train_scaled, y_train)

# Save the trained model and scaler using joblib for later use
joblib.dump(model, 'scholarship_prediction_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 96.40%
Confusion Matrix:
 [[1928    0]
 [  72    0]]


In [None]:
# Load the trained model and scaler
model = joblib.load('scholarship_prediction_model.pkl')
scaler = joblib.load('scaler.pkl')

# Example new student data
new_student = {
    'GPA': 2.1,
    'SAT_Score': 1350,
    'Academic_Awards': 0,
    'Leadership_Roles': 2,
    'Volunteering_Hours': 10,
    'Household_Income': 50000,
    'Number_of_Dependents': 3,
    'Received_Financial_Aid': 0,
    'Recommendation_Letters': 0
}

# Convert new student data into a DataFrame for prediction
new_student_df = pd.DataFrame([new_student])

# Scale the new student's data
new_student_scaled = scaler.transform(new_student_df)

# Make the prediction
prediction = model.predict(new_student_scaled)

# Print prediction result
if prediction[0] == 1:
    print("The student is eligible for a scholarship.")
else:
    print("The student is not eligible for a scholarship.")


The student is not eligible for a scholarship.


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Path where you want to save the file in Google Drive
file_path = '/content/drive/My Drive/synthetic_student_scholarship_data_with_awards_and_roles.csv'

# Save the dataset to the specified path in Google Drive
synthetic_data.to_csv(file_path, index=False)

print(f'Synthetic dataset saved to {file_path}')


Synthetic dataset saved to /content/drive/My Drive/synthetic_student_scholarship_data_with_awards_and_roles.csv


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize the Faker instance
fake = Faker()

# Set the random seed for reproducibility
np.random.seed(42)

# List of sports, levels, achievements, academic awards, and leadership roles
sports_list = ['Basketball', 'Soccer', 'Tennis', 'Swimming', 'Athletics']
sport_levels = ['School', 'District', 'State', 'National']
sport_achievements = ['Participation', 'Winner', 'Runner-up']
academic_awards_list = ['Dean\'s List', 'Science Fair Winner', 'Math Olympiad Medalist', 'Valedictorian', 'Essay Contest Winner']
leadership_roles_list = ['Class President', 'Club Leader', 'Sports Captain', 'Debate Team Captain', 'Volunteer Coordinator']

# Number of synthetic records to generate
num_students = 10000

# Generate synthetic student data
def generate_student_data(num_records):
    data = {
        'Student_ID': [fake.unique.random_int(min=10000, max=25000) for _ in range(num_records)],
        'GPA': np.random.uniform(2.0, 4.0, num_records),  # GPA between 2.0 and 4.0
        'SAT_Score': np.random.randint(500, 1600, num_records),  # SAT score between 500 and 1600
        'Academic_Awards': np.random.randint(0, 5, num_records),  # Number of academic awards (0 to 4)
        'Academic_Award_Type': [np.random.choice(academic_awards_list, size=np.random.randint(0, 5), replace=False).tolist() for _ in range(num_records)],
        'Leadership_Roles': np.random.randint(0, 5, num_records),  # Number of leadership roles (0 to 4)
        'Leadership_Role_Type': [np.random.choice(leadership_roles_list, size=np.random.randint(0, 3), replace=False).tolist() for _ in range(num_records)],
        'Volunteering_Hours': np.random.randint(0, 200, num_records),  # Volunteering hours (0 to 200)
        'Sports_Involvement': np.random.choice([0, 1], num_records),  # 1 = involved in sports, 0 = not involved
        'Sport_Type': [np.random.choice(sports_list) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],
        'Sport_Level': [np.random.choice(sport_levels) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],
        'Sport_Achievement': [np.random.choice(sport_achievements) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],
        'Household_Income': np.random.randint(20000, 150000, num_records),  # Income between $20k and $150k
        'Number_of_Dependents': np.random.randint(1, 6, num_records),  # Number of dependents (1 to 5)
        'Received_Financial_Aid': np.random.choice([0, 1], num_records),  # 1 = received aid, 0 = no aid
        'Recommendation_Letters': np.random.randint(1, 6, num_records),  # Quality of recommendation letters (1 to 5)
        'Scholarship_Eligibility': np.where(  # Generate target based on some conditions
            (np.random.uniform(2.0, 4.0, num_records) > 3.5) &
            (np.random.randint(900, 1600, num_records) > 1200) &
            ((np.random.randint(20000, 150000, num_records) < 60000) | (np.random.choice([0, 1], num_records) == 1)) &
            (np.random.randint(1, 6, num_records) > 3),
            1, 0)
    }

    # Convert the dictionary to a Pandas DataFrame
    df = pd.DataFrame(data)

    # Ensure consistency: If a student is not involved in sports, set related fields to 'None'
    df.loc[df['Sports_Involvement'] == 0, ['Sport_Type', 'Sport_Level', 'Sport_Achievement']] = 'None'

    return df

# Generate the synthetic dataset
synthetic_data = generate_student_data(num_students)

# Save the dataset to a CSV file
synthetic_data.to_csv('synthetic_student_scholarship_data.csv', index=False)

# Display the first few rows of the dataset
print(synthetic_data.head())


   Student_ID       GPA  SAT_Score  Academic_Awards  \
0       18364  2.749080       1363                4   
1       17426  3.901429       1047                1   
2       21047  3.463988       1560                4   
3       14438  3.197317       1507                4   
4       21820  2.312037       1537                3   

                                 Academic_Award_Type  Leadership_Roles  \
0  [Science Fair Winner, Dean's List, Math Olympi...                 3   
1                       [Dean's List, Valedictorian]                 4   
2                                                 []                 2   
3                                                 []                 4   
4                                                 []                 3   

             Leadership_Role_Type  Volunteering_Hours  Sports_Involvement  \
0                              []                 105                   1   
1         [Volunteer Coordinator]                 111                 

In [None]:
# Load the dataset with college names and their indices
college_data = pd.read_csv('/content/drive/MyDrive/College_details.csv')  # Replace with your dataset filename
college_index_map = dict(zip(college_data['COLLEGE NAME'], college_data['INDEX']))


In [None]:
# Define feature columns
features = ['GPA', 'SAT_Score', 'Academic_Awards', 'Leadership_Roles', 'Volunteering_Hours',
            'Household_Income', 'Number_of_Dependents', 'Received_Financial_Aid', 'Recommendation_Letters']

# Separate features and target
X = synthetic_data[features]
y = synthetic_data['Scholarship_Eligibility']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (GPA, SAT_Score, etc.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import pandas as pd
import numpy as np
from faker import Faker

# Load the dataset with college names and their indices
college_data = pd.read_csv('/content/drive/MyDrive/College_details.csv')  # Replace with your dataset filename
college_index_map = dict(zip(college_data['COLLEGE NAME'], college_data['INDEX']))

# Initialize the Faker instance
fake = Faker()

# Set the random seed for reproducibility
np.random.seed(42)

# List of sports, levels, achievements, academic awards, and leadership roles
sports_list = ['Basketball', 'Soccer', 'Tennis', 'Swimming', 'Athletics']
sport_levels = ['School', 'District', 'State', 'National']
sport_achievements = ['Participation', 'Winner', 'Runner-up']
academic_awards_list = ['Dean\'s List', 'Science Fair Winner', 'Math Olympiad Medalist', 'Valedictorian', 'Essay Contest Winner']
leadership_roles_list = ['Class President', 'Club Leader', 'Sports Captain', 'Debate Team Captain', 'Volunteer Coordinator']

# Number of synthetic records to generate
num_students = 10000

# Generate synthetic student data
def generate_student_data(num_records):
    data = {
        'Student_ID': [fake.unique.random_int(min=10000, max=25000) for _ in range(num_records)],
        'GPA': np.random.uniform(2.0, 4.0, num_records),  # GPA between 2.0 and 4.0
        'SAT_Score': np.random.randint(500, 1600, num_records),  # SAT score between 500 and 1600
        'Academic_Awards': np.random.randint(0, 5, num_records),  # Number of academic awards (0 to 4)
        'Academic_Award_Type': [np.random.choice(academic_awards_list, size=np.random.randint(0, 5), replace=False).tolist() for _ in range(num_records)],  # Type of academic awards
        'Leadership_Roles': np.random.randint(0, 5, num_records),  # Number of leadership roles (0 to 6)
        'Leadership_Role_Type': [np.random.choice(leadership_roles_list, size=np.random.randint(0, 3), replace=False).tolist() for _ in range(num_records)],  # Type of leadership roles
        'Volunteering_Hours': np.random.randint(0, 200, num_records),  # Volunteering hours (0 to 200)
        'Sports_Involvement': np.random.choice([0, 1], num_records),  # 1 = involved in sports, 0 = not involved
        'Sport_Type': [np.random.choice(sports_list) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Type of sport
        'Sport_Level': [np.random.choice(sport_levels) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Level of sport
        'Sport_Achievement': [np.random.choice(sport_achievements) if involvement == 1 else 'None' for involvement in np.random.choice([0, 1], num_records)],  # Achievement in sport
        'Household_Income': np.random.randint(20000, 150000, num_records),  # Income between $20k and $150k
        'Number_of_Dependents': np.random.randint(1, 6, num_records),  # Number of dependents (1 to 5)
        'Received_Financial_Aid': np.random.choice([0, 1], num_records),  # 1 = received aid, 0 = no aid
        'Recommendation_Letters': np.random.randint(1, 6, num_records),  # Quality of recommendation letters (1 to 5)
        'College_Name': np.random.choice(college_data['COLLEGE NAME'], num_records),  # Random college names
        'Scholarship_Eligibility': np.where(  # Generate target based on some conditions
            (np.random.uniform(2.0, 4.0, num_records) > 3.5) &
            (np.random.randint(900, 1600, num_records) > 1200) &
            ((np.random.randint(20000, 150000, num_records) < 60000) | (np.random.choice([0, 1], num_records) == 1)) &
            (np.random.randint(1, 6, num_records) > 3),
            1, 0)
    }

    # Convert the dictionary to a Pandas DataFrame
    df = pd.DataFrame(data)

    # Ensure consistency: If a student is not involved in sports, set related fields to 'None'
    df.loc[df['Sports_Involvement'] == 0, ['Sport_Type', 'Sport_Level', 'Sport_Achievement']] = 'None'

    return df

# Function to get eligibility criteria based on college name
def get_eligibility_criteria(college_name):
    if college_name in college_index_map:
        index = college_index_map[college_name]
        if 1 <= index <= 150:
            return (3.8, 1500)  # GPA, SAT for colleges ranked 1-150
        elif 151 <= index <= 300:
            return (3.5, 1400)  # GPA, SAT for colleges ranked 151-300
        elif 301 <= index <= 450:
            return (3.2, 1300)  # GPA, SAT for colleges ranked 301-450
        elif 451 <= index <= 600:
            return (3.0, 1200)  # GPA, SAT for colleges ranked 451-600
        elif 601 <= index <= 750:
            return (2.9, 1100)  # GPA, SAT for colleges ranked 601-750
        elif 751 <= index <= 900:
            return (2.8, 1000)  # GPA, SAT for colleges ranked 751-900
        elif 901 <= index <= 1050:
            return (2.7, 900)   # GPA, SAT for colleges ranked 901-1050
        elif 1051 <= index <= 1200:
            return (2.6, 800)   # GPA, SAT for colleges ranked 1051-1200
        elif 1201 <= index <= 1350:
            return (2.5, 700)   # GPA, SAT for colleges ranked 1201-1350
        elif 1351 <= index <= 1503:
            return (2.3, 550)   # GPA, SAT for colleges ranked 1351-1503
    return (None, None)  # Invalid college name

# Function to provide personalized recommendations
def provide_recommendations(user_gpa, user_sat_score, required_gpa, required_sat):
    recommendations = []

    if user_gpa < required_gpa:
        gpa_diff = required_gpa - user_gpa
        recommendations.append(f"Increase GPA by at least {gpa_diff:.2f}. Consider tutoring, study groups, or retaking challenging courses.")

    if user_sat_score < required_sat:
        sat_diff = required_sat - user_sat_score
        recommendations.append(f"Improve SAT score by at least {sat_diff}. Consider SAT prep courses, practice tests, and setting a study schedule.")

    if user_gpa >= 2.0 and user_gpa < 3.0:
        recommendations.append("Engage in more academic activities such as joining study clubs or attending academic workshops.")

    if user_gpa >= 3.0 and user_gpa < 3.5:
        recommendations.append("Aim for additional academic awards or honors to strengthen your application.")

    recommendations.append("Participate in extracurricular activities such as volunteering or leadership roles to enhance your profile.")

    return recommendations

# Generate the synthetic dataset
synthetic_data = generate_student_data(num_students)

# Display the first few rows of the dataset
#print(synthetic_data.head())

# Check eligibility for a specific college name (example usage)
user_college_name = input("Enter the college name: ")  # Replace with actual user input
user_gpa = float(input("Enter the GPA: "))  # Replace with actual user input
user_sat_score = int(input("Enter the SAT score: "))  # Replace with actual user input
user_academic_awards = int(input("Enter the number of academic awards (0 to 4): "))
user_leadership_roles = int(input("Enter the number of leadership roles (0 to 2): "))
user_volunteering_hours = int(input("Enter the number of volunteering hours (0 to 200): "))
user_sports_involvement = int(input("Enter if the student is involved in sports (1 for Yes, 0 for No): "))
user_household_income = int(input("Enter the household income (in dollars, e.g., 45000): "))
user_number_of_dependents = int(input("Enter the number of dependents (1 to 5): "))
user_received_financial_aid = int(input("Enter if the student received financial aid (1 for Yes, 0 for No): "))

# Example of how to incorporate these values in the scholarship eligibility logic

required_gpa, required_sat = get_eligibility_criteria(user_college_name)

if required_gpa is not None and required_sat is not None:
    if user_gpa >= required_gpa and user_sat_score >= required_sat:
        print(f"You are eligible for Scholarship in {user_college_name}.")
    else:
        print(f"You are not eligible for Scholarship in {user_college_name}. Required GPA: {required_gpa}, Required SAT: {required_sat}")
        # Provide recommendations
        recommendations = provide_recommendations(user_gpa, user_sat_score, required_gpa, required_sat)
        print("Recommendations to improve eligibility:")
        for rec in recommendations:
            print("- " + rec)
else:
    print("College name not found in the dataset.")

# Save the dataset to a CSV file (optional)
synthetic_data.to_csv('synthetic_student_scholarship_data_with_awards_and_roles.csv', index=False)


Enter the college name: University of Idaho
Enter the GPA: 2.4
Enter the SAT score: 600
Enter the number of academic awards (0 to 4): 0
Enter the number of leadership roles (0 to 2): 0
Enter the number of volunteering hours (0 to 200): 10
Enter if the student is involved in sports (1 for Yes, 0 for No): 0
Enter the household income (in dollars, e.g., 45000): 30000
Enter the number of dependents (1 to 5): 1
Enter if the student received financial aid (1 for Yes, 0 for No): 1
You are not eligible for Scholarship in University of Idaho. Required GPA: 2.6, Required SAT: 800
Recommendations to improve eligibility:
- Increase GPA by at least 0.20. Consider tutoring, study groups, or retaking challenging courses.
- Improve SAT score by at least 200. Consider SAT prep courses, practice tests, and setting a study schedule.
- Engage in more academic activities such as joining study clubs or attending academic workshops.
- Participate in extracurricular activities such as volunteering or leadersh