<a href="https://www.kaggle.com/code/jeevikasharma2003/grad2masters-admission-prediction-model?scriptVersionId=236204651" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, found_files in os.walk('/kaggle/input'):
    for found_file in found_files:
        print(os.path.join(root, found_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the admissions CSV into a DataFrame and preview its first ten rows.
csv_path = "/kaggle/input/graduate-admissions/Admission_Predict_Ver1.1.csv"
data = pd.read_csv(csv_path)
data.head(10)

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Frequency of each distinct value in the 'University Rating' column.
data['University Rating'].value_counts()

In [None]:
# Frequency of each distinct value in the 'Research' column.
data['Research'].value_counts()

In [None]:
# Remove the 'Serial No.' column from the frame.
data = data.drop(columns=['Serial No.'])

In [None]:
# Count the rows that exactly duplicate an earlier row.
data.duplicated().sum()

In [None]:
# Rename columns for clarity.
# The assignment is positional: the list must contain one label per existing
# column, in the current column order.
data.columns = [
    'GRE',
    'TOEFL',
    'University_Rating',
    'SOP',
    'LOR',
    'CGPA',
    'Research',
    'Chance_of_Admit',
]

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
data.info()

In [None]:
# Print summary statistics for the columns of the frame.
print(data.describe())

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from scipy.stats import gaussian_kde

# 📊 1. Histogram of GRE Scores

In [None]:
# Histogram of GRE scores in 20 bins; update_layout returns the same figure,
# so the layout is applied in the same expression that builds it.
fig = px.histogram(
    data,
    x='GRE',
    nbins=20,
    title='Distribution of GRE Scores',
).update_layout(
    bargap=0.1,
    template='plotly_dark',
    yaxis_title='Number of Students',
)
fig.show(renderer='iframe')

### 📌 **Inference**: Most applicants have GRE scores between 320 and 324, indicating a competitive applicant pool.

# 📈 2. KDE Plot of CGPA

In [None]:
# Skewness of the CGPA column (a value near 0 suggests a roughly symmetric distribution).
data['CGPA'].skew()

In [None]:
# Density-normalised CGPA histogram (with a violin marginal) overlaid with a
# Gaussian kernel density estimate.
fig = px.histogram(data, x='CGPA', nbins=20, marginal="violin", histnorm='probability density', title='CGPA Distribution')

# Fit the KDE with scipy and evaluate it on 100 evenly spaced points that
# span the observed CGPA range.
x = data['CGPA']
kde = gaussian_kde(x)
x_range = np.linspace(x.min(), x.max(), 100)
kde_values = kde(x_range)

# Add KDE to the same figure
fig.add_trace(go.Scatter(x=x_range, y=kde_values, mode='lines', name='KDE'))

# histnorm='probability density' puts a density on the y-axis, not a count of
# students, so the axis is labelled accordingly.
fig.update_layout(bargap=0.1, template='plotly_dark', yaxis_title='Probability Density')
fig.show(renderer='iframe')


### 📌 **Inference:** CGPA is very slightly left-skewed, i.e., almost symmetrical, with most applicants scoring above 8.5.

# 📊 3. Box Plot for TOEFL

In [None]:
# Box plot of TOEFL scores, rendered with the dark template.
fig = px.box(
    data,
    y="TOEFL",
    title="Box Plot for TOEFL Scores",
).update_layout(template='plotly_dark')
fig.show(renderer='iframe')



### 📌 **Inference:** TOEFL scores are fairly consistent among applicants.

# 📊 4. Bar Plot - University Rating Count

In [None]:
# Count applicants per university rating, ordered by rating.
# The index and the value column are named explicitly so the result always has
# the columns ['University_Rating', 'count']; a bare reset_index() produces
# ['index', 'University_Rating'] on pandas < 2.0, which has no 'count' column.
Rating_matrix = (
    data['University_Rating']
    .value_counts()
    .sort_index()
    .rename_axis('University_Rating')
    .reset_index(name='count')
)
Rating_matrix

In [None]:
# Bar chart of the applicant count for each university rating.
fig = px.bar(
    Rating_matrix,
    x='University_Rating',
    y='count',
    title='University Rating Distribution',
).update_layout(
    template='plotly_dark',
    yaxis_title='Number of applicants',
)
fig.show(renderer='iframe')



### 📌 **Inference:** Most applicants applied to universities with a rating of 3. It's unusual that universities rated 2 received more applications than those rated 4 or 5.

# 📉 5. GRE vs Chance of Admit

In [None]:
# GRE against admission chance, coloured by university rating, with an OLS
# trendline.
fig = px.scatter(
    data,
    x='GRE',
    y='Chance_of_Admit',
    color='University_Rating',
    trendline='ols',
    title='GRE vs Chance of Admit (Colored by University Rating)',
).update_layout(template='plotly_dark')
fig.show(renderer='iframe')

### 📌 **Inference:** Higher GRE scores tend to correlate with a higher chance of admission, especially in top-rated universities.

# 📉 6. CGPA vs Chance of Admit

In [None]:
# CGPA against admission chance, coloured by the Research column, with an OLS
# trendline.
fig = px.scatter(
    data,
    x='CGPA',
    y='Chance_of_Admit',
    color='Research',
    trendline='ols',
    title='CGPA vs Chance of Admit (Colored by Research)',
).update_layout(template='plotly_dark')
fig.show(renderer='iframe')

### 📌 **Inference:** Research experience boosts admission chances at similar CGPA levels.

# 📊 7. Box Plot - Research vs Chance of Admit

In [None]:
# Distribution of admission chance for each value of the Research column.
fig = px.box(
    data,
    x="Research",
    y="Chance_of_Admit",
    title="Chance of Admit by Research Experience",
).update_layout(template='plotly_dark')
fig.show(renderer='iframe')


### 📌 **Inference:** Applicants with research experience (1) have higher median chances of admission.

# 📊 8. Heatmap - Correlation Matrix

In [None]:
# Pairwise correlations between the columns, rounded to two decimals.
corr = data.corr().round(2)

# Build the heatmap trace first, then wrap it in a figure; each cell is
# annotated with its own correlation value.
heatmap = go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    colorscale='Cividis',
    text=corr.values,
    texttemplate="%{text}",
)
fig = go.Figure(data=heatmap)
fig.update_layout(title='Correlation Heatmap', template='plotly_dark')
fig.show(renderer='iframe')

### 📌 **Inference:** CGPA, TOEFL and GRE have the strongest correlation with Chance_of_Admit.

# 📈 9. 3D Scatter Plot - GRE, CGPA, TOEFL vs Chance

In [None]:
# 3D scatter of the three test/grade scores, coloured by admission chance.
axes = dict(x='GRE', y='CGPA', z='TOEFL')
fig = px.scatter_3d(
    data,
    color='Chance_of_Admit',
    title="3D Scatter: GRE, CGPA, TOEFL vs Chance of Admit",
    **axes,
)
fig.show(renderer='iframe')


### 📌 **Inference:** High GRE + CGPA + TOEFL lead to higher admission chances (yellow zone).

# 📊 10. SOP & LOR vs Chance of Admit

In [None]:
# SOP against admission chance; LOR drives both the marker size and colour.
lor_encoding = dict(size='LOR', color='LOR')
fig = px.scatter(
    data,
    x='SOP',
    y='Chance_of_Admit',
    title='SOP vs Chance of Admit (Bubble Size = LOR Strength)',
    **lor_encoding,
)
fig.show(renderer='iframe')

### 📌 **Inference:** Strong LOR combined with high SOP enhances admission chances.

# **<i>Let's do Prediction now!!</i>**<br>
## We'll Predict chances of Admission.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Preview the first five rows of the frame.
data.head(5)

In [None]:
# Target: the 'Chance_of_Admit' column.
Y = data.loc[:, 'Chance_of_Admit']


In [None]:
# Features: the columns at positions 0 through 6, selected by position.
feature_positions = list(range(7))
X = data.iloc[:, feature_positions]

In [None]:
# Display the target series.
Y

In [None]:
# Display the feature frame.
X

In [None]:

# Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [None]:
# Feature Scaling
# Fit the scaler on the training split only, then apply those same statistics
# to both splits so nothing is learned from the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



 
<div style="background-color: #f0f0f0; 
            padding: 20px; 
            font: italic 25px Georgia; 
            color: #333; 
            border-left: 10px solid gray;">
Identify the algorithms with the highest R² score and determine the best one.
</div>


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV


In [None]:

# Define all regression models.
# Estimators that use randomness (trees and tree ensembles) get a fixed seed,
# matching the seeded train/test split, so the ranking is the same on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Bagging': BaggingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Function to evaluate all regression models
def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit every estimator in the module-level ``models`` dict and score it.

    Args:
        X_train: Training features passed to each estimator's ``fit``.
        X_test: Held-out features passed to each estimator's ``predict``.
        y_train: Training targets.
        y_test: Held-out targets that the predictions are scored against.

    Returns:
        A list of ``(name, r2, rmse)`` tuples sorted by R² in descending
        order (best model first).
    """
    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # The ``squared=False`` argument was deprecated in scikit-learn 1.4
        # and later removed; taking the square root of the MSE gives the same
        # RMSE on every version.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        results.append((name, r2, rmse))

    # Sort by R² score (higher is better)
    results.sort(key=lambda x: x[1], reverse=True)
    return results


In [None]:
# Score every candidate model on the scaled train/test splits.
results = evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test)

# Report each model on its own line, in the order evaluate_models returned them.
print("Model Performance: \n")
for entry in results:
    print("{}: R² = {:.8f}, RMSE = {:.4f}".format(*entry))


## Linear Regression gives best R^2 score

### Let's tune this more so that R^2 score increases

In [None]:
# Linear regression with coefficients constrained to be non-negative.
# copy_X is left at its default (True): with copy_X=False the estimator is
# allowed to overwrite X_train_scaled in place, and later cells reuse that array.
lr = LinearRegression(fit_intercept=True, positive=True)
lr.fit(X_train_scaled, y_train)
y_pred1 = lr.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred1)
# RMSE as sqrt(MSE): the ``squared=False`` argument was deprecated in
# scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_test, y_pred1))


In [None]:
# Display the most recently computed R² score.
r2

In [None]:
# Display the most recently computed RMSE.
rmse

#### Tuning the Linear Regression algorithm does not affect model performance much

### Let's tune Ridge Regression more so that R^2 score increases

In [None]:
# Compare Ridge solvers at a fixed regularisation strength (alpha=0.01).
for solver in ['auto','svd', 'cholesky', 'lsqr', 'sag', 'saga']:
    # random_state seeds the data shuffling done by the stochastic 'sag' and
    # 'saga' solvers, so their scores are reproducible and comparable.
    ridge = Ridge(alpha=0.01, solver=solver, max_iter=10000, random_state=42)
    ridge.fit(X_train_scaled, y_train)
    y_pred1 = ridge.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred1)
    # RMSE as sqrt(MSE): the ``squared=False`` argument was deprecated in
    # scikit-learn 1.4 and later removed.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred1))
    print(solver,"   r2=",r2,"rmse=",rmse)

In [None]:
# Ridge with non-negative coefficients, fitted with the 'lbfgs' solver.
ridge = Ridge(alpha=0.01, solver='lbfgs', positive=True)
ridge.fit(X_train_scaled, y_train)
y_pred12 = ridge.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred12)
# RMSE as sqrt(MSE): the ``squared=False`` argument was deprecated in
# scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_test, y_pred12))
# Label the line with this estimator's own solver; a leftover loop variable
# named ``solver`` from an earlier cell would mislabel this result.
print(ridge.solver,"   r2=",r2,"rmse=",rmse)

In [None]:
## Clearly, the saga solver yields a higher R² score than the other solvers.

### Let's tune Random Forest more so that R^2 score increases

In [None]:

# Define model.
# A fixed seed makes the forest, and therefore the grid-search winner and the
# final test score, reproducible from run to run.
rf = RandomForestRegressor(random_state=42)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200,250],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Setup GridSearchCV: 5-fold cross-validation scored by R², using all CPU cores
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit on training data
grid_search.fit(X_train_scaled, y_train)

# Best model prediction
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_scaled)

# Calculate R² score on the held-out test split
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

### From the above, Ridge with the saga solver appears to be the best of all the algorithms tried

# Upvote if you found it useful!!