In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Raw admissions table, read once and kept untouched; later cells work on a copy.
data_path = './datasets/admission_prediction.csv'
admission_dataset = pd.read_csv(data_path)

In [6]:
# Working copy of the raw table without the 'Serial No.' column.
# Built as a single expression so re-running this cell always starts from
# the untouched `admission_dataset`.
admission_df = (
    admission_dataset
    .copy()
    .drop(columns=['Serial No.'])
)

In [7]:
# Summary statistics per column; the `count` row also reveals which columns
# have missing values (counts below the total row count).
admission_df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,485.0,490.0,485.0,500.0,500.0,500.0,500.0,500.0
mean,316.558763,107.187755,3.121649,3.374,3.484,8.57644,0.56,0.72174
std,11.274704,6.112899,1.14616,0.991004,0.92545,0.604813,0.496884,0.14114
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [8]:
# Number of missing values in each column before imputation.
admission_df.isna().sum()

GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [9]:
# Impute missing values: mean for the two score columns, most frequent value
# for 'University Rating'.
#
# A single DataFrame-level fillna is used instead of
# `admission_df[col].fillna(..., inplace=True)`: calling an inplace method on
# a column selection is chained assignment, which emits a FutureWarning on
# pandas 2.x and does not modify the frame at all under Copy-on-Write.
#
# NOTE(review): the fill values are computed over all rows, before the
# train/test split further down; consider deriving them from training rows only.
fill_values = {
    'GRE Score': admission_df['GRE Score'].mean(),
    'TOEFL Score': admission_df['TOEFL Score'].mean(),
    'University Rating': admission_df['University Rating'].mode()[0],
}
admission_df = admission_df.fillna(value=fill_values)

In [10]:
# Re-count missing values per column to confirm the imputation left none.
admission_df.isna().sum()

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [11]:
# Separate the regression target from the predictor columns.
target_col = 'Chance of Admit'
y = admission_df[target_col]
x = admission_df.drop(columns=[target_col])

In [12]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x)
scaled_df = pd.DataFrame(scaled_data, columns=x.columns)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.2, random_state=42)

In [15]:
# Baseline: ordinary least squares on the scaled training features.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [19]:
def adjusted_r2(r2, n, p):
    """Return the adjusted R^2 for a fit with ``n`` samples and ``p`` predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination of the fit.
    n : int
        Number of samples the score was computed on.
    p : int
        Number of predictors (features) in the model.

    Returns
    -------
    float
        The adjusted coefficient of determination.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``: the correction term would divide by zero or
        flip sign, so the result would be undefined or meaningless.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R^2 requires n > p + 1 (got n={n}, p={p})"
        )
    return 1 - (1 - r2) * (n - 1) / residual_dof

# Adjusted R^2 of the linear model on the training split.
n_train_rows, n_train_features = x_train.shape
train_r2 = model.score(x_train, y_train)
adjusted_r2(train_r2, n_train_rows, n_train_features)

0.8159905863706703

In [20]:
# Show the fitted parameters of the linear model, one labelled line each.
fitted_params = (
    ('Model Coefficients:', model.coef_),
    ('Model Intercept:', model.intercept_),
)
for label, value in fitted_params:
    print(label, value)

Model Coefficients: [0.02524921 0.01775021 0.00266344 0.00191069 0.01556871 0.07016525
 0.0129387 ]
Model Intercept: 0.7229987433222765


In [30]:
# Cross-validated Lasso: 10 folds, fixed seed, and a very high iteration cap.
lasso_cv_params = {
    'cv': 10,
    'random_state': 42,
    'max_iter': 2_000_000,
}
lasso_cv_model = LassoCV(**lasso_cv_params)
lasso_cv_model.fit(x_train, y_train)

In [31]:
# Regularisation strength chosen by the cross-validated search.
lasso_cv_model.alpha_

0.0003254627497629299

In [34]:
# Fit a plain Lasso on the training split using the alpha found by LassoCV.
best_alpha = lasso_cv_model.alpha_
lasso_model = Lasso(alpha=best_alpha)
lasso_model.fit(X=x_train, y=y_train)

In [35]:
# Score the Lasso model on the held-out test split
# (for scikit-learn regressors, score() reports R^2).
lasso_model.score(x_test, y_test)

0.8210636270030428