In [0]:
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import accuracy_score, balanced_accuracy_score

%matplotlib inline

from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

In [0]:
data = pd.read_csv("/Admission_Predict.csv")

In [31]:
data.dtypes

Serial No.             int64
GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit      float64
dtype: object

In [32]:
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [0]:
df = data.copy()

In [34]:
df.shape

(400, 9)

In [35]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,115.614301,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,100.75,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,200.5,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,300.25,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [36]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [0]:
X = df.drop("Research", axis = 1)
y = df["Research"]

In [38]:
print(X.head(), "\n")
print(y.head())

   Serial No.  GRE Score  TOEFL Score  ...  LOR   CGPA  Chance of Admit 
0           1        337          118  ...   4.5  9.65              0.92
1           2        324          107  ...   4.5  8.87              0.76
2           3        316          104  ...   3.5  8.00              0.72
3           4        322          110  ...   2.5  8.67              0.80
4           5        314          103  ...   3.0  8.21              0.65

[5 rows x 8 columns] 

0    1
1    1
2    1
3    1
4    0
Name: Research, dtype: int64


In [39]:
print(X.shape)
print(y.shape)

(400, 8)
(400,)


In [40]:
columns = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)
pd.DataFrame(X, columns=columns).describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,6.383782e-17,-3.785861e-16,5.412337e-16,7.147061e-16,1.859624e-16,-3.019807e-16,8.076873e-16,-3.314016e-16
std,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252
min,-1.727726,-2.339367,-2.542098,-1.827457,-2.386613,-2.733036,-3.020504,-2.6985
25%,-0.863863,-0.76859,-0.727492,-0.9520286,-0.8949798,-0.5042604,-0.7201909,-0.5922168
50%,0.0,0.01679859,-0.06763531,-0.07660001,0.0994422,0.05293342,0.01859559,0.03966834
75%,0.863863,0.7149218,0.7571856,0.7988286,0.5966532,0.6101273,0.7783704,0.7417629
max,1.727726,2.023903,2.076899,1.674257,1.591075,1.724515,2.218165,1.724695


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 387235673)

In [42]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(300, 8)
(100, 8)
(300,)
(100,)


In [0]:
def test_model(model):
    print("mean_absolute_error: ", mean_absolute_error(y_test, model.predict(X_test)))
    print("median_absolute_error: ", median_absolute_error(y_test, model.predict(X_test)))
    print("accuracy: ", accuracy_score(y_test, model.predict(X_test).round()))
    print("balanced_accuracy: ", balanced_accuracy_score(y_test, model.predict(X_test).round()))    

In [44]:
gr_100=GradientBoostingRegressor(n_estimators=100)
gr_100.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [45]:
test_model(gr_100)

mean_absolute_error:  0.31569576350996903
median_absolute_error:  0.21688105365467764
accuracy:  0.75
balanced_accuracy:  0.7521075873143316


In [46]:
ran_100 = RandomForestRegressor(n_estimators=100)
ran_100.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [47]:
test_model(ran_100)

mean_absolute_error:  0.297
median_absolute_error:  0.205
accuracy:  0.72
balanced_accuracy:  0.7213970293054998
