In [1]:
#Admission Prediction Problem (Regression Problem)

import pandas as pd

df=pd.read_csv(r'D:\BigData_All_content\csv\Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [2]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,115.614301,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,100.75,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,200.5,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,300.25,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [3]:
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR', 'CGPA', 'Research', 'Chance of Admit'],
      dtype='object')

In [4]:
df.dtypes

Serial No.             int64
GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit      float64
dtype: object

In [5]:
#Deleting 'Serial No.' because it does not play any significant role in ML

del df['Serial No.']
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [6]:
#Preparing independent/input features/variables

X=df.drop('Chance of Admit', axis=1)
X.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.0,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0


In [7]:
#Preparing dependent/outcome/output feature/varialbe/column

y=df['Chance of Admit']
y.head()


0    0.92
1    0.76
2    0.72
3    0.80
4    0.65
Name: Chance of Admit, dtype: float64

In [8]:
#Checking missing values in X

X.isnull().sum()

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
dtype: int64

In [9]:
X.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.0,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0


In [10]:
#Feature Scaling of numerical features using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
mx=MinMaxScaler(feature_range=(0,1))

#Scaling all features/columns of X

X[X.columns]=mx.fit_transform(X[X.columns])
X.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,0.94,0.928571,0.75,0.875,0.875,0.913462,1.0
1,0.68,0.535714,0.75,0.75,0.875,0.663462,1.0
2,0.52,0.428571,0.5,0.5,0.625,0.384615,1.0
3,0.64,0.642857,0.5,0.625,0.375,0.599359,1.0
4,0.48,0.392857,0.25,0.25,0.5,0.451923,0.0


In [11]:
#splitting the data into training and testing set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X, y,test_size=0.2)

#since, test_size=0.2. So 80% of the data will be initialized to training set and 20% of the data
#will be initialized to testing set.

#The default value of test_size is 0.25

In [12]:
print(X.shape)
print(y.shape)

(400, 7)
(400,)


In [13]:
#X_train= 80% data of X =320
#X_test= 20% data of X= 80

#y_train= 80% data of y= 320 rows
#y_test= 20% data of y= 80 rows

In [14]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)


(320, 7)
(80, 7)
(320,)
(80,)


In [15]:
X_train.head(10)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
294,0.52,0.321429,0.25,0.375,0.25,0.487179,1.0
69,0.76,0.821429,0.75,0.875,0.75,0.75641,1.0
100,0.64,0.535714,0.5,0.625,0.625,0.532051,1.0
329,0.14,0.142857,0.25,0.375,0.125,0.349359,0.0
199,0.46,0.535714,0.5,0.75,0.875,0.605769,0.0
158,0.32,0.5,0.25,0.25,0.375,0.429487,0.0
262,0.36,0.392857,0.25,0.375,0.75,0.5,1.0
75,0.78,0.785714,0.25,0.25,0.75,0.564103,1.0
10,0.7,0.5,0.5,0.625,0.75,0.512821,1.0
247,0.42,0.428571,0.25,0.375,0.625,0.538462,0.0


In [16]:
X_test.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
245,0.76,0.642857,0.75,0.75,0.375,0.711538,1.0
128,0.72,0.714286,0.5,0.625,0.5,0.737179,1.0
288,0.48,0.428571,0.75,1.0,1.0,0.711538,0.0
130,0.98,0.785714,1.0,0.75,0.875,0.948718,1.0
254,0.62,0.785714,0.75,0.75,1.0,0.74359,0.0


In [17]:
y_train.head()

294    0.61
69     0.78
100    0.71
329    0.43
199    0.72
Name: Chance of Admit, dtype: float64

In [18]:
y_test.head()

245    0.81
128    0.84
288    0.82
130    0.96
254    0.85
Name: Chance of Admit, dtype: float64

In [19]:
#Now, our data is ready for machine learning

#We can train our model to solve this regression problem

#Applying LinearRegression algo/model on training data i.e. X_train and y_train

from sklearn.linear_model import LinearRegression

#Creating object of LinearRegression

LR=LinearRegression()

In [20]:
#Training the LinearRegression algo using training data ie X_train, y_train

LR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
#Testing the LinearRegression model/algo using testing data i.e. X_test, y_test

LR.score(X_test, y_test)

0.8436576324370326

In [22]:
#Here, score is 0.7725 or ~77.25%

#It means that the LinearRegression predicts the value of y_test on X_test with 77% of accuracy

#Working of score()

#In score(), first predict() is called to predict the outcome of y_test on the basis of X_test

#Then, It compares the predicted outcome with the actual outcome.

"""
Actual outcome        Predicted outcome 
y_test                  y_pred

0.6                 0.6
0.5                 0.5
0.7                 0.9
0.8                 0.8

We can say that actual outcome and predicted outcome of the algorithm is matching at 03 places.
So accuracy is 3/4: 0.75 or 75%
"""

'\nActual outcome        Predicted outcome \ny_test                  y_pred\n\n0.6                 0.6\n0.5                 0.5\n0.7                 0.9\n0.8                 0.8\n\nWe can say that actual outcome and predicted outcome of the algorithm is matching at 03 places.\nSo accuracy is 3/4: 0.75 or 75%\n'

In [24]:
#Applying RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor

rfr=RandomForestRegressor()

rfr.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [25]:
#Applying DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

dtr=DecisionTreeRegressor()
dtr.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [27]:
print('Score of LinearRegresson:', LR.score(X_test, y_test))
print('Score of RandomForestRegssor:', rfr.score(X_test, y_test))
print('Score of DecisionTreeRegressor:', dtr.score(X_test, y_test))

Score of LinearRegresson: 0.8436576324370326
Score of RandomForestRegssor: 0.7985640337084607
Score of DecisionTreeRegressor: 0.6656714308428882


In [28]:
#The score of LinearRegression is highest ie ~84%

#So, on the basis of score, we can select LinearRegression for implementation

In [29]:
#Implementing the LinearRegression algo on new data

X.dtypes

GRE Score            float64
TOEFL Score          float64
University Rating    float64
SOP                  float64
LOR                  float64
CGPA                 float64
Research             float64
dtype: object

In [30]:
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [31]:
#Inputting the values for prediction using LinearRegression

gre=float(input('Enter GRE Score between 290 to 340:'))
toefl=float(input('Enter TOEFL Score b/w 92 to 120='))
rating=float(input("Enter University Rating between 1 and 5="))
sop=float(input('Enter SOP between 1 and 5='))
lor=float(input('Enter LOR betw 1 and 5='))
cgpa=float(input('Enter CGPA bewtween 6.8 to 9.92='))
research=float(input('Enter Research 0- for NO and 1-for Yes'))

Enter GRE Score between 290 to 340:325
Enter TOEFL Score b/w 92 to 120=110
Enter University Rating between 1 and 5=4
Enter SOP between 1 and 5=5
Enter LOR betw 1 and 5=4
Enter CGPA bewtween 6.8 to 9.92=8.3
Enter Research 0- for NO and 1-for Yes1


In [32]:
type(X_train)

pandas.core.frame.DataFrame

In [33]:
X_train.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research'],
      dtype='object')

In [34]:
#Now, Prepare the new data as a dataframe because we have given the training data in fit() as a dataframe
#Order and name of the columns in new dataframe must match with the training data

#Trasforming the new data as a dataframe

#Arrange all the individual input variables in a list

value=[[gre, toefl, rating, sop, lor, cgpa, research]]

newdf=pd.DataFrame(value, columns=['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research'])


In [35]:
newdf

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,325.0,110.0,4.0,5.0,4.0,8.3,1.0


In [37]:
X_train.head(1)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
294,0.52,0.321429,0.25,0.375,0.25,0.487179,1.0


In [38]:
#Next, Scale the new data using MinMaxScaler because we have scaled the training data using
#MinMaxScaler

#Don't fit the new data in MinMaxScaler. Just transform the new data using object of MinMaxScaler

newdf[newdf.columns]=mx.transform(newdf[newdf.columns])
newdf

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,0.7,0.642857,0.75,1.0,0.75,0.480769,1.0


In [39]:
#Now, your data is ready for prediction

#Predicting the value of 'Chance of Admit' on this new data using LinearRegression alog/model

new_pr=LR.predict(newdf)
new_pr

array([0.73016158])

In [44]:
print('Your Chance of Admit is:', new_pr[0])

Your Chance of Admit is: 0.730161580403978


In [45]:
import numpy as np
print('Your Chance of Admit is:', np.around(new_pr[0],2)*100, "%")

Your Chance of Admit is: 73.0 %
