# 1- Install Dependencies & Run a SparkSession

In [1]:
! pip install pyspark



In [2]:
# Start a SparkSession named 'data_processing', or reuse one that already exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [4]:
# Echo the SparkSession object so the notebook displays it as the cell output.
spark

# 2- Clone & Explore Dataset

In [None]:
# clone the Dataset
! git clone https://github.com/education454/admission_dataset

In [10]:
# Check the dataset: list the cloned directory to confirm the CSV file is there.
! ls admission_dataset

Admission_Predict_Ver1.1.csv


In [11]:
# Create a Spark DataFrame from the cloned CSV file.
#   header=True      -> the first line of the file holds the column names
#   inferSchema=True -> Spark infers the type of each column from the data
# The path is relative to the working directory, i.e. the same location the
# clone and `ls` cells above use, instead of a hard-coded absolute path.
csv_path = 'admission_dataset/Admission_Predict_Ver1.1.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [13]:
# Preview the first rows of the DataFrame as a text table.
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [16]:
# Report the size of the DataFrame: number of rows and number of columns.
n_rows = df.count()
n_cols = len(df.columns)
print(f"Totel number of rows {n_rows}\nTotel number of columns {n_cols}")

Totel number of rows 500
Totel number of columns 9


In [17]:
# Print the schema: each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [18]:
# Get the summary statistics (count, mean, stddev, ...) for every column.

df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# 3- Data Cleaning

In [19]:
# Drop the unnecessary column.
# NOTE(review): 'Serial No' looks like a plain row index (1, 2, 3, ...) rather
# than a predictor; confirm against the dataset description.
df = df.drop('Serial No')

In [23]:
# Count the rows with a missing (null) value in each column.
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull())
    print(column_name, null_rows.count())

GRE Score 0
TOEFL Score 0
University Rating 0
SOP 0
LOR 0
CGPA 0
Research 0
Chance of Admit 0


# 4- Correlation Analysis & Feature Selection

Our target (output) variable is `Chance of Admit`.

In [24]:
# Preview the DataFrame again before analysing correlations with the target.
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [25]:
# Correlation analysis: correlation of every column with the target column.
# The target itself is part of df.columns, hence the trailing 1.0 in the output.
target = 'Chance of Admit'
for column_name in df.columns:
    corr_value = df.stat.corr(target, column_name)
    print(f"The correlation to chance of admit for {column_name} is {corr_value}")

The correlation to chance of admit for GRE Score is 0.8103506354632598
The correlation to chance of admit for TOEFL Score is 0.7922276143050823
The correlation to chance of admit for University Rating is 0.6901323687886892
The correlation to chance of admit for SOP is 0.6841365241316723
The correlation to chance of admit for LOR is 0.6453645135280112
The correlation to chance of admit for CGPA is 0.882412574904574
The correlation to chance of admit for Research is 0.5458710294711379
The correlation to chance of admit for Chance of Admit is 1.0


We can see that the features with the highest correlation to `Chance of Admit` are:


1.   CGPA
2.   GRE Score
3.   TOEFL Score

We can also see that `Research` has the weakest correlation with `Chance of Admit`.


In [27]:
# Feature selection: merge the three chosen columns into a single vector
# column named 'features'.
from pyspark.ml.feature import VectorAssembler

feature_columns = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
print(assembler)

VectorAssembler_6f180a74cdbe


In [29]:
# Apply the assembler: the result keeps every original column and appends the
# 'features' vector column. Then display the DataFrame.
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# 5- Build the Linear Regression Model

In [30]:
# Import LinearRegression and create the final data: only the feature vector
# and the label column ('Chance of Admit') are kept for modelling.
from pyspark.ml.regression import LinearRegression
final_data = output_data.select('features', 'Chance of Admit')

In [31]:
# Print the schema of the final data (feature vector + label).
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [32]:
# Split the data into training (~70%) and testing (~30%) sets.
# The fixed seed keeps the split, and therefore every metric computed from it,
# the same from one run of the notebook to the next.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [33]:
# Build the estimator (`models`), then train it on the training split to get
# the fitted model (`model`).
models = LinearRegression(
    labelCol='Chance of Admit',
    featuresCol='features',
)
model = models.fit(train)

In [35]:
# Show the fitted parameters. The coefficients follow the order of the
# assembler's input columns.
print(
    f"coefficients: {model.coefficients}",
    f"intercept: {model.intercept}",
    sep="\n",
)

coefficients: [0.0022615727113056987,0.0036704569704190705,0.13896874037397614]
intercept: -1.5791628235113881


In [37]:
# Get the summary of the fitted model (metrics are read from it in the next cell).
summary = model.summary


In [38]:
# Report the fit-quality metrics taken from the model summary.
print(f"RMSE: {summary.rootMeanSquaredError}")  # smaller is better
print(f"r2: {summary.r2}")  # higher is better

RMSE: 0.06072414719789489
r2: 0.813148337929058


# 6- Evaluate & Save the model

In [39]:
# Apply the fitted model to the test data; the result gains a 'prediction' column.
predictions = model.transform(test)

In [40]:
# Display the predictions next to the features and the true 'Chance of Admit'.
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5234684819488451|
| [294.0,93.0,7.36]|           0.46| 0.4499019810139253|
| [295.0,99.0,7.65]|           0.57| 0.5144872302561985|
| [296.0,95.0,7.54]|           0.44| 0.4867804136446905|
|[296.0,101.0,7.68]|            0.6| 0.5282587791195614|
| [297.0,96.0,7.43]|           0.34|0.47742588188527746|
| [297.0,96.0,7.89]|           0.43| 0.5413515024573066|
| [297.0,98.0,7.67]|           0.59| 0.5181192935158703|
|[297.0,101.0,7.67]|           0.57| 0.5291306644271274|
| [298.0,97.0,7.21]|           0.45| 0.4527847886847276|
|  [298.0,98.0,7.5]|           0.44|0.49675618036360003|
| [298.0,98.0,8.03]|           0.34| 0.5704096127618072|
| [298.0,99.0,7.46]|           0.53| 0.4948678877190602|
| [299.0,96.0,7.86]|           0.54| 0.5417055856686992|
| [299.0,97.0,7.66]|           

In [41]:
# Evaluate the model on the test predictions using the r2 metric.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol='Chance of Admit',
    predictionCol='prediction',
    metricName='r2',
)
evaluator.evaluate(predictions)

0.7864499772914

In [42]:
# Save the model to disk.
# A plain model.save(path) raises an error when the path already exists, which
# breaks a second run of the notebook; write().overwrite() replaces the
# previously saved copy instead.
model.write().overwrite().save('Admission Prediction model')

In [43]:
# Load the model back from disk. This rebinds `model` to the loaded
# LinearRegressionModel.
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('Admission Prediction model')


In [46]:
# Use the model: score the test set and display the predictions.
model.transform(test).show()


+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|  [293.0,97.0,7.8]|           0.64| 0.5234684819488451|
| [294.0,93.0,7.36]|           0.46| 0.4499019810139253|
| [295.0,99.0,7.65]|           0.57| 0.5144872302561985|
| [296.0,95.0,7.54]|           0.44| 0.4867804136446905|
|[296.0,101.0,7.68]|            0.6| 0.5282587791195614|
| [297.0,96.0,7.43]|           0.34|0.47742588188527746|
| [297.0,96.0,7.89]|           0.43| 0.5413515024573066|
| [297.0,98.0,7.67]|           0.59| 0.5181192935158703|
|[297.0,101.0,7.67]|           0.57| 0.5291306644271274|
| [298.0,97.0,7.21]|           0.45| 0.4527847886847276|
|  [298.0,98.0,7.5]|           0.44|0.49675618036360003|
| [298.0,98.0,8.03]|           0.34| 0.5704096127618072|
| [298.0,99.0,7.46]|           0.53| 0.4948678877190602|
| [299.0,96.0,7.86]|           0.54| 0.5417055856686992|
| [299.0,97.0,7.66]|           