# TASK 1 : Install Dependencies & Run a SparkSession


In [2]:
#install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 48.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=a2e394abcdbebb5af6b6b355f23d27ae0130af75215910ca4ac0b94e12f38341
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [4]:
# Start a Spark session for the application named "Spark"; getOrCreate()
# hands back the already-running session when there is one.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Spark")
spark = session_builder.getOrCreate()

# TASK 2 : Clone & Explore dataset

In [5]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


In [10]:
# Confirm the dataset was downloaded by listing the cloned repository directory.
!ls /content/admission_dataset

Admission_Predict_Ver1.1.csv


In [11]:
# Load the admissions CSV into a Spark DataFrame: the first line supplies the
# column names and Spark infers each column's type from the data.
df = spark.read.csv(
    '/content/admission_dataset/Admission_Predict_Ver1.1.csv',
    header=True,
    inferSchema=True,
)

In [12]:
# Print the leading rows of the loaded DataFrame as a text table.
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [16]:
# Report the DataFrame's dimensions: number of rows, then number of columns.
row_count = df.count()
column_count = len(df.columns)
print(row_count, column_count)

500 9


In [19]:
# Print the column names, inferred types and nullability of the DataFrame.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [21]:
# Compute the per-column summary statistics and print them as a table.
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [22]:
# Remove the 'Serial No' column, which is not used in the analysis below.
# NOTE(review): df is rebound to the reduced frame, so any earlier cell that is
# re-run after this one will see the DataFrame without 'Serial No'.
df = df.drop('Serial No')

In [23]:
# Print the leading rows again to confirm 'Serial No' is gone.
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [24]:
# For every column, count the rows whose value is null and print
# "<column> : <count>".
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull())
    print(column_name, ":", null_rows.count())


GRE Score : 0
TOEFL Score : 0
University Rating : 0
SOP : 0
LOR : 0
CGPA : 0
Research : 0
Chance of Admit : 0


# TASK 4 : Correlation Analysis & Feature Selection

In [26]:
# Correlation of every column with the target 'Chance of Admit'.
# The target itself is part of df.columns, so its own row (1.0) is printed too.
target_column = 'Chance of Admit'
for feature_name in df.columns:
    correlation = df.stat.corr(target_column, feature_name)
    print(feature_name, correlation)

GRE Score 0.8103506354632598
TOEFL Score 0.7922276143050823
University Rating 0.6901323687886892
SOP 0.6841365241316723
LOR 0.6453645135280112
CGPA 0.882412574904574
Research 0.5458710294711379
Chance of Admit 1.0


In [51]:
# Feature selection: keep the three columns with the highest correlation to the
# target in the analysis above and pack them into one vector column 'features'.
from pyspark.ml.feature import VectorAssembler

selected_features = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(inputCols=selected_features, outputCol='features')

In [52]:
# Apply the assembler to df, which appends the 'features' vector column, and
# print the leading rows of the result.
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [53]:
# Import the estimator and reduce the assembled frame to the two columns the
# regression needs: the feature vector and the label.
from pyspark.ml.regression import LinearRegression

model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(*model_columns)

In [54]:
# Print the schema of the modelling frame (feature vector + label).
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [55]:
# Split the data into a training set and a test set with weights 0.7 / 0.3.
# A fixed seed makes the split, and therefore every coefficient and metric
# computed from it below, repeatable across kernel restarts.
train,test = final_data.randomSplit([0.7,0.3], seed=42)

In [56]:
# Configure the linear-regression estimator (feature vector in 'features',
# label in 'Chance of Admit') and fit it on the training split.
models = LinearRegression(
    featuresCol='features',
    labelCol='Chance of Admit',
)
model = models.fit(train)

In [57]:
# Print the fitted coefficients (one per input feature) and, on the next line,
# the intercept.
print(model.coefficients, model.intercept, sep='\n')

[0.00264082007954714,0.00269172595202036,0.14838153932115072]
-1.6781063124086217


In [58]:
# Root-mean-squared error taken from the fitted model's summary.
model.summary.rootMeanSquaredError

0.06529634838853764

In [59]:
# R² score taken from the fitted model's summary (the RMSE is shown in the previous cell).
model.summary.r2

0.802163770152631

# TASK 6 : Evaluate & Save the Model

In [60]:
# Apply the fitted model to the test split; the result carries a 'prediction' column.
predictions = model.transform(test)

In [62]:
# Print the leading rows of the predictions next to the true 'Chance of Admit' values.
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [294.0,93.0,7.36]|           0.46| 0.4407134339198002|
| [295.0,96.0,7.34]|           0.47|0.44846180106898537|
|[295.0,101.0,7.86]|           0.69| 0.5390788312760857|
| [296.0,99.0,8.03]|           0.61| 0.5615610611361874|
|[296.0,101.0,7.68]|            0.6| 0.5150109742778257|
| [297.0,100.0,7.9]|           0.52| 0.5476040070560058|
| [298.0,97.0,7.21]|           0.45|0.43978638714789753|
|  [298.0,98.0,7.5]|           0.44|0.48550875950305183|
|[298.0,101.0,7.69]|           0.53| 0.5217764298301315|
|[298.0,101.0,7.86]|           0.54| 0.5470012915147269|
| [299.0,96.0,7.86]|           0.54| 0.5361834818341724|
|[299.0,100.0,7.88]|           0.68|  0.549918016428677|
|[299.0,100.0,8.02]|           0.63| 0.5706914319336378|
| [300.0,95.0,8.22]|           0.62| 0.5895499301173137|
|[300.0,100.0,8.66]|           

In [64]:

# Score the test-set predictions: R² between the 'prediction' column and the
# true 'Chance of Admit' label.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol='Chance of Admit',
    predictionCol='prediction',
    metricName='r2',
)
evaluator.evaluate(predictions)

0.8071496920615432

In [65]:
# Persist the fitted model to the path 'model'.
# A plain save() raises when the path already exists, which breaks any re-run
# of this cell; going through the writer with overwrite() replaces the
# previously saved copy instead.
model.write().overwrite().save('model')

In [71]:
# Load the saved model back from 'model' and score the test split with it to
# confirm the round trip reproduces the earlier predictions.
# The loaded model is bound to its own name rather than overwriting `model`:
# a model restored from disk has no training summary, so rebinding `model`
# would make the earlier model.summary cells fail if they were run again.
from pyspark.ml.regression import LinearRegressionModel
loaded_model = LinearRegressionModel.load('model')
loaded_model.transform(test).show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [294.0,93.0,7.36]|           0.46| 0.4407134339198002|
| [295.0,96.0,7.34]|           0.47|0.44846180106898537|
|[295.0,101.0,7.86]|           0.69| 0.5390788312760857|
| [296.0,99.0,8.03]|           0.61| 0.5615610611361874|
|[296.0,101.0,7.68]|            0.6| 0.5150109742778257|
| [297.0,100.0,7.9]|           0.52| 0.5476040070560058|
| [298.0,97.0,7.21]|           0.45|0.43978638714789753|
|  [298.0,98.0,7.5]|           0.44|0.48550875950305183|
|[298.0,101.0,7.69]|           0.53| 0.5217764298301315|
|[298.0,101.0,7.86]|           0.54| 0.5470012915147269|
| [299.0,96.0,7.86]|           0.54| 0.5361834818341724|
|[299.0,100.0,7.88]|           0.68|  0.549918016428677|
|[299.0,100.0,8.02]|           0.63| 0.5706914319336378|
| [300.0,95.0,8.22]|           0.62| 0.5895499301173137|
|[300.0,100.0,8.66]|           