In [1]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=5185cdee47520345569434fd1590b05554b11521186682ec89de4d42b5811339
  Stored in directory: /root/.cache/pip/wheels/b1/59/a0/a1a0624b5e865fd389919c1a10f53aec9b12195d6747710baf
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
from pyspark.sql import SparkSession

spark= SparkSession.builder.appName("ml_project").getOrCreate()

In [3]:
spark

In [6]:
df = spark.read.csv('Data.csv', header=True, inferSchema=True)

In [7]:
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [8]:
shape = (df.count(), len(df.columns))

print(shape)

(500, 9)


In [9]:
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [10]:
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

In [11]:
df= df.drop('Serial No')

In [12]:
df.show(2)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
+---------+-----------+-----------------+---+---+----+--------+---------------+
only showing top 2 rows



In [13]:
print(df[df['GRE Score'].isNull()].count())

0


In [14]:
for col in df.columns:
  print(col + ':' , df[df[col].isNull()].count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


Correlation Analysis & Feature Selection

In [15]:
print(df.stat.corr('GRE Score', 'Chance of Admit'))

0.8103506354632601


In [16]:
for col in df.columns:
  print(f"{col} is {round(df.stat.corr(col, 'Chance of Admit'),3)} correlated with the target variable Chance of Admit")

GRE Score is 0.81 correlated with the target variable Chance of Admit
TOEFL Score is 0.792 correlated with the target variable Chance of Admit
University Rating is 0.69 correlated with the target variable Chance of Admit
SOP is 0.684 correlated with the target variable Chance of Admit
LOR is 0.645 correlated with the target variable Chance of Admit
CGPA is 0.882 correlated with the target variable Chance of Admit
Research is 0.546 correlated with the target variable Chance of Admit
Chance of Admit is 1.0 correlated with the target variable Chance of Admit


In [17]:
from pyspark.ml.feature import VectorAssembler 

assembler = VectorAssembler(inputCols=['GRE Score','TOEFL Score', 'CGPA'],outputCol='features' )

In [18]:
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

Linear Regression Model

In [19]:
from pyspark.ml.regression import LinearRegression

final_data = output_data.select('features', 'Chance of Admit')

In [20]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [21]:
train, test = final_data.randomSplit([0.7,0.3])

In [22]:
models = LinearRegression(featuresCol= 'features', labelCol='Chance of Admit')

model= models.fit(train)

In [23]:
print('Coefficients:', model.coefficients)

print('Intercept:', model.intercept)

Coefficients: [0.0019124539578525624,0.004078689893835469,0.1374351890491344]
Intercept: -1.4981924860871123


In [24]:
summary = model.summary

In [25]:
print('RMSE :', summary.rootMeanSquaredError)

print('r2 :', summary.r2)

RMSE : 0.06205375180391571
r2 : 0.8019217212142227


Evaluate & Save the Model

In [26]:
predictions= model.transform(test)

In [27]:
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [294.0,95.0,7.64]|           0.49| 0.5015493617712974|
| [295.0,99.0,7.57]|           0.37| 0.5101561120710527|
|  [296.0,97.0,7.8]|           0.49|  0.535521279722535|
| [296.0,99.0,7.28]|           0.47|0.47221236120465604|
| [297.0,96.0,7.89]|           0.43| 0.5457242108009743|
| [297.0,100.0,7.9]|           0.52| 0.5634133222668076|
|[297.0,101.0,7.67]|           0.57| 0.5358819186793422|
| [298.0,92.0,7.88]|           0.51| 0.5299475532929936|
| [298.0,97.0,7.21]|           0.45|0.45825942609925074|
| [298.0,99.0,7.46]|           0.53| 0.5007756031492054|
|  [298.0,99.0,7.6]|           0.46| 0.5200165296160844|
|[298.0,100.0,7.95]|           0.58| 0.5721975356771165|
|[298.0,101.0,7.86]|           0.54| 0.5639070585565302|
|[298.0,105.0,8.54]|           0.69| 0.6736777466852835|
| [299.0,97.0,7.66]|           

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit', metricName='r2')

print('r2 score:', evaluator.evaluate(predictions))

r2 score: 0.8065699499650063
