# PySpark Linear Regression and Random Forest Regression


In [5]:
from pyspark.sql import SparkSession     # SparkSession is created for a SQL data frame. Spark Context is created for a RDD's.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
# Dataframe - ml package
# RDD - mllib package.

spark = SparkSession.builder.appName("PySpark ML Algorithms").getOrCreate() # read or write!

dataframe = spark.read.csv("C:/Users/mitta/Downloads/admission_pred_pyspark/Admission_Predict.csv",header=True)


In [7]:
type(dataframe)

pyspark.sql.dataframe.DataFrame

In [8]:
dataframe.show()

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|         6|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|         7|      321|        109|                3|  3|   4| 8.2|       1|            0.75|
|         8|      308|        101|                2|  3|   4| 7.9|    

In [9]:
dataframe.printSchema()

dataframe.columns


root
 |-- Serial No.: string (nullable = true)
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR : string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit : string (nullable = true)



['Serial No.',
 'GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ']

In [11]:
dataframe=dataframe.drop('Serial No.')

In [12]:
from pyspark.sql.functions import col

for c in dataframe.columns:
    print((col(c))) 

dataframe.select(*(col(c) for c in dataframe.columns)).show()

new_dataframe=dataframe.select(*(col(c).cast('float') for c in dataframe.columns))

new_dataframe.printSchema()


Column<b'GRE Score'>
Column<b'TOEFL Score'>
Column<b'University Rating'>
Column<b'SOP'>
Column<b'LOR '>
Column<b'CGPA'>
Column<b'Research'>
Column<b'Chance of Admit '>
+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|      321|        109|                3|  3|   4| 8.2|       1|            0.75|
|      308| 

In [13]:
from pyspark.sql.functions import col, count, isnan, when

new_dataframe.select([count(when(col(c).isNull(), c)).alias(c) for c in new_dataframe.columns]).show()


+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [14]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=["GRE Score","TOEFL Score","University Rating"],
                 outputCols=['GRE Scores',"TOEFL Scores","Universities Ranking"])

model = imputer.fit(new_dataframe)
imputed_data = model.transform(new_dataframe)

imputed_data.show()


+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |GRE Scores|TOEFL Scores|Universities Ranking|
+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|     337.0|       118.0|                 4.0|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|     324.0|       107.0|                 4.0|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|     316.0|       104.0|                 3.0|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|     322.0|       110.0|                 3.0|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|     314.0|       103.0|      

In [15]:
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_dataframe.columns]).show()


+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [16]:

features = imputed_data.drop("Chance of Admit")

features.show()

features.columns


+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |GRE Scores|TOEFL Scores|Universities Ranking|
+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|     337.0|       118.0|                 4.0|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|     324.0|       107.0|                 4.0|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|     316.0|       104.0|                 3.0|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|     322.0|       110.0|                 3.0|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|     314.0|       103.0|      

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ',
 'GRE Scores',
 'TOEFL Scores',
 'Universities Ranking']

In [17]:
assembler = VectorAssembler(inputCols=features.columns,outputCol="features") #all the column features into a single feature column.

output = assembler.transform(imputed_data)

output.show()

output.columns


+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |GRE Scores|TOEFL Scores|Universities Ranking|            features|
+---------+-----------+-----------------+---+----+----+--------+----------------+----------+------------+--------------------+--------------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|     337.0|       118.0|                 4.0|[337.0,118.0,4.0,...|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|     324.0|       107.0|                 4.0|[324.0,107.0,4.0,...|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|     316.0|       104.0|                 3.0|[316.0,104.0,3.0,...|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|     322.0|       110.0|       

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ',
 'GRE Scores',
 'TOEFL Scores',
 'Universities Ranking',
 'features']

In [18]:

data = output.select("features","Chance of Admit ")

data.show()

train_df,test_df = data.randomSplit([0.7,0.3])

train_df.show()
test_df.show()



+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[337.0,118.0,4.0,...|            0.92|
|[324.0,107.0,4.0,...|            0.76|
|[316.0,104.0,3.0,...|            0.72|
|[322.0,110.0,3.0,...|             0.8|
|[314.0,103.0,2.0,...|            0.65|
|[330.0,115.0,5.0,...|             0.9|
|[321.0,109.0,3.0,...|            0.75|
|[308.0,101.0,2.0,...|            0.68|
|[302.0,102.0,1.0,...|             0.5|
|[323.0,108.0,3.0,...|            0.45|
|[325.0,106.0,3.0,...|            0.52|
|[327.0,111.0,4.0,...|            0.84|
|[328.0,112.0,4.0,...|            0.78|
|[307.0,109.0,3.0,...|            0.62|
|[311.0,104.0,3.0,...|            0.61|
|[314.0,105.0,3.0,...|            0.54|
|[317.0,107.0,3.0,...|            0.66|
|[319.0,106.0,3.0,...|            0.65|
|[318.0,110.0,3.0,...|            0.63|
|[303.0,102.0,3.0,...|            0.62|
+--------------------+----------------+
only showing top 20 rows

+-------------

#  Linear Regression 

In [20]:

lin_reg = LinearRegression(featuresCol='features',labelCol='Chance of Admit ')
linear_model = lin_reg.fit(train_df)

print("Coefficients :",linear_model.coefficients)
print('Intercept',linear_model.intercept)

trainSummary = linear_model.summary
print('RMSE',trainSummary.rootMeanSquaredError)
print('r2Score',trainSummary.r2)

predictions = linear_model.transform(test_df)


predictions.show()

predictions.select("prediction","Chance of Admit ","features").show()

from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="Chance of Admit ",metricName="r2")
print("R2 on test data",pred_evaluator.evaluate(predictions))



Coefficients : [1.8586300354498898e-10,-2.7760218481119284e-10,-1.5207964295661626e-09,4.600042259693567e-10,-5.603155918095798e-09,-5.6557285988872715e-09,-9.335228461335585e-09,1.0000000991234896,1.8586300354498898e-10,-2.7760218481119284e-10,-1.5207964295661626e-09]
Intercept -4.887341327902246e-08
RMSE 8.30864767265681e-09
r2Score 0.9999999999999968
+--------------------+----------------+-------------------+
|            features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[296.0,99.0,2.0,2...|            0.61|  0.610000016606133|
|[297.0,96.0,2.0,2...|            0.34| 0.3399999873463074|
|[297.0,98.0,2.0,2...|            0.59| 0.5899999742539135|
|[298.0,99.0,2.0,4...|            0.46|0.46000000244414313|
|[298.0,105.0,3.0,...|            0.69| 0.6899999913881903|
|[299.0,100.0,2.0,...|            0.68|  0.680000012427396|
|[299.0,106.0,2.0,...|            0.64| 0.6399999783910063|
|[300.0,100.0,3.0,...|            0.64| 0.63

# RandomForest Regressor


In [21]:
random_forest_reg = RandomForestRegressor(featuresCol="features",labelCol="Chance of Admit ")

model = random_forest_reg.fit(train_df)

predictions = model.transform(test_df)

predictions.show()

evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="Chance of Admit ",metricName="rmse")
print("RMSE",evaluator.evaluate(predictions))

evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="Chance of Admit ",metricName="r2")
print("R2",evaluator.evaluate(predictions))

+--------------------+----------------+-------------------+
|            features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[296.0,99.0,2.0,2...|            0.61| 0.6059462719280259|
|[297.0,96.0,2.0,2...|            0.34|0.43843584887998305|
|[297.0,98.0,2.0,2...|            0.59| 0.5879400680066689|
|[298.0,99.0,2.0,4...|            0.46| 0.4402631940445027|
|[298.0,105.0,3.0,...|            0.69| 0.6605824951600237|
|[299.0,100.0,2.0,...|            0.68| 0.6467004207642877|
|[299.0,106.0,2.0,...|            0.64| 0.6433625628044864|
|[300.0,100.0,3.0,...|            0.64| 0.6557267128232172|
|[300.0,102.0,2.0,...|            0.56| 0.5652009157227015|
|[301.0,97.0,2.0,3...|            0.44|  0.456473733538988|
|[301.0,100.0,3.0,...|            0.67| 0.6530841765530984|
|[301.0,104.0,3.0,...|            0.68| 0.6753116211388167|
|[302.0,101.0,2.0,...|            0.46| 0.4732570785184212|
|[303.0,99.0,3.0,2...|            0.36|0