In [301]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [302]:
spark=SparkSession.builder.appName("Linear Reggression").getOrCreate()

In [303]:
spark_df=spark.read.csv("/home/hdoop/Downloads/Salary_Data.csv",header=True,inferSchema=True)

In [304]:
spark_df.describe().show()

+-------+------------------+------------------+
|summary|   YearsExperience|            Salary|
+-------+------------------+------------------+
|  count|                30|                30|
|   mean|5.3133333333333335|           76003.0|
| stddev| 2.837888157662718|27414.429784582302|
|    min|               1.1|           37731.0|
|    max|              10.5|          122391.0|
+-------+------------------+------------------+



In [305]:
spark_df.summary()

DataFrame[summary: string, YearsExperience: string, Salary: string]

In [306]:
spark_df.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Salary: double (nullable = true)



In [307]:
spark_df.show()

+---------------+-------+
|YearsExperience| Salary|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



Creating the model

In [308]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler


Creating a vector form

In [309]:
assembler = VectorAssembler(inputCols=["YearsExperience"], outputCol="features")

# Assemble the features
spark_df = assembler.transform(spark_df)


In [310]:
spark_df.describe().show()

+-------+------------------+------------------+
|summary|   YearsExperience|            Salary|
+-------+------------------+------------------+
|  count|                30|                30|
|   mean|5.3133333333333335|           76003.0|
| stddev| 2.837888157662718|27414.429784582302|
|    min|               1.1|           37731.0|
|    max|              10.5|          122391.0|
+-------+------------------+------------------+



In [311]:
spark_df.show()

+---------------+-------+--------+
|YearsExperience| Salary|features|
+---------------+-------+--------+
|            1.1|39343.0|   [1.1]|
|            1.3|46205.0|   [1.3]|
|            1.5|37731.0|   [1.5]|
|            2.0|43525.0|   [2.0]|
|            2.2|39891.0|   [2.2]|
|            2.9|56642.0|   [2.9]|
|            3.0|60150.0|   [3.0]|
|            3.2|54445.0|   [3.2]|
|            3.2|64445.0|   [3.2]|
|            3.7|57189.0|   [3.7]|
|            3.9|63218.0|   [3.9]|
|            4.0|55794.0|   [4.0]|
|            4.0|56957.0|   [4.0]|
|            4.1|57081.0|   [4.1]|
|            4.5|61111.0|   [4.5]|
|            4.9|67938.0|   [4.9]|
|            5.1|66029.0|   [5.1]|
|            5.3|83088.0|   [5.3]|
|            5.9|81363.0|   [5.9]|
|            6.0|93940.0|   [6.0]|
+---------------+-------+--------+
only showing top 20 rows



In [312]:
# Split the data into train and test sets
train_df, test_df = spark_df.randomSplit([0.8, 0.2])

In [313]:
test_df.show()

+---------------+-------+--------+
|YearsExperience| Salary|features|
+---------------+-------+--------+
|            1.1|39343.0|   [1.1]|
|            2.2|39891.0|   [2.2]|
|            3.7|57189.0|   [3.7]|
|            3.9|63218.0|   [3.9]|
|            4.0|56957.0|   [4.0]|
|            5.3|83088.0|   [5.3]|
|            5.9|81363.0|   [5.9]|
+---------------+-------+--------+



In [314]:
type(train_df)

pyspark.sql.dataframe.DataFrame

In [315]:
train_df.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Salary: double (nullable = true)
 |-- features: vector (nullable = true)



In [316]:
print(test_df.count())
l=len(test_df.columns)
print(l)

7
3


In [317]:
print(train_df.count())
l=len(train_df.columns)
print(l)

23
3


Creating Linear reggression Model

In [318]:
# Create the Linear Regression model
regr = LinearRegression(featuresCol='features', labelCol='Salary', regParam=0.0, solver="normal")

# Fit the model to the training data
regr_model = regr.fit(train_df)


23/07/24 16:53:59 WARN Instrumentation: [1a1d9360] regParam is zero, which might cause numerical instability and overfitting.


Predecting Salary By years of experince

In [319]:
# Make predictions on the test data
predictions = regr_model.transform(test_df)

# Show the predicted values
predictions.select("YearsExperience", "prediction").show()

+---------------+------------------+
|YearsExperience|        prediction|
+---------------+------------------+
|            1.1|37033.710931304675|
|            2.2| 47292.89809276757|
|            3.7|61282.698767489695|
|            3.9|63148.005524119304|
|            4.0| 64080.65890243412|
|            5.3| 76205.15282052662|
|            5.9| 81801.07309041548|
+---------------+------------------+



Calculating The R2

In [320]:
from pyspark.ml.evaluation import RegressionEvaluator

In [321]:
evaluator_r2 = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared value:", r2)

R-squared value: 0.9051905534419166
