In [1]:
# Installing pyspark

!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e9f1a547dd14e54e90181db44a203249cfa7e2633f366f81f523aa8eb3dcc4ae
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Import necessary libraries

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType

In [3]:
# Initialise spark session

spark = SparkSession.builder.getOrCreate()

In [4]:
# Load the dataset

data = spark.read.csv('/content/weatherAUS.csv', header=True)

In [5]:
# Display data

data.show()

+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|      Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|
+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|2008-12-01|  Albury|   13.4|   22.9|     0.6|         NA|      NA|          W|           44|         W|       WNW|          20|          24|         71|         22|     1007.7|     1007.1|       8|      NA|   16.9|   21.8|       No|  

In [6]:
# Display name and data type of each column

data.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- MinTemp: string (nullable = true)
 |-- MaxTemp: string (nullable = true)
 |-- Rainfall: string (nullable = true)
 |-- Evaporation: string (nullable = true)
 |-- Sunshine: string (nullable = true)
 |-- WindGustDir: string (nullable = true)
 |-- WindGustSpeed: string (nullable = true)
 |-- WindDir9am: string (nullable = true)
 |-- WindDir3pm: string (nullable = true)
 |-- WindSpeed9am: string (nullable = true)
 |-- WindSpeed3pm: string (nullable = true)
 |-- Humidity9am: string (nullable = true)
 |-- Humidity3pm: string (nullable = true)
 |-- Pressure9am: string (nullable = true)
 |-- Pressure3pm: string (nullable = true)
 |-- Cloud9am: string (nullable = true)
 |-- Cloud3pm: string (nullable = true)
 |-- Temp9am: string (nullable = true)
 |-- Temp3pm: string (nullable = true)
 |-- RainToday: string (nullable = true)
 |-- RISK_MM: string (nullable = true)
 |-- RainTomorrow: string (nullable = true)



In [7]:
# Change the data type of columns since pyspark is considering all columns as strings

schema = StructType([
    StructField("Date", DateType()),
    StructField("Location", StringType(), True),
    StructField("MinTemp", DoubleType(), True),
    StructField("MaxTemp", DoubleType(), True),
    StructField("Rainfall", DoubleType(), True),
    StructField("Evaporation", DoubleType(), True),
    StructField("Sunshine", DoubleType(), True),
    StructField("WindGustDir", StringType(), True),
    StructField("WindGustSpeed", DoubleType(), True),
    StructField("WindDir9am", StringType(), True),
    StructField("WindDir3pm", StringType(), True),
    StructField("WindSpeed9am", DoubleType(), True),
    StructField("WindSpeed3pm", DoubleType(), True),
    StructField("Humidity9am", DoubleType(), True),
    StructField("Humidity3pm", DoubleType(), True),
    StructField("Pressure9am", DoubleType(), True),
    StructField("Pressure3pm", DoubleType(), True),
    StructField("Cloud9am", DoubleType(), True),
    StructField("Cloud3pm", DoubleType(), True),
    StructField("Temp9am", DoubleType(), True),
    StructField("Temp3pm", DoubleType(), True),
    StructField("RainToday", StringType(), True),
    StructField("RISK_MM", DoubleType(), True),
    StructField("RainTomorrow", StringType(), True)
])

In [8]:
# Read the data

data = spark.read.option("header", "true").schema(schema).csv("/content/weatherAUS.csv")

In [9]:
# Check the data type of each column

data.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- MinTemp: double (nullable = true)
 |-- MaxTemp: double (nullable = true)
 |-- Rainfall: double (nullable = true)
 |-- Evaporation: double (nullable = true)
 |-- Sunshine: double (nullable = true)
 |-- WindGustDir: string (nullable = true)
 |-- WindGustSpeed: double (nullable = true)
 |-- WindDir9am: string (nullable = true)
 |-- WindDir3pm: string (nullable = true)
 |-- WindSpeed9am: double (nullable = true)
 |-- WindSpeed3pm: double (nullable = true)
 |-- Humidity9am: double (nullable = true)
 |-- Humidity3pm: double (nullable = true)
 |-- Pressure9am: double (nullable = true)
 |-- Pressure3pm: double (nullable = true)
 |-- Cloud9am: double (nullable = true)
 |-- Cloud3pm: double (nullable = true)
 |-- Temp9am: double (nullable = true)
 |-- Temp3pm: double (nullable = true)
 |-- RainToday: string (nullable = true)
 |-- RISK_MM: double (nullable = true)
 |-- RainTomorrow: string (nullable = true)



In [10]:
# Display data

data.show()

+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|      Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|
+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|2008-12-01|  Albury|   13.4|   22.9|     0.6|       NULL|    NULL|          W|         44.0|         W|       WNW|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|     8.0|    NULL|   16.9|   21.8|       No|  

In [11]:
# Number of rows and columns in the dataset

print("Number of rows and columns in the dataset:\n")
print(data.count(),len(data.columns))

Number of rows and columns in the dataset:

142193 24


**1. Explore the dataset to find what are the categorical variables in the dataset. List
categorical variable names.**

In [12]:
# Print Categorical variable names

categorical_variables = [col_name for col_name, data_type in data.dtypes if data_type == "string"]
print("categorical_variables:\n\n",categorical_variables)

categorical_variables:

 ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


**2. Explore how many categorical variables have null values.**

In [13]:
# Number of null values in each categorical variables

categorical_variables_null_values = {col: data.filter(data[col] == "NA").count() for col in categorical_variables}
print("Null values in categorical variables:\n")
print(categorical_variables_null_values)

Null values in categorical variables:

{'Location': 0, 'WindGustDir': 9330, 'WindDir9am': 10013, 'WindDir3pm': 3778, 'RainToday': 1406, 'RainTomorrow': 0}


**3. Get the frequency count of each categorical variable. For instance, how many discrete
values in each categorical variable and how many datapoints from each distinct value.**

In [14]:
# Number of distinct values and Frequency count of each categorical variable

for col_name in categorical_variables:
    print(f"Number of distinct values in {col_name}: {data.select(col_name).distinct().count()}")
    print(f"Frequency count for {col_name}:")
    data.groupBy(col_name).count().show()

Number of distinct values in Location: 49
Frequency count for Location:
+----------------+-----+
|        Location|count|
+----------------+-----+
|          Cairns| 2988|
|   NorfolkIsland| 2964|
|         Bendigo| 3034|
|        Canberra| 3418|
|           Cobar| 2988|
|   SydneyAirport| 3005|
|      Wollongong| 2983|
|     Williamtown| 2553|
|           Moree| 2854|
|         Mildura| 3007|
|        Portland| 2996|
|        Brisbane| 3161|
|          Sydney| 3337|
|            Sale| 3000|
|   BadgerysCreek| 2928|
|     Tuggeranong| 2998|
|        Ballarat| 3028|
|       GoldCoast| 2980|
|MelbourneAirport| 3009|
|        Dartmoor| 2943|
+----------------+-----+
only showing top 20 rows

Number of distinct values in WindGustDir: 17
Frequency count for WindGustDir:
+-----------+-----+
|WindGustDir|count|
+-----------+-----+
|        SSE| 8993|
|         SW| 8797|
|         NW| 8003|
|         NA| 9330|
|          E| 9071|
|        WSW| 8901|
|        ENE| 7992|
|         NE| 7060|
|   

**4. Print the first five rows of the dataset**

In [15]:
# First five rows of the dataset

print("First five rows of the dataset:\n")
data.show(5)

First five rows of the dataset:

+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|      Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|
+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|2008-12-01|  Albury|   13.4|   22.9|     0.6|       NULL|    NULL|          W|         44.0|         W|       WNW|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|     8.0|    

**5. What are the available columns of the dataset?**

In [16]:
# Columns in dataset

print("Columns of the dataset:\n\n", data.columns)

Columns of the dataset:

 ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow']


**6. Drop RISK_MM variable column**

In [17]:
# Drop RISK_MM column
data=data.drop("RISK_MM")

# Print columns in dataset
print("Columns of the dataset after droping RISK_MM column:\n")
print(data.columns)

Columns of the dataset after droping RISK_MM column:

['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


**7. Get the summary of the dataset.**

In [18]:
# Display dataset summary

print("Summary:\n")
data.summary().show()

Summary:

+-------+--------+------------------+-----------------+-----------------+------------------+------------------+-----------+------------------+----------+----------+-----------------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+-----------------+---------+------------+
|summary|Location|           MinTemp|          MaxTemp|         Rainfall|       Evaporation|          Sunshine|WindGustDir|     WindGustSpeed|WindDir9am|WindDir3pm|     WindSpeed9am|     WindSpeed3pm|       Humidity9am|       Humidity3pm|       Pressure9am|      Pressure3pm|          Cloud9am|         Cloud3pm|          Temp9am|          Temp3pm|RainToday|RainTomorrow|
+-------+--------+------------------+-----------------+-----------------+------------------+------------------+-----------+------------------+----------+----------+-----------------+-----------------+------------------+------------------+--------

**8. List first five rows only for categorical variables.**

In [19]:
# First five rows of categorical variables

print("First five rows for categorical variables:\n")
data.select(categorical_variables).show(5)

First five rows for categorical variables:

+--------+-----------+----------+----------+---------+------------+
|Location|WindGustDir|WindDir9am|WindDir3pm|RainToday|RainTomorrow|
+--------+-----------+----------+----------+---------+------------+
|  Albury|          W|         W|       WNW|       No|          No|
|  Albury|        WNW|       NNW|       WSW|       No|          No|
|  Albury|        WSW|         W|       WSW|       No|          No|
|  Albury|         NE|        SE|         E|       No|          No|
|  Albury|          W|       ENE|        NW|       No|          No|
+--------+-----------+----------+----------+---------+------------+
only showing top 5 rows



**9. Decompose the date field into year, month, and day fields. Then drop the original date
field.**

In [20]:
from pyspark.sql.functions import year, month, dayofmonth

# Decompose the Date field into Year, Month, and Day fields
data = data.withColumn('Year', year('Date')).withColumn('Month', month('Date')).withColumn('Day', dayofmonth('Date'))

# Drop the original Date field
data = data.drop('Date')
data.show(5)

+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+----+-----+---+
|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RainTomorrow|Year|Month|Day|
+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+----+-----+---+
|  Albury|   13.4|   22.9|     0.6|       NULL|    NULL|          W|         44.0|         W|       WNW|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|     8.0|    NULL|   16.9|   21.8|       No|          No|2008|   12| 

**10. How many unique locations are there in the dataset?**

In [21]:
# Number of unique locations in the dataset

print("Number of Unique Locations:\n")
data.select(col('Location')).distinct().count()

Number of Unique Locations:



49

**11. Print the number of times each unique location appears in the dataset.**

In [22]:
# Number of times each unique location appears in the dataset

print("Count of each Unique Location:\n")
data.groupBy('Location').count().show()

Count of each Unique Location:

+----------------+-----+
|        Location|count|
+----------------+-----+
|          Cairns| 2988|
|   NorfolkIsland| 2964|
|         Bendigo| 3034|
|        Canberra| 3418|
|           Cobar| 2988|
|   SydneyAirport| 3005|
|      Wollongong| 2983|
|     Williamtown| 2553|
|           Moree| 2854|
|         Mildura| 3007|
|        Portland| 2996|
|        Brisbane| 3161|
|          Sydney| 3337|
|            Sale| 3000|
|   BadgerysCreek| 2928|
|     Tuggeranong| 2998|
|        Ballarat| 3028|
|       GoldCoast| 2980|
|MelbourneAirport| 3009|
|        Dartmoor| 2943|
+----------------+-----+
only showing top 20 rows



**12. Perform One Hot Encoding for each categorical variable.**

In [23]:
# Index Categorical variables
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") for col in categorical_variables]
indexer_pipeline = Pipeline(stages=indexers)
indexed_df = indexer_pipeline.fit(data).transform(data)

In [24]:
# One Hot Encoding for indexed categorical variables
encoder = OneHotEncoder(inputCols=[col+"_index" for col in categorical_variables],
                        outputCols=[col+"_encoded" for col in categorical_variables])
encoded_data = encoder.fit(indexed_df).transform(indexed_df)

In [25]:
# Display encoded data

print("Encoded data\n")
encoded_data.show(5)

Encoded data

+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+----+-----+---+--------------+-----------------+----------------+----------------+---------------+------------------+----------------+-------------------+------------------+------------------+-----------------+--------------------+
|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RainTomorrow|Year|Month|Day|Location_index|WindGustDir_index|WindDir9am_index|WindDir3pm_index|RainToday_index|RainTomorrow_index|Location_encoded|WindGustDir_encoded|WindDir9am_encoded|WindDir3pm_encoded|RainToday_encoded|RainTomorrow_encoded|
+--------+-------+-------+--------+-----------+-----

**13. The RainTomorrow is the label, and all other fields are features.**

In [26]:
# RainTomorrow is label and all other fields are features

label_col = "RainTomorrow"
feature_cols = [col for col in encoded_data.columns if col != label_col]

**14. You may use the median value (median imputation) of each field to fill null values if it is a
numerical column. Use the most frequent value in case of a categorical column**

In [27]:
# Import necessary libraries

from pyspark.ml.feature import MinMaxScaler, Imputer

In [28]:
# Fill null values with most frequent for categorical columns

for col in categorical_variables:
    mode = encoded_data.groupBy(col).count().orderBy("count", ascending=False).first()[0]
    encoded_data = encoded_data.fillna({col: mode})

In [29]:
# Impute null values for numerical columns with median

numerical_cols = [col for col, dtype in data.dtypes if dtype != 'string']

imputer = Imputer(strategy='median', inputCols=numerical_cols, outputCols=numerical_cols)
imputed_data = imputer.fit(encoded_data).transform(encoded_data)

In [30]:
# Display data after replacing null values with most frequent for categorical columns and with median for numerical columns

print("Data after replacing null values with most frequent for categorical columns and with median for numerical columns\n")
imputed_data.show(5)

Data after replacing null values with most frequent for categorical columns and with median for numerical columns

+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+----+-----+---+--------------+-----------------+----------------+----------------+---------------+------------------+----------------+-------------------+------------------+------------------+-----------------+--------------------+
|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RainTomorrow|Year|Month|Day|Location_index|WindGustDir_index|WindDir9am_index|WindDir3pm_index|RainToday_index|RainTomorrow_index|Location_encoded|WindGustDir_encoded|WindDir9am_encoded|WindDir3pm

**15. Normalize each numerical column (bring it to a value between 0 and 1)**

In [31]:
# Normalize numerical columns

for col in numerical_cols:
    if col != label_col:  # Exclude the label column
        minimum = imputed_data.agg({col: "min"}).collect()[0][0]
        maximum = imputed_data.agg({col: "max"}).collect()[0][0]
        imputed_data = imputed_data.withColumn(col, (imputed_data[col] - minimum) / (maximum - minimum))

In [32]:
# Display data after normalising each numerical column

print("Data after normalising each numerical column\n")
imputed_data.show(5)

Data after normalising each numerical column

+--------+-------------------+------------------+--------------------+-------------------+------------------+-----------+-------------------+----------+----------+-------------------+-------------------+-----------+-----------+-------------------+-------------------+------------------+------------------+------------------+------------------+---------+------------+----+-----+-------------------+--------------+-----------------+----------------+----------------+---------------+------------------+----------------+-------------------+------------------+------------------+-----------------+--------------------+
|Location|            MinTemp|           MaxTemp|            Rainfall|        Evaporation|          Sunshine|WindGustDir|      WindGustSpeed|WindDir9am|WindDir3pm|       WindSpeed9am|       WindSpeed3pm|Humidity9am|Humidity3pm|        Pressure9am|        Pressure3pm|          Cloud9am|          Cloud3pm|           Temp9am|           Temp3

**16. Train your Logistic Regression model on the training dataset (70% 30% split)**

In [36]:
# Encode label column

label_indexer = StringIndexer(inputCol=label_col, outputCol="label")
indexed_df = label_indexer.fit(imputed_data).transform(imputed_data)

In [37]:
# Assemble feature columns

feature_cols = [col for col in indexed_df.columns if col not in categorical_variables + [label_col]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_data = assembler.transform(indexed_df)

In [38]:
assembled_data.show(5)

+--------+-------------------+------------------+--------------------+-------------------+------------------+-----------+-------------------+----------+----------+-------------------+-------------------+-----------+-----------+-------------------+-------------------+------------------+------------------+------------------+------------------+---------+------------+----+-----+-------------------+--------------+-----------------+----------------+----------------+---------------+------------------+----------------+-------------------+------------------+------------------+-----------------+--------------------+-----+--------------------+
|Location|            MinTemp|           MaxTemp|            Rainfall|        Evaporation|          Sunshine|WindGustDir|      WindGustSpeed|WindDir9am|WindDir3pm|       WindSpeed9am|       WindSpeed3pm|Humidity9am|Humidity3pm|        Pressure9am|        Pressure3pm|          Cloud9am|          Cloud3pm|           Temp9am|           Temp3pm|RainToday|RainTo

In [39]:
# Split the dataset into training and testing sets (70% 30% split)

train_data, test_data = assembled_data.randomSplit([0.7, 0.3], seed=42)

In [40]:
# Train Logistic Regression model

lr = LogisticRegression(featuresCol="features", labelCol="label")

In [41]:
lr_model = lr.fit(train_data)

**17. Predict the RainTomorrow for the test set.**

In [42]:
# Predict RainTomorrow for the test set
predictions = lr_model.transform(test_data)

# Show predicted values for RainTomorrow
predictions.select("RainTomorrow", "prediction").show()

+------------+----------+
|RainTomorrow|prediction|
+------------+----------+
|         Yes|       1.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
|          No|       0.0|
+------------+----------+
only showing top 20 rows



**18. Describe the performance of your model using the confusion matrix and print TP, TN,
FP, and FN. In addition, provide the accuracy and F1 score of your model.**

In [43]:
# Evaluate the model

evaluator = BinaryClassificationEvaluator(labelCol="label")
accuracy = evaluator.evaluate(predictions)

In [44]:
# Confusion Matrix

print("Confusion Matrix:\n")
predictions.groupBy("label", "prediction").count().show()

Confusion Matrix:

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9539|
|  0.0|       0.0|33066|
+-----+----------+-----+



In [45]:
# True Positives (TP)
tp = predictions.filter((predictions["label"] == 1) & (predictions["prediction"] == 1)).count()

# True Negatives (TN)
tn = predictions.filter((predictions["label"] == 0) & (predictions["prediction"] == 0)).count()

# False Positives (FP)
fp = predictions.filter((predictions["label"] == 0) & (predictions["prediction"] == 1)).count()

# False Negatives (FN)
fn = predictions.filter((predictions["label"] == 1) & (predictions["prediction"] == 0)).count()

In [46]:
# Precision, Recall and f1_score

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

In [47]:
# Print True Positiives, True Negatives, False Positives, False Negatives

print("True Positives:", tp)
print("\nTrue Negatives:", tn)
print("\nFalse Positives:", fp)
print("\nFalse Negatives:", fn)

True Positives: 9539

True Negatives: 33066

False Positives: 0

False Negatives: 0


In [48]:
# Print Accuracy and F1 Score

print("Accuracy:", accuracy)
print("\nF1 Score:", f1_score)

Accuracy: 0.9999997067370341

F1 Score: 1.0
