In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, col, max, min, isnan,corr

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Statistical Measures and Probability Analysis")
    .getOrCreate()
)

# Load the marks CSV (header row, inferred column types) and drop every row
# that contains a null in any column.
data_path = "marks.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True).dropna()

# Preview the first 5 rows of the cleaned DataFrame.
df.show(5)

# Columns to summarise, in the order their tables are printed.
# NOTE(review): the preview above shows the headers as "AVG1"/"AVG2"; the
# "Avg1"/"Avg2" spellings are kept so the printed aliases stay the same. They
# presumably resolve only because Spark matches column names
# case-insensitively by default - confirm spark.sql.caseSensitive is not
# enabled in this environment.
score_columns = ["Quiz1", "Quiz2", "Avg1", "Assig1", "Assig2", "Avg2"]

# One single-row table per column: mean, standard deviation, max and min.
# `max` and `min` are the pyspark.sql.functions aggregates imported at the top
# of this cell (they shadow the Python builtins of the same name).
for score_column in score_columns:
    df.select(
        mean(col(score_column)).alias(f"Mean {score_column}"),
        stddev(col(score_column)).alias(f"Standard Deviation {score_column}"),
        max(col(score_column)).alias(f"Max {score_column}"),
        min(col(score_column)).alias(f"Min {score_column}"),
    ).show()

# Correlation between the two quiz scores.
df.select(corr("Quiz1", "Quiz2").alias("Correlation Quiz1 Quiz2")).show()

# Covariance between the two quiz scores.
covariance = df.stat.cov("Quiz1", "Quiz2")
print(f"Covariance Quiz1 Quiz2: {covariance}")


+---+---------------+-----+-----+----+------+------+----+---------+
|Sno|           Name|Quiz1|Quiz2|AVG1|Assig1|Assig2|AVG2| Total 75|
+---+---------------+-----+-----+----+------+------+----+---------+
|  1|  Ashir Mehfooz| 14.0| 14.0|14.0|  13.0|  13.0|13.0|    68.00|
|  2|    Atif Raftad|  4.0| 10.0| 7.0|   4.0|   5.0| 4.5|    41.50|
|  3|     Saiqa Aziz| 15.0| 11.0|13.0|  14.0|  13.0|13.5|    60.50|
|  8|   Ozair Minhas|  6.0|  5.0| 5.5|   4.0|   6.0| 5.0|    22.50|
|  9|Naveera Subhani|  5.0| 11.0| 8.0|   4.0|   5.0| 4.5|    46.50|
+---+---------------+-----+-----+----+------+------+----+---------+
only showing top 5 rows

+-----------------+------------------------+---------+---------+
|       Mean Quiz1|Standard Deviation Quiz1|Max Quiz1|Min Quiz1|
+-----------------+------------------------+---------+---------+
|8.208955223880597|       4.575600441798329|     15.0|      0.0|
+-----------------+------------------------+---------+---------+

+-----------------+------------------

In [None]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f0a6568daf10ee7fc45bae9c58cfe3109b1ee4466716aaef309e9cfe09918728
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [34]:
# Estimate P(Quiz1 >= PASS_MARK | Quiz2 >= PASS_MARK) from the rows of `df`
# using Bayes' theorem:
#     P(Quiz1|Quiz2) = (P(Quiz2|Quiz1) * P(Quiz1)) / P(Quiz2)
# where each event means "the score in that quiz is at least PASS_MARK".
PASS_MARK = 10

passed_quiz1 = df['Quiz1'] >= PASS_MARK
passed_quiz2 = df['Quiz2'] >= PASS_MARK

# Every count() triggers a Spark job, so run each distinct count exactly once
# and reuse the resulting integers below.
n_total = df.count()
n_quiz1 = df.filter(passed_quiz1).count()
n_quiz2 = df.filter(passed_quiz2).count()
n_both = df.filter(passed_quiz1 & passed_quiz2).count()

if n_total == 0 or n_quiz1 == 0 or n_quiz2 == 0:
    # One of the divisions below would divide by zero (no rows at all, or no
    # row reaching PASS_MARK in one of the quizzes); report that instead of
    # raising ZeroDivisionError.
    p_quiz1 = p_quiz2 = p_quiz2_given_quiz1 = p_quiz1_given_quiz2 = None
    print(
        "P(Quiz1|Quiz2): cannot be computed via Bayes' theorem "
        f"(rows={n_total}, Quiz1>={PASS_MARK}: {n_quiz1}, Quiz2>={PASS_MARK}: {n_quiz2})"
    )
else:
    # The necessary probabilities: P(Quiz1), P(Quiz2|Quiz1) and P(Quiz2).
    p_quiz1 = n_quiz1 / n_total
    p_quiz2_given_quiz1 = n_both / n_quiz1
    p_quiz2 = n_quiz2 / n_total

    # Apply Bayes' Theorem: P(Quiz1|Quiz2) = (P(Quiz2|Quiz1) * P(Quiz1)) / P(Quiz2)
    p_quiz1_given_quiz2 = (p_quiz2_given_quiz1 * p_quiz1) / p_quiz2
    print(f"P(Quiz1|Quiz2): {p_quiz1_given_quiz2}")

# Stop Spark session. Cells that use `spark` or `df` need the session-creation
# cell to be run again after this point.
spark.stop()

P(Quiz1|Quiz2): 0.6022727272727273
