In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, when

# Start (or reuse) the Spark session used by the diabetes analysis cells
spark = (
    SparkSession.builder
    .appName("DiabetesAnalysis")
    .getOrCreate()
)

# Read the CSV with its first line as the header and let Spark infer column types
file_path = "diabetes.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Print the inferred column names and types
df.printSchema()

# Replace BMI == 0 with the mean BMI and store the result in df_nonzero_BMI.
# The zero rows are the ones being overwritten, so they are excluded from the
# mean; including them would pull the imputed value down.
bmi_is_zero = df['BMI'] == 0
mean_bmi = df.filter(~bmi_is_zero).select(mean('BMI')).collect()[0][0]
df_nonzero_BMI = df.withColumn('BMI', when(bmi_is_zero, mean_bmi).otherwise(df['BMI']))

# df_outcome: rows with Age >= 35, selected from the data as originally loaded
df_outcome = df.where(df.Age >= 35)

# df_diabetes_pedigree: rows whose Diabetes Pedigree Function is at least 0.51,
# also selected from the originally loaded data
df_diabetes_pedigree = df.where(df.DiabetesPedigreeFunction >= 0.51)

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [7]:

# Print each caption, then the first rows of the matching DataFrame
for caption, frame in (
    ("DataFrame with updated BMI=0 rows with mean BMI:", df_nonzero_BMI),
    ("DataFrame with rows where Age >= 35:", df_outcome),
):
    print(caption)
    frame.show()




DataFrame with updated BMI=0 rows with mean BMI:
+-----------+-------+-------------+-------------+-------+------------------+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|               BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+------------------+------------------------+---+-------+
|          6|    148|           72|           35|      0|              33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|              26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|              23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|              28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|              43.1|                   2.288| 33|      1|
|          5|    116|           74|    

In [8]:
# Print the caption followed by the first rows of the pedigree-filtered DataFrame
pedigree_caption = "DataFrame with rows where Diabetes Pedigree Function >= 0.51:"
print(pedigree_caption)
df_diabetes_pedigree.show()

# The diabetes analysis is finished, so shut the Spark session down
spark.stop()

DataFrame with rows where Diabetes Pedigree Function >= 0.51:
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|         10|    168|           74|            0|      0|38.0|                   0.537| 34|      1|
|         10|    139|           80|            0|      0|27.1|                   1.441| 57|      0|
|          5|    166|           72|           19|    175|25.8|                   0.587| 51|      1|
|          0|    118|           84|   

In [15]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import multiprocessing

# Function to check if a number is prime
def is_prime(num):
    """Return True if ``num`` is a prime number, otherwise False.

    Values of 1 or less are never prime; 2 and 3 are handled directly.
    Multiples of 2 and 3 are rejected up front, so the loop only needs to
    try divisors of the form 6k - 1 and 6k + 1 up to the square root of ``num``.
    """
    if num <= 1:
        return False
    if num <= 3:
        return True

    divisible_by_2_or_3 = num % 2 == 0 or num % 3 == 0
    if divisible_by_2_or_3:
        return False

    candidate = 5
    while candidate * candidate <= num:
        # candidate is 6k - 1 and candidate + 2 is 6k + 1
        for divisor in (candidate, candidate + 2):
            if num % divisor == 0:
                return False
        candidate += 6
    return True

# Function to find the highest prime number in an array
def find_highest_prime(arr):
    """Return the largest prime found in ``arr``, or -1 if it contains none.

    ``arr`` is iterated once and every element is tested with ``is_prime``.
    """
    return max((value for value in arr if is_prime(value)), default=-1)

# Create a Spark session
spark = SparkSession.builder \
    .appName("HighestPrimeNumber") \
    .getOrCreate()

try:
    # Number of chunks to build; also passed to parallelize() as the partition count
    num_threads = 4

    # Search the integers 0 .. N-1
    N = 121000
    arr = range(N)

    # Ceiling division keeps the chunk count at num_threads or fewer even when N
    # is not evenly divisible, and the floor of 1 avoids a zero step in range()
    # (a ValueError) when N is smaller than num_threads.
    chunk_size = max(1, -(-len(arr) // num_threads))
    chunks = [arr[i:i + chunk_size] for i in range(0, len(arr), chunk_size)]

    if chunks:
        # Distribute the chunks and find the highest prime within each one
        rdd = spark.sparkContext.parallelize(chunks, num_threads)
        highest_primes = rdd.map(find_highest_prime)

        # The overall answer is the largest of the per-chunk results
        max_prime = highest_primes.max()
    else:
        # Nothing to search; use the same "no prime" sentinel as find_highest_prime
        max_prime = -1

    # Print the max prime number
    print("Max Prime Number:", max_prime)
finally:
    # Stop the Spark session even if the job above fails
    spark.stop()


Max Prime Number: 120997
