### Set up PySpark and read the file

In [None]:
# Create a SparkContext object
from pyspark import SparkConf, SparkContext

# Create (or reuse) the SparkContext.
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# while one is already active in the kernel raises an error. The former
# `if __name__ == "__main__":` guard is dropped because it is always true in a
# notebook kernel and would only leave `sc` undefined anywhere else.
conf = SparkConf().setAppName("RDD_LifeExpectancy").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)

# Create an RDD of raw text lines from the CSV file
rdd = sc.textFile("Life Expectancy Data.csv")

# The first line is the header row: keep it aside and drop it from the data.
header = rdd.first()
# Column names are kept verbatim from the header line (no trimming is applied).
columns = header.split(',')
# NOTE(review): a plain split(',') assumes no field contains a quoted comma -
# TODO confirm against the dataset.
data = rdd.filter(lambda row: row != header).map(lambda row: row.split(','))

# Display the type
print("Type:",type(rdd))

Type: <class 'pyspark.rdd.RDD'>


In [None]:
# count(): number of elements in the RDD. `rdd` holds the raw text lines,
# so the header line is included in this figure.
row_total = rdd.count()
print("Number of rows:", row_total)

# first(): the first element of the RDD (here, the header line).
first_line = rdd.first()
print("\nHeader of RDD:\n", first_line)

# take(n): the first n elements, returned as a list.
leading_lines = rdd.take(2)
print("\nFirst 2 rows of RDD:\n", leading_lines)

# top(n): the n largest elements in descending order. The elements are
# strings, so the ordering is lexicographic - not by row length.
largest_lines = rdd.top(4)
print("\nSorted 4 rows of RDD:\n", largest_lines)

Number of rows: 2929

Header of RDD:
 Country,Year,Status,Life expectancy ,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles , BMI ,under-five deaths ,Polio,Total expenditure,Diphtheria , HIV/AIDS,GDP,Population, thinness  1-19 years, thinness 5-9 years,Income composition of resources,Schooling

First 2 rows of RDD:
 ['Country,Year,Status,Life expectancy ,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles , BMI ,under-five deaths ,Polio,Total expenditure,Diphtheria , HIV/AIDS,GDP,Population, thinness  1-19 years, thinness 5-9 years,Income composition of resources,Schooling', 'Afghanistan,2015,Developing,65,263,62,0.01,71.27962362,65,1154,19.1,83,6,8.16,65,0.1,584.25921,33736494,17.2,17.3,0.479,10.1']

Sorted 4 rows of RDD:
 ['Zimbabwe,2015,Developing,67,336,22,,0,87,0,31.8,32,88,,87,6.2,118.69383,15777451,5.6,5.5,0.507,10.3', 'Zimbabwe,2014,Developing,59.2,371,23,6.5,10.82259524,91,0,31.3,34,92,6.44,91,6.3,127.47462,15411675,

### Section A1:
**Query: Display the number of missing values for each column**

In [None]:
# Calculate missing values for each column.
# A field counts as missing when, after stripping whitespace, it is blank or
# the literal "na" (case-insensitive).
def _is_missing(val):
    """Return True when a raw CSV field holds no value (blank or 'na')."""
    token = val.strip().lower()
    return token == "" or token == "na"


# zip() pairs each field with its column name and stops at the shorter of the
# two, so a row with more fields than the header cannot raise IndexError.
_found = dict(
    data.flatMap(lambda row: [(name, 1) for name, val in zip(columns, row) if _is_missing(val)])
        .reduceByKey(lambda a, b: a + b)
        .collect()
)

# The flatMap only emits columns that have at least one missing value, so fill
# in an explicit 0 for the rest. Iterating over `columns` also gives a stable
# header order instead of the arbitrary order reduceByKey returns.
missing_counts = [(name, _found.get(name, 0)) for name in columns]

# Print result (one line per column, including columns with no missing values)
for col, count in missing_counts:
    print(f"Column: {col}, Missing Values: {count}")


Column: Alcohol, Missing Values: 193
Column: Total expenditure, Missing Values: 226
Column: Hepatitis B, Missing Values: 553
Column: Population, Missing Values: 644
Column: GDP, Missing Values: 443
Column: Income composition of resources, Missing Values: 160
Column: Schooling, Missing Values: 160
Column: Polio, Missing Values: 19
Column: Diphtheria , Missing Values: 19
Column: BMI , Missing Values: 32
Column: thinness 1-19 years, Missing Values: 32
Column: thinness 5-9 years, Missing Values: 32




---


### Section A2:
**Query: Display the total value of Adult Mortality for 2010 for developed countries**

In [None]:
# Resolve the column positions once on the driver instead of calling
# columns.index(...) again for every row inside the lambdas.
year_idx = columns.index("Year")
status_idx = columns.index("Status")
mortality_idx = columns.index("Adult Mortality")

# Filter for the year 2010 and developed countries
filtered_data = data.filter(
    lambda row: row[year_idx] == "2010" and row[status_idx] == "Developed"
)

# Extract the Adult Mortality values as integers.
# Blank fields are skipped: int("") would raise ValueError and abort the job.
mortality = (
    filtered_data.map(lambda row: row[mortality_idx].strip())
                 .filter(lambda value: value != "")
                 .map(int)
)

# sum() returns 0 for an empty RDD, so a filter that matches no rows yields a
# total of 0 instead of an IndexError from indexing an empty collect() result.
total_adult_mortality = mortality.sum()

# Print the result
print(f"Total Adult Mortality for Developed Countries in 2010: {total_adult_mortality}")

Total Adult Mortality for Developed Countries in 2010: 2385


In [None]:
# Stop the SparkContext `sc` now that all queries in the notebook have run.
sc.stop()