In [1]:
# Analyzing Suicide Trends in the Philippines using PySpark (1979-2016) 

In [2]:
# Import libraries
from pyspark.sql import SparkSession
import csv
from io import StringIO
from pyspark.sql.functions import col
from pyspark.sql.functions import sum, count

In [3]:
# Start the Spark session for this analysis (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName("Philippine_Suicide_Data_Analysis")
    .getOrCreate()
)

# The SparkContext is the entry point for the RDD API used in the next cells
sc = spark.sparkContext

In [4]:
# RDD Data Manipulation
# Load the CSV as an RDD of raw text lines; nothing is parsed at this point.
rdd = sc.textFile("who_suicide_statistics.csv")
# Print the first line and the total number of lines in the file.
print(f"first elements of the rdd: {rdd.first()}")
print(f"number of elements of the rdd: {rdd.count()}")

first elements of the rdd: country,year,sex,age,suicides_no,population
number of elements of the rdd: 43777


In [5]:
# Remove header
# The first line holds the column names: keep it in `header` (it is reused to
# name the fields when parsing) and exclude every line equal to it.
header = rdd.first()
data_rdd = rdd.filter(lambda line: line != header)

# Sanity check: first remaining line and remaining line count
print(f"first elements of the rdd after removing the header: {data_rdd.first()}")
print(f"number of elements of the rdd after removing the header: {data_rdd.count()}")

first elements of the rdd after removing the header: Albania,1985,female,15-24 years,,277900
number of elements of the rdd after removing the header: 43776


In [6]:
# Helper function to parse a CSV row.
def parse_csv(row):
    """Parse one raw CSV line into a dict keyed by the header's column names.

    Args:
        row: a single line of CSV text (without the header line).

    Returns:
        The mapping produced by csv.DictReader for that line; every value is
        still a string at this stage.
    """
    column_names = header.split(',')
    line_buffer = StringIO(row)
    return next(csv.DictReader(line_buffer, fieldnames=column_names))

# Turn every raw line into a dict of column name -> string value
parsed_rdd = data_rdd.map(parse_csv)

# Keep only the records whose country field is exactly 'Philippines'
philippines_rdd = parsed_rdd.filter(
    lambda record: record['country'] == 'Philippines'
)

In [7]:
# Using collect() method
# collect() returns every element of the RDD to the driver as a Python list,
# so this prints all Philippine records at once.
print(philippines_rdd.collect())

[{'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '15-24 years', 'suicides_no': '', 'population': '5015000'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '25-34 years', 'suicides_no': '', 'population': '3417900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '35-54 years', 'suicides_no': '', 'population': '3865100'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '5-14 years', 'suicides_no': '', 'population': '6017800'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '55-74 years', 'suicides_no': '', 'population': '1732900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '75+ years', 'suicides_no': '', 'population': '264600'}, {'country': 'Philippines', 'year': '1980', 'sex': 'male', 'age': '15-24 years', 'suicides_no': '', 'population': '4907900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'male', 'age': '25-34 years', 'suicides_no': '', 'population': '31

In [8]:
# Using take() method
# take(10) returns only the first 10 elements instead of the whole RDD.
print(philippines_rdd.take(10))

[{'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '15-24 years', 'suicides_no': '', 'population': '5015000'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '25-34 years', 'suicides_no': '', 'population': '3417900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '35-54 years', 'suicides_no': '', 'population': '3865100'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '5-14 years', 'suicides_no': '', 'population': '6017800'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '55-74 years', 'suicides_no': '', 'population': '1732900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'female', 'age': '75+ years', 'suicides_no': '', 'population': '264600'}, {'country': 'Philippines', 'year': '1980', 'sex': 'male', 'age': '15-24 years', 'suicides_no': '', 'population': '4907900'}, {'country': 'Philippines', 'year': '1980', 'sex': 'male', 'age': '25-34 years', 'suicides_no': '', 'population': '31

In [9]:
# Count the Philippine records whose suicides_no field is an empty string
empty_suicides_no_rdd = philippines_rdd.filter(
    lambda record: record['suicides_no'] == ''
)
print(f"empty suicide number columns count: {empty_suicides_no_rdd.count()}")

empty suicide number columns count: 96


In [10]:
# Handle missing suicides_no values
def replace_empty_with_zero(row):
    """Return `row` with a missing 'suicides_no' replaced by 0.

    A value counts as missing when it is None or the empty string (the form an
    empty CSV field takes after parsing). All other fields of the record are
    preserved, so downstream code can still read keys such as 'age' and
    'population'.

    Args:
        row: dict for one record; must contain the key 'suicides_no'.

    Returns:
        The same `row` object when 'suicides_no' is present, otherwise a
        shallow copy of `row` with 'suicides_no' set to 0. The input is never
        mutated.
    """
    value = row['suicides_no']
    if value is None or value == '':
        # Copy instead of mutating, and keep every other field of the record.
        return {**row, 'suicides_no': 0}
    return row

# Drop every record whose suicides_no is the empty string, then pass the
# remaining records through replace_empty_with_zero.
philippines_rdd = (
    philippines_rdd
    .filter(lambda record: record['suicides_no'] != '')
    .map(replace_empty_with_zero)
)

# Re-count the records that still have an empty suicides_no
empty_suicides_no_rdd = philippines_rdd.filter(
    lambda record: record['suicides_no'] == ''
)
print(f"empty suicide number columns count: {empty_suicides_no_rdd.count()}")

empty suicide number columns count: 0


In [11]:
# Group By

# Key every record by its age group. Mapping straight to (age, record) pairs
# produces the same pairs as groupBy + flatMap, without shuffling all records
# only to flatten the groups again.
flattened_rdd = philippines_rdd.map(lambda row: (row['age'], row))

# Compute suicide rates: one rate per record, expressed per 100,000 people
suicide_rates_rdd = flattened_rdd.map(lambda x: (
    x[0],  # age group
    float(x[1]['suicides_no']) / float(x[1]['population']) * 100000  # suicide rate per 100,000
))

# Aggregate by age group: average the per-record rates, where each record's
# rate = (suicides_no / population) x 100,000.
# The combiner carries a (sum_of_rates, record_count) pair per age group.
# NOTE(review): this is an unweighted mean of per-record rates, not the pooled
# rate (total suicides / total population) - confirm that is what is wanted.
aggregated_rdd = suicide_rates_rdd.combineByKey(
    lambda value: (value, 1),
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])
).map(lambda x: (x[0], x[1][0] / x[1][1]))

# Collect and print results (the value is a rate per 100,000, not a percentage)
results = aggregated_rdd.collect()
for result in results:
    print(f"Age Group: {result[0]}, Average Suicide Rate: {result[1]:.2f} per 100,000")

Age Group: 15-24 years, Average Suicide Rate: 2.84%
Age Group: 5-14 years, Average Suicide Rate: 0.16%
Age Group: 55-74 years, Average Suicide Rate: 2.46%
Age Group: 75+ years, Average Suicide Rate: 4.04%
Age Group: 25-34 years, Average Suicide Rate: 2.78%
Age Group: 35-54 years, Average Suicide Rate: 2.24%


In [12]:
# Sort by suicide rate in descending order
sorted_rdd = aggregated_rdd.sortBy(lambda x: x[1], ascending=False)

# Collect and print results. Each value is computed as a rate per 100,000
# people, so it is labelled that way rather than with a percent sign.
results = sorted_rdd.collect()
for result in results:
    print(f"Age Group: {result[0]}, Average Suicide Rate: {result[1]:.2f} per 100,000")

Age Group: 75+ years, Average Suicide Rate: 4.04%
Age Group: 15-24 years, Average Suicide Rate: 2.84%
Age Group: 25-34 years, Average Suicide Rate: 2.78%
Age Group: 55-74 years, Average Suicide Rate: 2.46%
Age Group: 35-54 years, Average Suicide Rate: 2.24%
Age Group: 5-14 years, Average Suicide Rate: 0.16%


In [13]:
# Dataframe Data Manipulation

# Load the same CSV as a DataFrame: the first line supplies the column names
# and the column types are inferred from the data.
df = spark.read.csv(
    "who_suicide_statistics.csv",
    inferSchema=True,
    header=True,
)

# Inspect the inferred schema
df.printSchema()

# Preview the raw rows, nulls included
print("Before dropping nulls:")
df.show()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)

Before dropping nulls:
+-------+----+------+-----------+-----------+----------+
|country|year|   sex|        age|suicides_no|population|
+-------+----+------+-----------+-----------+----------+
|Albania|1985|female|15-24 years|       NULL|    277900|
|Albania|1985|female|25-34 years|       NULL|    246800|
|Albania|1985|female|35-54 years|       NULL|    267500|
|Albania|1985|female| 5-14 years|       NULL|    298300|
|Albania|1985|female|55-74 years|       NULL|    138700|
|Albania|1985|female|  75+ years|       NULL|     34200|
|Albania|1985|  male|15-24 years|       NULL|    301400|
|Albania|1985|  male|25-34 years|       NULL|    264200|
|Albania|1985|  male|35-54 years|       NULL|    296700|
|Albania|1985|  male| 5-14 years|       NULL|    32580

In [14]:
# Drop every row that has a null in any column (all columns are checked)
columns_to_check = df.columns
cleaned_df = df.dropna(how="any", subset=columns_to_check)

# Preview the rows that remain once nulls are gone
print("After dropping nulls:")
cleaned_df.show()

After dropping nulls:
+-------+----+------+-----------+-----------+----------+
|country|year|   sex|        age|suicides_no|population|
+-------+----+------+-----------+-----------+----------+
|Albania|1987|female|15-24 years|         14|    289700|
|Albania|1987|female|25-34 years|          4|    257200|
|Albania|1987|female|35-54 years|          6|    278800|
|Albania|1987|female| 5-14 years|          0|    311000|
|Albania|1987|female|55-74 years|          0|    144600|
|Albania|1987|female|  75+ years|          1|     35600|
|Albania|1987|  male|15-24 years|         21|    312900|
|Albania|1987|  male|25-34 years|          9|    274300|
|Albania|1987|  male|35-54 years|         16|    308000|
|Albania|1987|  male| 5-14 years|          0|    338200|
|Albania|1987|  male|55-74 years|          1|    137500|
|Albania|1987|  male|  75+ years|          1|     21800|
|Albania|1988|female|15-24 years|          8|    295600|
|Albania|1988|female|25-34 years|          5|    262400|
|Albania|

In [15]:
# Show the first few rows of the DataFrame before filtering
# (cleaned_df has had null rows dropped but no country filter applied yet)
print("Before filtering:")
cleaned_df.show()

Before filtering:
+-------+----+------+-----------+-----------+----------+
|country|year|   sex|        age|suicides_no|population|
+-------+----+------+-----------+-----------+----------+
|Albania|1987|female|15-24 years|         14|    289700|
|Albania|1987|female|25-34 years|          4|    257200|
|Albania|1987|female|35-54 years|          6|    278800|
|Albania|1987|female| 5-14 years|          0|    311000|
|Albania|1987|female|55-74 years|          0|    144600|
|Albania|1987|female|  75+ years|          1|     35600|
|Albania|1987|  male|15-24 years|         21|    312900|
|Albania|1987|  male|25-34 years|          9|    274300|
|Albania|1987|  male|35-54 years|         16|    308000|
|Albania|1987|  male| 5-14 years|          0|    338200|
|Albania|1987|  male|55-74 years|          1|    137500|
|Albania|1987|  male|  75+ years|          1|     21800|
|Albania|1988|female|15-24 years|          8|    295600|
|Albania|1988|female|25-34 years|          5|    262400|
|Albania|1988

In [16]:
# Filter
# Keep only the rows where the country is "Philippines", then remove exact
# duplicate rows. The predicate uses col('country') so it is resolved against
# cleaned_df itself instead of borrowing a column reference from the parent
# DataFrame `df`.
filtered_df = cleaned_df.filter(col('country') == 'Philippines').dropDuplicates()

# Show the first few rows of the filtered DataFrame
print("After filtering for Philippines:")
filtered_df.show()

After filtering for Philippines:
+-----------+----+------+-----------+-----------+----------+
|    country|year|   sex|        age|suicides_no|population|
+-----------+----+------+-----------+-----------+----------+
|Philippines|1993|female|  75+ years|         10|    395945|
|Philippines|2006|female| 5-14 years|         28|  10121349|
|Philippines|2002|female|15-24 years|        114|   7903007|
|Philippines|1992|female|  75+ years|          2|    385751|
|Philippines|2008|  male|55-74 years|        144|   3349085|
|Philippines|1993|female|35-54 years|         54|   5754827|
|Philippines|1993|  male| 5-14 years|          0|   8673303|
|Philippines|1999|  male|25-34 years|        199|   5840463|
|Philippines|2009|  male|25-34 years|        383|   7144648|
|Philippines|2002|  male|25-34 years|        267|   6171400|
|Philippines|2000|  male|35-54 years|        294|   7449254|
|Philippines|2002|  male|55-74 years|         93|   2525030|
|Philippines|2010|  male|25-34 years|        400|   

In [17]:
# Group By

# Ensure 'suicides_no' is an integer column before aggregating
filtered_df = filtered_df.withColumn(
    'suicides_no',
    col('suicides_no').cast('integer'),
)

# Total suicides per age group
grouped_by_age_df = (
    filtered_df
    .groupBy('age')
    .agg(sum(col('suicides_no')).alias('total_suicides_no'))
)

# Display the per-age-group totals
grouped_by_age_df.show()

+-----------+-----------------+
|        age|total_suicides_no|
+-----------+-----------------+
|55-74 years|             2168|
|25-34 years|             5358|
| 5-14 years|              493|
|  75+ years|              488|
|15-24 years|             7107|
|35-54 years|             5716|
+-----------+-----------------+



In [18]:
# Order the age groups from the highest to the lowest total
grouped_by_age_df = grouped_by_age_df.sort(col("total_suicides_no").desc())
grouped_by_age_df.show()

+-----------+-----------------+
|        age|total_suicides_no|
+-----------+-----------------+
|15-24 years|             7107|
|35-54 years|             5716|
|25-34 years|             5358|
|55-74 years|             2168|
| 5-14 years|              493|
|  75+ years|              488|
+-----------+-----------------+



In [19]:
# grouped_by_age_df = grouped_by_age_df.withColumnRenamed("suicides_no", "total_suicides_no")

In [20]:
# Shut down the Spark session now that the analysis is finished
spark.stop()