# Task 1: Big Data Analysis using PySpark

In [1]:
!pip install pyspark



In [2]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year

# Start Spark Session
spark = SparkSession.builder.appName("PeopleBigData").getOrCreate()

In [3]:

from google.colab import files
uploaded = files.upload()  # Upload people-100.csv manually

Saving people-100.csv to people-100.csv


In [4]:

# Load CSV file
df = spark.read.csv("people-100.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- Index: integer (nullable = true)
 |-- User Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Date of birth: date (nullable = true)
 |-- Job Title: string (nullable = true)

+-----+---------------+----------+---------+------+--------------------+--------------------+-------------+------------------+
|Index|        User Id|First Name|Last Name|   Sex|               Email|               Phone|Date of birth|         Job Title|
+-----+---------------+----------+---------+------+--------------------+--------------------+-------------+------------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|  Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|   Games developer|
|    2|f90cD3E76f1A9b9|   Phillip|  Summers|Female|bethany14@example...|   214.112.6044x4913|   1910-03-24|    Phytotherapist|
|    3

In [5]:

# Group by Sex
df.groupBy("Sex").count().show()

+------+-----+
|   Sex|count|
+------+-----+
|Female|   53|
|  Male|   47|
+------+-----+



In [6]:

# Group by Job Title
df.groupBy("Job Title").count().orderBy("count", ascending=False).show()

+--------------------+-----+
|           Job Title|count|
+--------------------+-----+
|Biochemist, clinical|    3|
|Accountant, chart...|    2|
|Scientist, clinic...|    2|
|  Jewellery designer|    2|
|Outdoor activitie...|    2|
|       Retail banker|    2|
| Seismic interpreter|    2|
|Research scientis...|    2|
|English as a seco...|    1|
|  Petroleum engineer|    1|
|Conservator, furn...|    1|
|          Counsellor|    1|
|      Hydrogeologist|    1|
|      Police officer|    1|
|IT sales professi...|    1|
|Education officer...|    1|
|Accounting techni...|    1|
|   Recycling officer|    1|
|     Games developer|    1|
|                Make|    1|
+--------------------+-----+
only showing top 20 rows



In [7]:

# Extract Birth Year from Date of Birth
df = df.withColumn("Birth_Year", year("Date of birth"))
df.groupBy("Birth_Year").count().orderBy("Birth_Year").show()

+----------+-----+
|Birth_Year|count|
+----------+-----+
|      1908|    3|
|      1909|    1|
|      1910|    2|
|      1911|    2|
|      1915|    1|
|      1916|    1|
|      1921|    1|
|      1924|    2|
|      1925|    1|
|      1926|    1|
|      1927|    1|
|      1930|    3|
|      1931|    2|
|      1932|    2|
|      1933|    1|
|      1938|    3|
|      1941|    1|
|      1942|    1|
|      1943|    1|
|      1944|    2|
+----------+-----+
only showing top 20 rows

