In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window


In [0]:
data = [
    (2012, "JOHN", "ONEIDA,ORANGE", "M", 30),
    (2012, "EMMA", "JEFFERSON", "F", 25),
    (2012, "NOAH", "JEFFERSON", "M", 18),
    (2012, "SOPHIA", "MADISON", "F", 12),
    (2012, "LIAM", "ONONDAGA", "M", 36),
    (2012, "OLIVIA", "CAYUGA", "F", 22),
    (2012, "MASON", "MADISON", "M", 28),
    (2012, "AVA", "JEFFERSON", "F", 20),
    (2012, "WILLIAM", "CAYUGA", "M", 45),
    (2012, "ABIGAIL", "MADISON", "F", 19)
]

In [0]:
schema = ["Year", "First Name", "County", "gender", "Count"]

In [0]:
df = spark.createDataFrame(data, schema=schema)
df.display()

Year,First Name,County,gender,Count
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,18
2012,SOPHIA,MADISON,F,12
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45
2012,ABIGAIL,MADISON,F,19


In [0]:
df_rdd=spark.sparkContext.parallelize(data)

In [0]:
df_rdd.collect()

Out[15]: [(2012, 'JOHN', 'ONEIDA,ORANGE', 'M', 30),
 (2012, 'EMMA', 'JEFFERSON', 'F', 25),
 (2012, 'NOAH', 'JEFFERSON', 'M', 18),
 (2012, 'SOPHIA', 'MADISON', 'F', 12),
 (2012, 'LIAM', 'ONONDAGA', 'M', 36),
 (2012, 'OLIVIA', 'CAYUGA', 'F', 22),
 (2012, 'MASON', 'MADISON', 'M', 28),
 (2012, 'AVA', 'JEFFERSON', 'F', 20),
 (2012, 'WILLIAM', 'CAYUGA', 'M', 45),
 (2012, 'ABIGAIL', 'MADISON', 'F', 19)]

In [0]:
filtered_df = df.filter((col("County") == "JEFFERSON") & (col("Gender") == "M"))
filtered_df.display()

Year,First Name,County,gender,Count
2012,NOAH,JEFFERSON,M,18


In [0]:
filter_df = df.filter(col("County").like("%ONONDAGA%"))
filter_df.display()

Year,First Name,County,gender,Count
2012,LIAM,ONONDAGA,M,36


In [0]:
df_with_double_count = df.withColumn("DoubleCount", col("Count") * 2)
df_with_double_count.display()

Year,First Name,County,gender,Count,DoubleCount
2012,JOHN,"ONEIDA,ORANGE",M,30,60
2012,EMMA,JEFFERSON,F,25,50
2012,NOAH,JEFFERSON,M,18,36
2012,SOPHIA,MADISON,F,12,24
2012,LIAM,ONONDAGA,M,36,72
2012,OLIVIA,CAYUGA,F,22,44
2012,MASON,MADISON,M,28,56
2012,AVA,JEFFERSON,F,20,40
2012,WILLIAM,CAYUGA,M,45,90
2012,ABIGAIL,MADISON,F,19,38


In [0]:
renamed_column_df = df.withColumnRenamed("Year", "BirthYear")
renamed_column_df.display()

BirthYear,First Name,County,gender,Count
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,18
2012,SOPHIA,MADISON,F,12
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45
2012,ABIGAIL,MADISON,F,19


In [0]:
sorted_df = df.orderBy(col("Count").desc())
sorted_df.display()

Year,First Name,County,gender,Count
2012,WILLIAM,CAYUGA,M,45
2012,LIAM,ONONDAGA,M,36
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,MASON,MADISON,M,28
2012,EMMA,JEFFERSON,F,25
2012,OLIVIA,CAYUGA,F,22
2012,AVA,JEFFERSON,F,20
2012,ABIGAIL,MADISON,F,19
2012,NOAH,JEFFERSON,M,18
2012,SOPHIA,MADISON,F,12


In [0]:
from pyspark.sql.functions import split
split_county_df = df.withColumn("CountyArray", split(col("County"), ","))
split_county_df.display()

Year,First Name,County,gender,Count,CountyArray
2012,JOHN,"ONEIDA,ORANGE",M,30,"List(ONEIDA, ORANGE)"
2012,EMMA,JEFFERSON,F,25,List(JEFFERSON)
2012,NOAH,JEFFERSON,M,18,List(JEFFERSON)
2012,SOPHIA,MADISON,F,12,List(MADISON)
2012,LIAM,ONONDAGA,M,36,List(ONONDAGA)
2012,OLIVIA,CAYUGA,F,22,List(CAYUGA)
2012,MASON,MADISON,M,28,List(MADISON)
2012,AVA,JEFFERSON,F,20,List(JEFFERSON)
2012,WILLIAM,CAYUGA,M,45,List(CAYUGA)
2012,ABIGAIL,MADISON,F,19,List(MADISON)


In [0]:
split_and_explode_df = df.withColumn("County", explode(split(col("County"), ",")))
split_and_explode_df.display()
# Split the "County" column into an array and explode it into separate rows.

Year,First Name,County,gender,Count
2012,JOHN,ONEIDA,M,30
2012,JOHN,ORANGE,M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,18
2012,SOPHIA,MADISON,F,12
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45


In [0]:
df_with_full_name = df.withColumn("FullName", concat_ws(", ", col("First Name"), col("County")))
df_with_full_name.display()

Year,First Name,County,gender,Count,FullName
2012,JOHN,"ONEIDA,ORANGE",M,30,"JOHN, ONEIDA,ORANGE"
2012,EMMA,JEFFERSON,F,25,"EMMA, JEFFERSON"
2012,NOAH,JEFFERSON,M,18,"NOAH, JEFFERSON"
2012,SOPHIA,MADISON,F,12,"SOPHIA, MADISON"
2012,LIAM,ONONDAGA,M,36,"LIAM, ONONDAGA"
2012,OLIVIA,CAYUGA,F,22,"OLIVIA, CAYUGA"
2012,MASON,MADISON,M,28,"MASON, MADISON"
2012,AVA,JEFFERSON,F,20,"AVA, JEFFERSON"
2012,WILLIAM,CAYUGA,M,45,"WILLIAM, CAYUGA"
2012,ABIGAIL,MADISON,F,19,"ABIGAIL, MADISON"


In [0]:
avg_count_by_gender = df.groupBy("Gender").agg({"Count": "avg"})
avg_count_by_gender.display()

Gender,avg(Count)
M,31.4
F,19.6


In [0]:
window_spec = Window.partitionBy("Gender").orderBy(col("Count").desc())
ranked_df = df.withColumn("Rank", rank().over(window_spec))
ranked_df.display()

Year,First Name,County,gender,Count,Rank
2012,EMMA,JEFFERSON,F,25,1
2012,OLIVIA,CAYUGA,F,22,2
2012,AVA,JEFFERSON,F,20,3
2012,ABIGAIL,MADISON,F,19,4
2012,SOPHIA,MADISON,F,12,5
2012,WILLIAM,CAYUGA,M,45,1
2012,LIAM,ONONDAGA,M,36,2
2012,JOHN,"ONEIDA,ORANGE",M,30,3
2012,MASON,MADISON,M,28,4
2012,NOAH,JEFFERSON,M,18,5


In [0]:
pivoted_df = df.groupBy("County").pivot("Gender").agg({"Count": "sum"})
pivoted_df.display()

County,F,M
MADISON,31.0,28
JEFFERSON,45.0,18
ONONDAGA,,36
CAYUGA,22.0,45
"ONEIDA,ORANGE",,30


In [0]:
diff_df = df.withColumn("CountDiff", col("Count") - lag(col("Count")).over(window_spec))
diff_df.display()

Year,First Name,County,gender,Count,CountDiff
2012,EMMA,JEFFERSON,F,25,
2012,OLIVIA,CAYUGA,F,22,-3.0
2012,AVA,JEFFERSON,F,20,-2.0
2012,ABIGAIL,MADISON,F,19,-1.0
2012,SOPHIA,MADISON,F,12,-7.0
2012,WILLIAM,CAYUGA,M,45,
2012,LIAM,ONONDAGA,M,36,-9.0
2012,JOHN,"ONEIDA,ORANGE",M,30,-6.0
2012,MASON,MADISON,M,28,-2.0
2012,NOAH,JEFFERSON,M,18,-10.0


In [0]:
updated_df = df.withColumn("Count", when(col("Count") < 20, 20).otherwise(col("Count")))
updated_df.display()

Year,First Name,County,gender,Count
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,20
2012,SOPHIA,MADISON,F,20
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45
2012,ABIGAIL,MADISON,F,20


In [0]:
def split_county(row):
    year = row["Year"]
    first_name = row["First Name"]
    counties = row["County"]
    gender = row["gender"]
    count = row["Count"]
    return [(year, first_name, county, gender, count) for county in counties]


In [0]:
flattened_df = df.rdd.flatMap(split_county).toDF(schema=schema)

In [0]:
flattened_df.display()

Year,First Name,County,gender,Count
2012,JOHN,O,M,30
2012,JOHN,N,M,30
2012,JOHN,E,M,30
2012,JOHN,I,M,30
2012,JOHN,D,M,30
2012,JOHN,A,M,30
2012,JOHN,",",M,30
2012,JOHN,O,M,30
2012,JOHN,R,M,30
2012,JOHN,A,M,30


In [0]:
sc.parallelize([2, 3, 4]).flatMap(lambda x: [x,x,x]).collect()

Out[32]: [2, 2, 2, 3, 3, 3, 4, 4, 4]

In [0]:
sc.parallelize([1,2,3]).map(lambda x: [x,x,x]).collect()

Out[33]: [[1, 1, 1], [2, 2, 2], [3, 3, 3]]

In [0]:
sc.parallelize([2, 3, 4]).flatMap(lambda x: [x,x,x])

Out[34]: PythonRDD[65] at RDD at PythonRDD.scala:58

In [0]:
#02/11/2023

In [0]:
updated_df = df.withColumn("Count", when(col("Count") < 20, 20).otherwise(col("Count")))
updated_df.display()

Year,First Name,County,gender,Count
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,20
2012,SOPHIA,MADISON,F,20
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45
2012,ABIGAIL,MADISON,F,20


In [0]:
diff_df = df.withColumn("CountDiff", col("Count") - lag(col("Count")).over(window_spec))
diff_df.display()

Year,First Name,County,gender,Count,CountDiff
2012,OLIVIA,CAYUGA,F,22,
2012,WILLIAM,CAYUGA,M,45,23.0
2012,EMMA,JEFFERSON,F,25,
2012,NOAH,JEFFERSON,M,18,-7.0
2012,AVA,JEFFERSON,F,20,2.0
2012,SOPHIA,MADISON,F,12,
2012,MASON,MADISON,M,28,16.0
2012,ABIGAIL,MADISON,F,19,-9.0
2012,JOHN,"ONEIDA,ORANGE",M,30,
2012,LIAM,ONONDAGA,M,36,


In [0]:
total_count = df.selectExpr("sum(Count) as TotalCount").collect()[0].TotalCount
percentage_df = df.withColumn("Percentage", (col("Count") / total_count) * 100)
percentage_df.display()

Year,First Name,County,gender,Count,Percentage
2012,JOHN,"ONEIDA,ORANGE",M,30,11.76470588235294
2012,EMMA,JEFFERSON,F,25,9.803921568627452
2012,NOAH,JEFFERSON,M,18,7.0588235294117645
2012,SOPHIA,MADISON,F,12,4.705882352941177
2012,LIAM,ONONDAGA,M,36,14.117647058823527
2012,OLIVIA,CAYUGA,F,22,8.627450980392156
2012,MASON,MADISON,M,28,10.980392156862743
2012,AVA,JEFFERSON,F,20,7.84313725490196
2012,WILLIAM,CAYUGA,M,45,17.647058823529413
2012,ABIGAIL,MADISON,F,19,7.450980392156863


In [0]:
age_grouped_df = df.withColumn("AgeGroup",
    when(col("Year") <= 2000, "Below 2000")
    .when((col("Year") > 2000) & (col("Year") <= 2010), "2001-2010")
    .otherwise("Above 2010"))
age_grouped_df.display()

Year,First Name,County,gender,Count,AgeGroup
2012,JOHN,"ONEIDA,ORANGE",M,30,Above 2010
2012,EMMA,JEFFERSON,F,25,Above 2010
2012,NOAH,JEFFERSON,M,18,Above 2010
2012,SOPHIA,MADISON,F,12,Above 2010
2012,LIAM,ONONDAGA,M,36,Above 2010
2012,OLIVIA,CAYUGA,F,22,Above 2010
2012,MASON,MADISON,M,28,Above 2010
2012,AVA,JEFFERSON,F,20,Above 2010
2012,WILLIAM,CAYUGA,M,45,Above 2010
2012,ABIGAIL,MADISON,F,19,Above 2010


In [0]:
window_spec = Window.partitionBy("Gender").orderBy("Year")
count_diff_next_df = df.withColumn("CountDiffNext", lead(col("Count")).over(window_spec))
count_diff_next_df.display()

Year,First Name,County,gender,Count,CountDiffNext
2012,EMMA,JEFFERSON,F,25,12.0
2012,SOPHIA,MADISON,F,12,22.0
2012,OLIVIA,CAYUGA,F,22,20.0
2012,AVA,JEFFERSON,F,20,19.0
2012,ABIGAIL,MADISON,F,19,
2012,JOHN,"ONEIDA,ORANGE",M,30,18.0
2012,NOAH,JEFFERSON,M,18,36.0
2012,LIAM,ONONDAGA,M,36,28.0
2012,MASON,MADISON,M,28,45.0
2012,WILLIAM,CAYUGA,M,45,


In [0]:
filtered_gender_df = df.filter((col("Gender") == "M") | (col("Gender") == "F"))
filtered_gender_df.display()

Year,First Name,County,gender,Count
2012,JOHN,"ONEIDA,ORANGE",M,30
2012,EMMA,JEFFERSON,F,25
2012,NOAH,JEFFERSON,M,18
2012,SOPHIA,MADISON,F,12
2012,LIAM,ONONDAGA,M,36
2012,OLIVIA,CAYUGA,F,22
2012,MASON,MADISON,M,28
2012,AVA,JEFFERSON,F,20
2012,WILLIAM,CAYUGA,M,45
2012,ABIGAIL,MADISON,F,19


In [0]:
average_count_by_age_group = age_grouped_df.groupBy("AgeGroup").agg({"Count": "avg"})
average_count_by_age_group.display()

AgeGroup,avg(Count)
Above 2010,25.5


In [0]:
high_count_threshold = 30
df_with_high_count = df.withColumn("IsHighCount", when(col("Count") > high_count_threshold, "Yes").otherwise("No"))
df_with_high_count.display()

Year,First Name,County,gender,Count,IsHighCount
2012,JOHN,"ONEIDA,ORANGE",M,30,No
2012,EMMA,JEFFERSON,F,25,No
2012,NOAH,JEFFERSON,M,18,No
2012,SOPHIA,MADISON,F,12,No
2012,LIAM,ONONDAGA,M,36,Yes
2012,OLIVIA,CAYUGA,F,22,No
2012,MASON,MADISON,M,28,No
2012,AVA,JEFFERSON,F,20,No
2012,WILLIAM,CAYUGA,M,45,Yes
2012,ABIGAIL,MADISON,F,19,No


In [0]:
window_spec = Window.partitionBy("County").orderBy("Year")
count_diff_previous_df = df.withColumn("CountDiffPrevious", col("Count") - lag(col("Count")).over(window_spec))
count_diff_previous_df.display()

Year,First Name,County,gender,Count,CountDiffPrevious
2012,OLIVIA,CAYUGA,F,22,
2012,WILLIAM,CAYUGA,M,45,23.0
2012,EMMA,JEFFERSON,F,25,
2012,NOAH,JEFFERSON,M,18,-7.0
2012,AVA,JEFFERSON,F,20,2.0
2012,SOPHIA,MADISON,F,12,
2012,MASON,MADISON,M,28,16.0
2012,ABIGAIL,MADISON,F,19,-9.0
2012,JOHN,"ONEIDA,ORANGE",M,30,
2012,LIAM,ONONDAGA,M,36,


In [0]:
# 10. Calculate the cumulative sum of "Count" within each county group.
window_spec = Window.partitionBy("County").orderBy("Year")
cumulative_sum_by_county_df = df.withColumn("CumulativeSumByCounty", sum(col("Count")).over(window_spec))
cumulative_sum_by_county_df.display()

Year,First Name,County,gender,Count,CumulativeSumByCounty
2012,OLIVIA,CAYUGA,F,22,67
2012,WILLIAM,CAYUGA,M,45,67
2012,EMMA,JEFFERSON,F,25,63
2012,NOAH,JEFFERSON,M,18,63
2012,AVA,JEFFERSON,F,20,63
2012,SOPHIA,MADISON,F,12,59
2012,MASON,MADISON,M,28,59
2012,ABIGAIL,MADISON,F,19,59
2012,JOHN,"ONEIDA,ORANGE",M,30,30
2012,LIAM,ONONDAGA,M,36,36
