Pengenalan Spark DataFrames

In [3]:
# Membuat DataFrame sederhana dan operasi dasar
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Praktikum 3').getOrCreate()

data = [('Hanif', 'Manajer', 3000),
        ('Suki', 'Admin', 4500),
        ('Alex', 'Penjualan', 4200),
        ('Andra', 'Penjualan', 2900)]
columns = ['Nama Karyawan', 'Devisi', 'Gaji ($)']

df = spark.createDataFrame(data, schema=columns)
df.show()

+-------------+---------+--------+
|Nama Karyawan|   Devisi|Gaji ($)|
+-------------+---------+--------+
|        Hanif|  Manajer|    3000|
|         Suki|    Admin|    4500|
|         Alex|Penjualan|    4200|
|        Andra|Penjualan|    2900|
+-------------+---------+--------+



Transformasi Dasar dengan DataFrames

In [9]:
# Membuat DataFrame sederhana dan operasi dasar
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Praktikum 3').getOrCreate()

data = [('Hanif', 'Manajer', 3000),
        ('Suki', 'Admin', 4500),
        ('Alex', 'Penjualan', 4200),
        ('Andra', 'Penjualan', 2900)]
columns = ['Nama Karyawan', 'Devisi', 'Gaji ($)']

# Contoh operasi transformasi DataFrame
df.select('Nama Karyawan', 'Gaji ($)').show()
df.filter(df['Gaji ($)'] > 3000).show()
df.groupBy('Devisi').avg('Gaji ($)').show()
df.groupBy().sum("Gaji ($)").show()

+-------------+--------+
|Nama Karyawan|Gaji ($)|
+-------------+--------+
|        Hanif|    3000|
|         Suki|    4500|
|         Alex|    4200|
|        Andra|    2900|
+-------------+--------+

+-------------+---------+--------+
|Nama Karyawan|   Devisi|Gaji ($)|
+-------------+---------+--------+
|         Suki|    Admin|    4500|
|         Alex|Penjualan|    4200|
+-------------+---------+--------+

+---------+-------------+
|   Devisi|avg(Gaji ($))|
+---------+-------------+
|    Admin|       4500.0|
|  Manajer|       3000.0|
|Penjualan|       3550.0|
+---------+-------------+

+-------------+
|sum(Gaji ($))|
+-------------+
|        14600|
+-------------+



Belajar dengan Tipe Data Kompleks

In [12]:
# Membuat DataFrame sederhana dan operasi dasar
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Praktikum 3').getOrCreate()

data = [('Hanif', 'Manajer', 3000),
        ('Suki', 'Admin', 4500),
        ('Alex', 'Penjualan', 4200),
        ('Andra', 'Penjualan', 2900)]
columns = ['Nama Karyawan', 'Devisi', 'Gaji ($)']

df = spark.createDataFrame(data, schema=columns)

# Contoh manipulasi tipe data kompleks
df = df.withColumn('Bonus Gaji', df['Gaji ($)'] * 0.1)
df1 = df.withColumn('Total Gaji', df['Gaji ($)'] + df['Bonus Gaji'])

df.show()
df1.show()

+-------------+---------+--------+----------+
|Nama Karyawan|   Devisi|Gaji ($)|Bonus Gaji|
+-------------+---------+--------+----------+
|        Hanif|  Manajer|    3000|     300.0|
|         Suki|    Admin|    4500|     450.0|
|         Alex|Penjualan|    4200|     420.0|
|        Andra|Penjualan|    2900|     290.0|
+-------------+---------+--------+----------+

+-------------+---------+--------+----------+----------+
|Nama Karyawan|   Devisi|Gaji ($)|Bonus Gaji|Total Gaji|
+-------------+---------+--------+----------+----------+
|        Hanif|  Manajer|    3000|     300.0|    3300.0|
|         Suki|    Admin|    4500|     450.0|    4950.0|
|         Alex|Penjualan|    4200|     420.0|    4620.0|
|        Andra|Penjualan|    2900|     290.0|    3190.0|
+-------------+---------+--------+----------+----------+



Operasi Dasar Lanjutan (Implementasi Window Function)

In [19]:
# Membuat DataFrame sederhana dan operasi dasar
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Praktikum 3').getOrCreate()

data = [('Hanif', 'Manajer', 3000),
        ('Suki', 'Admin', 4500),
        ('Alex', 'Penjualan', 4200),
        ('Andra', 'Penjualan', 2900)]
columns = ['Nama Karyawan', 'Devisi', 'Gaji ($)']

df = spark.createDataFrame(data, schema=columns)

# Contoh menggunakan window functions
from pyspark.sql.window import Window
from pyspark.sql import functions as F

windowSpec = Window.orderBy('Gaji ($)')
df.withColumn('Tingkatan', F.rank().over(windowSpec)).show()

+-------------+---------+--------+---------+
|Nama Karyawan|   Devisi|Gaji ($)|Tingkatan|
+-------------+---------+--------+---------+
|        Andra|Penjualan|    2900|        1|
|        Hanif|  Manajer|    3000|        2|
|         Alex|Penjualan|    4200|        3|
|         Suki|    Admin|    4500|        4|
+-------------+---------+--------+---------+



Eksplorasi Lebih Lanjut

In [23]:
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Praktikum").getOrCreate()

dataframe = spark.read.csv("/kaggle/datasets/genshin.csv", header=True, inferSchema=True)
dataframe.show(dataframe.count())

+--------------------+------+---------+-------+------+-----------+-------------+--------------------+---------------+--------------------+--------------------+------------+-------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+--------+---------+---------+-------+--------+--------+----------+---------+---

In [24]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

result = dataframe.select("character_name", "rarity", "region", "vision") \
    .filter(F.col("vision") == "Electro")

result.show(truncate=False)

+--------------------------+------+---------+-------+
|character_name            |rarity|region   |vision |
+--------------------------+------+---------+-------+
|Beidou                    |4     |Liyue    |Electro|
|Cyno                      |5     |Sumeru   |Electro|
|Dori                      |4     |Sumeru   |Electro|
|Fischl                    |4     |Mondstadt|Electro|
|Keqing                    |5     |Liyue    |Electro|
|Kujou Sara                |4     |Inazuma  |Electro|
|Kuki Shinobu              |4     |Inazuma  |Electro|
|Lisa                      |4     |Mondstadt|Electro|
|Raiden Shogun             |5     |Inazuma  |Electro|
|Razor                     |4     |Mondstadt|Electro|
|Traveler (Aether, Electro)|5     |NA       |Electro|
|Traveler (Lumine, Electro)|5     |NA       |Electro|
|Yae Miko                  |5     |Inazuma  |Electro|
+--------------------------+------+---------+-------+



In [25]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

result = dataframe.select("character_name", "rarity", "vision", "affiliation") \
    .filter(F.col("affiliation") == "Knights of Favonius")

result.show(truncate=False)

+--------------+------+-------+-------------------+
|character_name|rarity|vision |affiliation        |
+--------------+------+-------+-------------------+
|Albedo        |5     |Geo    |Knights of Favonius|
|Amber         |4     |Pyro   |Knights of Favonius|
|Eula          |5     |Cryo   |Knights of Favonius|
|Jean          |5     |Anemo  |Knights of Favonius|
|Kaeya         |4     |Cryo   |Knights of Favonius|
|Klee          |5     |Pyro   |Knights of Favonius|
|Lisa          |4     |Electro|Knights of Favonius|
|Mika          |4     |Cryo   |Knights of Favonius|
|Noelle        |4     |Geo    |Knights of Favonius|
|Sucrose       |4     |Anemo  |Knights of Favonius|
+--------------+------+-------+-------------------+



In [26]:
vision_count = dataframe.groupBy("vision").count().orderBy(F.desc("count"))

vision_common = vision_count.first()["vision"]
result = dataframe.filter(F.col("vision") == vision_common) \
                 .select("character_name", "rarity", "region", "vision")

result.show(truncate=False)

+--------------+------+---------+------+
|character_name|rarity|region   |vision|
+--------------+------+---------+------+
|Aloy          |5     |NA       |Cryo  |
|Charlotte     |4     |Fontaine |Cryo  |
|Chongyun      |4     |Liyue    |Cryo  |
|Diona         |4     |Mondstadt|Cryo  |
|Eula          |5     |Mondstadt|Cryo  |
|Freminet      |4     |Fontaine |Cryo  |
|Ganyu         |5     |Liyue    |Cryo  |
|Kaeya         |4     |Mondstadt|Cryo  |
|Kamisato Ayaka|5     |Inazuma  |Cryo  |
|Layla         |4     |Sumeru   |Cryo  |
|Mika          |4     |Mondstadt|Cryo  |
|Qiqi          |5     |Liyue    |Cryo  |
|Rosaria       |4     |Mondstadt|Cryo  |
|Shenhe        |5     |Liyue    |Cryo  |
|Wriothesley   |5     |Fontaine |Cryo  |
+--------------+------+---------+------+

