<a href="https://colab.research.google.com/github/LikaAlfi/BigData-Prak3/blob/main/bigdatatugas3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# Task 1
# SparkSession is the entry point to the DataFrame API
from pyspark.sql import SparkSession

# Obtain a SparkSession named for this exercise
spark = (
    SparkSession.builder
    .appName("Contoh DataFrame Sederhana")
    .getOrCreate()
)

# Sample records as plain tuples: (id, nama, umur)
data = [(1, "Arjuna", 35), (2, "Lintang", 24), (3, "Naruna", 22), (4, "Nakula", 30)]

# Build the Spark DataFrame; column types are inferred from the tuples
df = spark.createDataFrame(data, schema=["id", "nama", "umur"])

# Full contents of the DataFrame
print("Isi DataFrame:")
df.show()

# Column names, inferred types and nullability
print("Skema DataFrame:")
df.printSchema()

# Projection: keep only the 'nama' column
print("Ambil kolom nama:")
df.select("nama").show()

# Row filter: keep people older than 25
print("Filter umur > 25:")
df.where(df["umur"] > 25).show()

# Number of rows in the DataFrame
print("Jumlah baris DataFrame:")
print(df.count())


Isi DataFrame:
+---+-------+----+
| id|   nama|umur|
+---+-------+----+
|  1| Arjuna|  35|
|  2|Lintang|  24|
|  3| Naruna|  22|
|  4| Nakula|  30|
+---+-------+----+

Skema DataFrame:
root
 |-- id: long (nullable = true)
 |-- nama: string (nullable = true)
 |-- umur: long (nullable = true)

Ambil kolom nama:
+-------+
|   nama|
+-------+
| Arjuna|
|Lintang|
| Naruna|
| Nakula|
+-------+

Filter umur > 25:
+---+------+----+
| id|  nama|umur|
+---+------+----+
|  1|Arjuna|  35|
|  4|Nakula|  30|
+---+------+----+

Jumlah baris DataFrame:
4


In [36]:
# Task 2
# SparkSession is the entry point to the DataFrame API
from pyspark.sql import SparkSession

# Obtain a SparkSession named for this exercise
spark = (
    SparkSession.builder
    .appName("Transformasi DataFrame Tanpa Row")
    .getOrCreate()
)

# Employee records as plain tuples (no Row objects):
# (id, nama, umur, departemen, gaji)
data = [
    (1, "Dika", 23, "IT", 5000000),
    (2, "Dario", 30, "HR", 4500000),
    (3, "Eli", 27, "IT", 6000000),
    (4, "Mika", 22, "Finance", 4000000),
    (5, "Harfa", 35, "Finance", 7000000),
]

# Build the DataFrame with explicit column names
df = spark.createDataFrame(
    data,
    schema=["id", "nama", "umur", "departemen", "gaji"],
)

# Full contents of the DataFrame
print("Isi DataFrame:")
df.show()

# --- Example DataFrame transformations ---

# Projection: only the name and salary columns
df.select("nama", "gaji").show()

# Row filter: employees older than 25
df.where(df.umur > 25).show()

# Group once by department and reuse the grouping for each aggregate
by_department = df.groupBy("departemen")

# Average salary per department
by_department.avg("gaji").show()

# Highest salary per department
by_department.max("gaji").show()

# Total salary per department
by_department.sum("gaji").show()


Isi DataFrame:
+---+-----+----+----------+-------+
| id| nama|umur|departemen|   gaji|
+---+-----+----+----------+-------+
|  1| Dika|  23|        IT|5000000|
|  2|Dario|  30|        HR|4500000|
|  3|  Eli|  27|        IT|6000000|
|  4| Mika|  22|   Finance|4000000|
|  5|Harfa|  35|   Finance|7000000|
+---+-----+----+----------+-------+

+-----+-------+
| nama|   gaji|
+-----+-------+
| Dika|5000000|
|Dario|4500000|
|  Eli|6000000|
| Mika|4000000|
|Harfa|7000000|
+-----+-------+

+---+-----+----+----------+-------+
| id| nama|umur|departemen|   gaji|
+---+-----+----+----------+-------+
|  2|Dario|  30|        HR|4500000|
|  3|  Eli|  27|        IT|6000000|
|  5|Harfa|  35|   Finance|7000000|
+---+-----+----+----------+-------+

+----------+---------+
|departemen|avg(gaji)|
+----------+---------+
|        HR|4500000.0|
|        IT|5500000.0|
|   Finance|5500000.0|
+----------+---------+

+----------+---------+
|departemen|max(gaji)|
+----------+---------+
|        HR|  4500000|
|       

In [37]:
# Task 3
# SparkSession is the entry point to the DataFrame API
from pyspark.sql import SparkSession

# Obtain a SparkSession named for this exercise
spark = (
    SparkSession.builder
    .appName("Manipulasi DataFrame Tanpa Kategori Umur")
    .getOrCreate()
)

# Employee records as plain tuples: (id, nama, umur, departemen, gaji)
data = [
    (1, "Dika", 23, "IT", 5000000),
    (2, "Dario", 30, "HR", 4500000),
    (3, "Eli", 27, "IT", 6000000),
    (4, "Mika", 22, "Finance", 4000000),
    (5, "Harfa", 35, "Finance", 7000000),
]

# Build the DataFrame with explicit column names
df = spark.createDataFrame(
    data,
    schema=["id", "nama", "umur", "departemen", "gaji"],
)

print("Data asli:")
df.show()

# Derived column: tax is 10% of the salary
df_with_tax = df.withColumn("pajak", df.gaji * 0.1)

# Derived column: net salary is salary minus the tax computed above
df_final = df_with_tax.withColumn(
    "gaji_bersih",
    df_with_tax.gaji - df_with_tax.pajak,
)

print("Data setelah manipulasi (tanpa kategori umur):")
df_final.show()



Data asli:
+---+-----+----+----------+-------+
| id| nama|umur|departemen|   gaji|
+---+-----+----+----------+-------+
|  1| Dika|  23|        IT|5000000|
|  2|Dario|  30|        HR|4500000|
|  3|  Eli|  27|        IT|6000000|
|  4| Mika|  22|   Finance|4000000|
|  5|Harfa|  35|   Finance|7000000|
+---+-----+----+----------+-------+

Data setelah manipulasi (tanpa kategori umur):
+---+-----+----+----------+-------+--------+-----------+
| id| nama|umur|departemen|   gaji|   pajak|gaji_bersih|
+---+-----+----+----------+-------+--------+-----------+
|  1| Dika|  23|        IT|5000000|500000.0|  4500000.0|
|  2|Dario|  30|        HR|4500000|450000.0|  4050000.0|
|  3|  Eli|  27|        IT|6000000|600000.0|  5400000.0|
|  4| Mika|  22|   Finance|4000000|400000.0|  3600000.0|
|  5|Harfa|  35|   Finance|7000000|700000.0|  6300000.0|
+---+-----+----+----------+-------+--------+-----------+



In [38]:
# Task 4
# SparkSession for the DataFrame API, Window for window specifications,
# and the functions module (as F) for column expressions
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Obtain a SparkSession named for this exercise
spark = (
    SparkSession.builder
    .appName("Window Function Contoh")
    .getOrCreate()
)

# Employee records as plain tuples: (id, nama, umur, departemen, gaji)
data = [
    (1, "Dika", 23, "IT", 5000000),
    (2, "Dario", 30, "HR", 4500000),
    (3, "Eli", 27, "IT", 6000000),
    (4, "Mika", 22, "Finance", 4000000),
    (5, "Harfa", 35, "Finance", 7000000),
]

# Build the DataFrame with explicit column names
df = spark.createDataFrame(
    data,
    schema=["id", "nama", "umur", "departemen", "gaji"],
)

print("Isi DataFrame:")
df.show()

# Running salary total per department: rows are partitioned by department
# and ordered by id, so the sum accumulates in id order within each one
windowSpecRunning = Window.partitionBy("departemen").orderBy("id")
df_running = df.withColumn(
    "running_total_gaji",
    F.sum(F.col("gaji")).over(windowSpecRunning),
)

print("Running total gaji per departemen:")
df_running.show()

# Salary rank within each department, highest salary first
windowSpecRank = Window.partitionBy("departemen").orderBy(F.col("gaji").desc())
df_rank = df.withColumn(
    "rank_gaji",
    F.rank().over(windowSpecRank),
)

print("Ranking karyawan berdasarkan gaji dalam departemen:")
df_rank.show()


Isi DataFrame:
+---+-----+----+----------+-------+
| id| nama|umur|departemen|   gaji|
+---+-----+----+----------+-------+
|  1| Dika|  23|        IT|5000000|
|  2|Dario|  30|        HR|4500000|
|  3|  Eli|  27|        IT|6000000|
|  4| Mika|  22|   Finance|4000000|
|  5|Harfa|  35|   Finance|7000000|
+---+-----+----+----------+-------+

Running total gaji per departemen:
+---+-----+----+----------+-------+------------------+
| id| nama|umur|departemen|   gaji|running_total_gaji|
+---+-----+----+----------+-------+------------------+
|  4| Mika|  22|   Finance|4000000|           4000000|
|  5|Harfa|  35|   Finance|7000000|          11000000|
|  2|Dario|  30|        HR|4500000|           4500000|
|  1| Dika|  23|        IT|5000000|           5000000|
|  3|  Eli|  27|        IT|6000000|          11000000|
+---+-----+----+----------+-------+------------------+

Ranking karyawan berdasarkan gaji dalam departemen:
+---+-----+----+----------+-------+---------+
| id| nama|umur|departemen|   g

In [39]:
# Task 5
import kagglehub

# Kaggle dataset identifier to download
dataset_handle = "zadafiyabhrami/global-crocodile-species-dataset"

# Download the dataset and keep the folder path it was stored in
path = kagglehub.dataset_download(dataset_handle)
print("Path ke file dataset:", path)


Using Colab cache for faster access to the 'global-crocodile-species-dataset' dataset.
Path ke file dataset: /kaggle/input/global-crocodile-species-dataset


In [40]:
import os

# List the entries of the downloaded dataset folder (`path` comes from the
# download step that ran earlier)
dataset_entries = os.listdir(path)
print("Isi folder dataset:")
print(dataset_entries)


Isi folder dataset:
['crocodile_dataset.csv']


In [42]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain a SparkSession named for this exploration
spark = SparkSession.builder \
    .appName("Eksplorasi Crocodile Dataset") \
    .getOrCreate()

# Locate the CSV inside the dataset folder. `path` is the folder returned by
# the kagglehub download step; os.path.join avoids hand-built separators
# (and a doubled slash if `path` ever ends with one).
csv_file = os.path.join(path, "crocodile_dataset.csv")

# First row is the header; column types are inferred from the data
df = spark.read.csv(csv_file, header=True, inferSchema=True)

print("=== Jumlah Baris & Kolom ===")
print("Jumlah baris:", df.count())
print("Jumlah kolom:", len(df.columns))

print("\n=== Skema DataFrame ===")
df.printSchema()

print("\n=== 5 Data Teratas ===")
df.show(5, truncate=False)

# Filter rows by location (Country/Region column)
# NOTE(review): this is an exact match on "Africa"; the column name suggests
# country-level values, so this may return no rows — confirm against the data.
print("\n=== Data dari Africa ===")
df.filter(df["Country/Region"] == "Africa").show(5)

# Aggregate: number of observations (rows) per Genus
print("\n=== Jumlah Observasi per Genus ===")
df.groupBy("Genus").count().show()

# Aggregate: mean observed length per Genus
print("\n=== Rata-rata Panjang Buaya (m) per Genus ===")
df.groupBy("Genus").agg(F.avg("Observed Length (m)").alias("Rata2_Panjang")).show()

# Aggregate: mean observed weight per Habitat Type
print("\n=== Rata-rata Berat Buaya (kg) per Habitat ===")
df.groupBy("Habitat Type").agg(F.avg("Observed Weight (kg)").alias("Rata2_Berat")).show()

# Derived column: LengthBonus is 10% of the observed length
print("\n=== Tambahkan Kolom LengthBonus (10% dari panjang) ===")
df2 = df.withColumn("LengthBonus", df["Observed Length (m)"] * 0.1)
df2.select("Genus", "Observed Length (m)", "LengthBonus").show(5)


=== Jumlah Baris & Kolom ===
Jumlah baris: 1000
Jumlah kolom: 15

=== Skema DataFrame ===
root
 |-- Observation ID: integer (nullable = true)
 |-- Common Name: string (nullable = true)
 |-- Scientific Name: string (nullable = true)
 |-- Family: string (nullable = true)
 |-- Genus: string (nullable = true)
 |-- Observed Length (m): double (nullable = true)
 |-- Observed Weight (kg): double (nullable = true)
 |-- Age Class: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Date of Observation: string (nullable = true)
 |-- Country/Region: string (nullable = true)
 |-- Habitat Type: string (nullable = true)
 |-- Conservation Status: string (nullable = true)
 |-- Observer Name: string (nullable = true)
 |-- Notes: string (nullable = true)


=== 5 Data Teratas ===
+--------------+----------------------------------+----------------------+------------+----------+-------------------+--------------------+---------+-------+-------------------+--------------+----------------+------