In [5]:
# Tugas 1 Buat DataFrame sederhana dan Operasi dasar
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Profesi').getOrCreate()

data = [('Ridwan', 'Gamer', 1000),
        ('Mila', 'Guru', 2000),
        ('Fauzan', 'Dokter', 5200),
        ('Ilham', 'Karyawan', 4000),
        ('Gayuh', 'Sales', 3000)]
columns = ['EmployeeName', 'Profesi', 'Salary']

df = spark.createDataFrame(data, schema=columns)
df.show()

+------------+--------+------+
|EmployeeName| Profesi|Salary|
+------------+--------+------+
|      Ridwan|   Gamer|  1000|
|        Mila|    Guru|  2000|
|      Fauzan|  Dokter|  5200|
|       Ilham|Karyawan|  4000|
|       Gayuh|   Sales|  3000|
+------------+--------+------+



In [7]:
# Tugas 2 Gunakan Operasi filter, select, groupBy
# Operasi Transformasi DataFrame
df.select('EmployeeName', 'Salary').show()
df.filter(df['Salary'] > 3000).show()
df.groupBy('Profesi').avg('Salary').show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|      Ridwan|  1000|
|        Mila|  2000|
|      Fauzan|  5200|
|       Ilham|  4000|
|       Gayuh|  3000|
+------------+------+

+------------+--------+------+
|EmployeeName| Profesi|Salary|
+------------+--------+------+
|      Fauzan|  Dokter|  5200|
|       Ilham|Karyawan|  4000|
+------------+--------+------+

+--------+-----------+
| Profesi|avg(Salary)|
+--------+-----------+
|    Guru|     2000.0|
|   Gamer|     1000.0|
|   Sales|     3000.0|
|  Dokter|     5200.0|
|Karyawan|     4000.0|
+--------+-----------+



In [8]:
# Tugas 3 Eksplorasi bagaimana mengolah tipe data kompleks
# Manipulasi tipe data kompleks
df_with_bonus = df.withColumn('SalaryBonus', df['Salary'] * 0.1)
df_with_bonus.withColumn('TotalCompensation', df_with_bonus['Salary'] + df_with_bonus['SalaryBonus']).show()

+------------+--------+------+-----------+-----------------+
|EmployeeName| Profesi|Salary|SalaryBonus|TotalCompensation|
+------------+--------+------+-----------+-----------------+
|      Ridwan|   Gamer|  1000|      100.0|           1100.0|
|        Mila|    Guru|  2000|      200.0|           2200.0|
|      Fauzan|  Dokter|  5200|      520.0|           5720.0|
|       Ilham|Karyawan|  4000|      400.0|           4400.0|
|       Gayuh|   Sales|  3000|      300.0|           3300.0|
+------------+--------+------+-----------+-----------------+



In [10]:
# Tugas 4 Implementasikan window function untuk menghitung running totals atau rangkings.
# Menggunakan windows function
from pyspark.sql.window import Window
from pyspark.sql import functions as F

windowSpec = Window.partitionBy('Profesi').orderBy('Salary')
df.withColumn('Rank', F.rank().over(windowSpec)).show()

+------------+--------+------+----+
|EmployeeName| Profesi|Salary|Rank|
+------------+--------+------+----+
|      Fauzan|  Dokter|  5200|   1|
|      Ridwan|   Gamer|  1000|   1|
|        Mila|    Guru|  2000|   1|
|       Ilham|Karyawan|  4000|   1|
|       Gayuh|   Sales|  3000|   1|
+------------+--------+------+----+



Tugas 5:

Unduh dataset besar dari Kaggle atau sumber lainnya.
Input data csv yang telah di download, kemudian load dan simpan data ke dalam pyspark.
Setelah data berhasil di load menggunakan pyspark, lakukan manipulasi data untuk memperoleh informasi yang dibutuhkan

In [11]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [13]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/abiyyurasyiq/mobile-legends-google-play-reviews")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: Muhamad Ridwansah
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/abiyyurasyiq/mobile-legends-google-play-reviews
Downloading mobile-legends-google-play-reviews.zip to ./mobile-legends-google-play-reviews


100%|██████████| 3.54M/3.54M [00:00<00:00, 403MB/s]







In [16]:
# Load Dataset
from pyspark.sql import SparkSession

# Buat SparkSession
spark = SparkSession.builder.appName("mobile_legends_reviews").getOrCreate()

# Load dataset
df = spark.read.csv("/content/mobile-legends-google-play-reviews/mobile_legends_reviews.csv", header=True, inferSchema=True)

# Lihat schema dan 5 data awal
df.printSchema()
df.show(5)

root
 |-- reviewId: string (nullable = true)
 |-- userName: string (nullable = true)
 |-- content: string (nullable = true)
 |-- score: string (nullable = true)
 |-- thumbsUpCount: string (nullable = true)
 |-- at: string (nullable = true)

+--------------------+-------------+--------------------+-----+-------------+-------------------+
|            reviewId|     userName|             content|score|thumbsUpCount|                 at|
+--------------------+-------------+--------------------+-----+-------------+-------------------+
|d9b3706c-a29d-466...|A Google user|Love this game. H...|    5|            0|2025-06-10 13:20:23|
|cc717825-ea2b-444...|A Google user|the graphics? 10/...|    5|            0|2025-06-10 13:19:39|
|f3c5353d-65f8-486...|A Google user|Horrible. your ma...|    1|            0|2025-06-10 13:18:52|
|5d2611d3-dfda-41b...|A Google user|dark system. fix ...|    1|            0|2025-06-10 13:15:51|
|38f96ac5-c264-404...|A Google user|This game has a s...|    1|          

In [17]:
# Menampilkan jumlah data
print("Jumlah data:", df.count())

Jumlah data: 52658


In [20]:
# Nilai rata-rata score dan thumbsUpCount
df.selectExpr(
    "avg(cast(score as int)) as rata_score",
    "avg(cast(thumbsUpCount as int)) as rata_thumbsUp"
).show()

+------------------+-----------------+
|        rata_score|    rata_thumbsUp|
+------------------+-----------------+
|3.1614073960880194|4.706089706947144|
+------------------+-----------------+



In [28]:
from pyspark.sql import functions as F

df = df.withColumn("score", F.col("score").cast("int"))
df = df.withColumn("thumbsUpCount", F.col("thumbsUpCount").cast("int"))

# Rata-rata skor review berdasarkan userName
df.groupBy("userName") \
  .agg(
      F.avg("score").alias("avg_score"),
      F.avg("thumbsUpCount").alias("avg_thumbsUp")
  ).show(10)

# Rata-rata skor review berdasarkan tanggal (at)
df.groupBy("at") \
  .agg(
      F.avg("score").alias("avg_score"),
      F.avg("thumbsUpCount").alias("avg_thumbsUp")
  ).show(10)


+--------------------+-----------------+-----------------+
|            userName|        avg_score|     avg_thumbsUp|
+--------------------+-----------------+-----------------+
|                   3|             NULL|             NULL|
|                NULL|             NULL|             NULL|
|       A Google user|3.163799594555459|4.461165142031777|
|                   2|             NULL|             NULL|
| Angelica Ann Dingal|              1.0|              0.0|
| John marvin Bordado|              3.0|              0.0|
|       David Naldoza|              1.0|              1.0|
|     thanura rukshan|              1.0|              1.0|
|Ziv Nathan De castro|              1.0|              0.0|
|           Pepsi Man|              5.0|              0.0|
+--------------------+-----------------+-----------------+
only showing top 10 rows

+-------------------+---------+------------+
|                 at|avg_score|avg_thumbsUp|
+-------------------+---------+------------+
|2025-06-10 1

In [29]:
from pyspark.sql import functions as F

df = df.withColumn("score", F.col("score").cast("int"))
df = df.withColumn("thumbsUpCount", F.col("thumbsUpCount").cast("int"))

df = df.withColumn(
    "sentiment",
    F.when(F.col("score") >= 3, "positif").otherwise("negatif")
)

# Hitung rata-rata berdasarkan positif negatif
df.groupBy("sentiment") \
  .agg(
      F.avg("score").alias("avg_score"),
      F.avg("thumbsUpCount").alias("avg_thumbsUp")
  ).show()

+---------+------------------+-----------------+
|sentiment|         avg_score|     avg_thumbsUp|
+---------+------------------+-----------------+
|  negatif|1.0929371307644828|6.596922672277748|
|  positif| 4.742703067071115|3.252308729356252|
+---------+------------------+-----------------+

