In [1]:
!pip install pyspark



In [2]:
!pip install pandas



In [2]:
# Initialise findspark before pyspark is imported.
import findspark
findspark.init()

# Example: build a small DataFrame and display it.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("HandsOnPertemuan6")
    .getOrCreate()
)

# Column names, followed by one (name, department, salary) tuple per employee.
columns = ["EmployeeName", "Department", "Salary"]
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
]

df = spark.createDataFrame(data, schema=columns)
df.show()


+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       James|     Sales|  3000|
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
|       Maria|   Finance|  3000|
+------------+----------+------+



In [3]:
# Example DataFrame transformations: select, filter, group-by and aggregate.
#
# Spark's aggregate functions are accessed through the `F` namespace instead of
# being imported by bare name: `from pyspark.sql.functions import max, sum, min`
# would replace Python's built-in max/sum/min for every cell that runs afterwards.
from pyspark.sql import functions as F

# NOTE(review): the labels "Rata=rata" and "Salarry" below look like typos; they
# are left unchanged so the recorded output of this cell still matches.

# Project only the employee name and salary columns.
print("Data Tabel Pegawai dan Salary")
df_select = df.select('EmployeeName', 'Salary')
df_select.show()

# Keep only the rows whose salary is greater than 3000.
print("Data Pegawai dengan Salary lebih dari 3000")
df_filter = df.filter(df['Salary'] > 3000)
df_filter.show()

# Average salary for each department.
print("Rata=rata Salary Untuk Tiap Departement")
df_groupBy = df.groupBy('Department').avg('Salary')
df_groupBy.show()

# Per-department summary: mean, highest, lowest and total salary.
print("Insight Dataset")
ringkasan_df = df.groupBy('Department').agg(
    F.mean("Salary").alias("Rata-rata salary"),
    F.max("Salary").alias("Salarry Tertinggi"),
    F.min("Salary").alias("Salary Terendah"),
    F.sum("Salary").alias("Total Salary")
)
ringkasan_df.show()

Data Tabel Pegawai dan Salary
+------------+------+
|EmployeeName|Salary|
+------------+------+
|       James|  3000|
|     Michael|  4600|
|      Robert|  4100|
|       Maria|  3000|
+------------+------+

Data Pegawai dengan Salary lebih dari 3000
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
+------------+----------+------+

Rata=rata Salary Untuk Tiap Departement
+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3900.0|
|   Finance|     3000.0|
+----------+-----------+

Insight Dataset
+----------+----------------+-----------------+---------------+------------+
|Department|Rata-rata salary|Salarry Tertinggi|Salary Terendah|Total Salary|
+----------+----------------+-----------------+---------------+------------+
|     Sales|          3900.0|             4600|           3000|       11700|
|   Finance|          3000.0|       

In [8]:
# Example: add derived columns (a salary bonus and the total compensation).
#
# The result is stored under new names instead of rebinding `df`, so the
# employee DataFrame seen by the other cells does not depend on whether
# (or in which order) this cell has been executed.

# Fraction of the salary paid out as a bonus.
BONUS_RATE = 0.1

df_with_bonus = df.withColumn('SalaryBonus', df['Salary'] * BONUS_RATE)
df_compensation = df_with_bonus.withColumn(
    'TotalCompensation',
    df_with_bonus['Salary'] + df_with_bonus['SalaryBonus'],
)
df_compensation.show()

+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|       James|     Sales|  3000|      300.0|           3300.0|
|     Michael|     Sales|  4600|      460.0|           5060.0|
|      Robert|     Sales|  4100|      410.0|           4510.0|
|       Maria|   Finance|  3000|      300.0|           3300.0|
+------------+----------+------+-----------+-----------------+



In [4]:
# Example: use a window function to rank employees within each department.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# One partition per department; rows inside a partition are ordered by salary.
windowSpec = Window.partitionBy('Department').orderBy('Salary')

# Attach the rank as a new column and display the result.
ranked_df = df.withColumn('Rank', F.rank().over(windowSpec))
ranked_df.show()

+------------+----------+------+----+
|EmployeeName|Department|Salary|Rank|
+------------+----------+------+----+
|       Maria|   Finance|  3000|   1|
|       James|     Sales|  3000|   1|
|      Robert|     Sales|  4100|   2|
|     Michael|     Sales|  4600|   3|
+------------+----------+------+----+



In [5]:
import os

from pyspark.sql import SparkSession

# getOrCreate() returns the session that is already running when there is one,
# so this line also works when the cell is executed on its own in a fresh kernel.
spark = SparkSession.builder.appName("Datakaggle").getOrCreate()

# Location of the CSV file to read.
# Set the ONLINEFOODS_CSV environment variable to point at your own copy of the
# dataset; when it is not set, the original hard-coded location is used.
file_path = os.environ.get(
    "ONLINEFOODS_CSV",
    "/C:/Users/Najmi Mukia Barkah/Documents/onlinefoods.csv",
)

# The first row holds the column names; column types are inferred from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

df.show()

+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive| Yes|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive| Yes|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative | Yes|
| 22|Female|        Single|       Student|      No Income|                  Graduate|          6| 12.9473|  77.5

In [7]:
# Print each column of the loaded DataFrame with its type and nullability.
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Monthly Income: string (nullable = true)
 |-- Educational Qualifications: string (nullable = true)
 |-- Family size: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Pin code: integer (nullable = true)
 |-- Output: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- _c12: string (nullable = true)



In [8]:
# Pivot example: one row per marital status, one column per gender value,
# each cell holding the sum of "Pin code" for that combination.
# NOTE(review): "Pin code" looks like a postal identifier, so its sum is
# probably not a meaningful metric — confirm which column was intended.
pin_code_by_gender = (
    df.groupBy("Marital Status")
    .pivot("Gender")
    .sum("Pin code")
)
pin_code_by_gender.show()

+-----------------+--------+--------+
|   Marital Status|  Female|    Male|
+-----------------+--------+--------+
|Prefer not to say| 2800091| 3920444|
|          Married|27441934|33042912|
|           Single|62724419|87365764|
+-----------------+--------+--------+

