<a href="https://colab.research.google.com/github/JabbarHakim/BIG-DATA/blob/main/Big_Data_(Data_Processing_with_ApacheSpark).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Example creating Simple and Basic Operation DataFrame

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("HandsOnMeeting3").getOrCreate()

data = [("Mike", "Sales", 3000),
        ("Ray", "Finance", 3000),
        ("Liam", "Sales", 4100),
        ("Reinhart", "Sales", 4600)]

columns = ["EmployeeName", "Department", "Salary"]

df = spark.createDataFrame(data, schema=columns)
df.show()

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|        Mike|     Sales|  3000|
|         Ray|   Finance|  3000|
|        Liam|     Sales|  4100|
|    Reinhart|     Sales|  4600|
+------------+----------+------+



In [None]:
# Example Transformation Operation DataFrame

df.select("EmployeeName", "Salary").show()
df.filter(df["Salary"] > 3000).show
df.groupBy("Department").mean("Salary").show()
df.groupBy("Department").max("Salary").show()
df.groupBy("Department").sum("Salary").show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|        Mike|  3000|
|         Ray|  3000|
|        Liam|  4100|
|    Reinhart|  4600|
+------------+------+

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3900.0|
|   Finance|     3000.0|
+----------+-----------+

+----------+-----------+
|Department|max(Salary)|
+----------+-----------+
|     Sales|       4600|
|   Finance|       3000|
+----------+-----------+

+----------+-----------+
|Department|sum(Salary)|
+----------+-----------+
|     Sales|      11700|
|   Finance|       3000|
+----------+-----------+



In [None]:
# Example manipulate complex data type
df = df.withColumn("SalaryBonus", df["Salary"] * 0.1)

df = df.withColumn("TotalCompensation", df["Salary"] + df["SalaryBonus"])

df.show()

+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|        Mike|     Sales|  3000|      300.0|           3300.0|
|         Ray|   Finance|  3000|      300.0|           3300.0|
|        Liam|     Sales|  4100|      410.0|           4510.0|
|    Reinhart|     Sales|  4600|      460.0|           5060.0|
+------------+----------+------+-----------+-----------------+



In [None]:
# Example using Window Functions

from pyspark.sql import Window
from pyspark.sql import functions as F

WindowSpec = Window.partitionBy("Department").orderBy("Salary")
df.withColumn("Rank", F.rank().over(WindowSpec)).show


In [33]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("IMDBProcessedDataset").getOrCreate()

csv_file_path = "/IMDB_processed_data.csv"

df = spark.read.csv(csv_file_path, header=True)

release_df = df.filter(df.Release == 2023)

print("Showing 20 Movies")
df.show(20)

print("\nShowing Movies Released in 2023")
release_df.show(df.count())


Showing 20 Movies
+----+--------------------+-------+-------+--------+-------+
|Rank|               Title|Release|Runtime|   Rated|Ratings|
+----+--------------------+-------+-------+--------+-------+
|   1|The Shawshank Red...|   1994| 2h 22m|       R|    9.3|
|   2|       The Godfather|   1972| 2h 55m|       R|    9.2|
|   3|     The Dark Knight|   2008| 2h 32m|   PG-13|      9|
|   4|The Godfather Par...|   1974| 3h 22m|       R|      9|
|   5|        12 Angry Men|   1957| 1h 36m|Approved|      9|
|   6|The Lord of the R...|   2003| 3h 21m|   PG-13|      9|
|   7|    Schindler's List|   1993| 3h 15m|       R|      9|
|   8|        Pulp Fiction|   1994| 2h 34m|       R|    8.9|
|   9|The Lord of the R...|   2001| 2h 58m|   PG-13|    8.9|
|  10|The Good, the Bad...|   1966| 2h 58m|       R|    8.8|
|  11|        Forrest Gump|   1994| 2h 22m|   PG-13|    8.8|
|  12|The Lord of the R...|   2002| 2h 59m|   PG-13|    8.8|
|  13|          Fight Club|   1999| 2h 19m|       R|    8.8|
|  14|