In [1]:
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive can be read and written from this notebook.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install delta-spark==3.2.0 -q
import pyspark
from delta import *
from pyspark.sql.functions import *

# Create a SparkSession with Delta Lake extensions
# The '.config(...)' lines are crucial for enabling Delta Lake's features
builder = pyspark.sql.SparkSession.builder.appName("DeltaTutorial") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Get or create the SparkSession
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark and Delta Lake are ready!")

Spark and Delta Lake are ready!


In [3]:
# Folder on the mounted Drive that holds the input CSV files.
WEEK4_DIR = "/content/drive/MyDrive/data engineering/Week 4"


def read_csv_with_header(path):
    """Read a CSV file into a Spark DataFrame.

    The first row is treated as the header and Spark infers column types.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by ``spark.read``.
    """
    return (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(path)
    )


# Cleaned student-progress file
file_path = f"{WEEK4_DIR}/cleaned_students_progress.csv"
df = read_csv_with_header(file_path)
df.show()

# Students file
file_path_s = f"{WEEK4_DIR}/students.csv"
df_s = read_csv_with_header(file_path_s)
df_s.show()

# Courses file (shown without truncating long values)
file_path_c = f"{WEEK4_DIR}/courses.csv"
df_c = read_csv_with_header(file_path_c)
df_c.show(truncate=False)

+-----------+-------------+---------------------+------------+----------+---------+---------------+
|progress_id|enrollment_id|completion_percentage|last_updated|student_id|course_id|enrollment_date|
+-----------+-------------+---------------------+------------+----------+---------+---------------+
|          1|            1|                 85.0|  2024-02-01|         1|      101|     2024-01-10|
|          2|            2|                 50.0|  2024-02-02|         2|      102|     2024-01-30|
|          3|            3|                 50.0|  2024-02-03|         3|      103|     2024-01-15|
|          4|            4|                100.0|  2024-02-15|         4|      101|     2024-01-18|
|          5|            5|                 70.0|  2024-02-05|         5|      102|     2024-01-20|
+-----------+-------------+---------------------+------------+----------+---------+---------------+

+----------+------------+----------------+---------------+
|student_id|student_name|   student_emai

In [5]:
# Step 1: attach student details to every progress row (inner join on student_id).
merged_df = df.join(df_s, "student_id", "inner")
merged_df.show()

# Step 2: attach course details to the result (inner join on course_id).
final_df = merged_df.join(df_c, "course_id", "inner")
print("------------\n")
final_df.show()

+----------+-----------+-------------+---------------------+------------+---------+---------------+------------+----------------+---------------+
|student_id|progress_id|enrollment_id|completion_percentage|last_updated|course_id|enrollment_date|student_name|   student_email|student_college|
+----------+-----------+-------------+---------------------+------------+---------+---------------+------------+----------------+---------------+
|         1|          1|            1|                 85.0|  2024-02-01|      101|     2024-01-10|        AMIT|  amit@gmail.com|            KCT|
|         2|          2|            2|                 50.0|  2024-02-02|      102|     2024-01-30|       ROHIT| rohit@gmail.com|           MCET|
|         3|          3|            3|                 50.0|  2024-02-03|      103|     2024-01-15|      LOKESH|lokesh@gmail.com|            KPR|
|         4|          4|            4|                100.0|  2024-02-15|      101|     2024-01-18|       RAMYA| ramya@gmail

In [7]:
# Keep only the columns needed for the final report, in display order.
report_columns = [
    "student_name",
    "course_name",
    "enrollment_date",
    "completion_percentage",
]
final_df2 = final_df.select(*report_columns)

print("------------ FINAL DF -------------\n")
final_df2.show(truncate=False)

------------ FINAL DF -------------

+------------+-----------+---------------+---------------------+
|student_name|course_name|enrollment_date|completion_percentage|
+------------+-----------+---------------+---------------------+
|AMIT        |FSD        |2024-01-10     |85.0                 |
|ROHIT       |EMBEDDED   |2024-01-30     |50.0                 |
|LOKESH      |ML         |2024-01-15     |50.0                 |
|RAMYA       |FSD        |2024-01-18     |100.0                |
|PRIYA       |EMBEDDED   |2024-01-20     |70.0                 |
+------------+-----------+---------------+---------------------+



In [10]:
# Output locations on the mounted Drive.
drive_csv_path = "/content/drive/MyDrive/data engineering/Week 4/final_df2.csv"
drive_delta_path = "/content/drive/MyDrive/data engineering/Week 4/final_df2_delta"

# CSV export: collapse to a single partition first, replace any previous
# output, and include a header row.
csv_writer = (
    final_df2.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
)
csv_writer.csv(drive_csv_path)

# Delta export: replace any previous table data at the target path.
delta_writer = final_df2.write.format("delta").mode("overwrite")
delta_writer.save(drive_delta_path)


In [12]:
from google.colab import files
import glob, shutil

# Path where Spark saved the CSV output folder
drive_csv_path = "/content/drive/MyDrive/data engineering/Week 4/final_df2.csv"

# Find the actual part file inside the folder. Sorting makes the choice
# deterministic if more than one part file is ever present.
csv_files = sorted(glob.glob(drive_csv_path + "/part-*.csv"))
if not csv_files:
    # Fail with a clear message instead of an IndexError on csv_files[0].
    raise FileNotFoundError(
        f"No part-*.csv file found in {drive_csv_path!r}; "
        "run the cell that writes the CSV output first."
    )

# Copy to Colab local storage with a nice name
local_csv = "/content/final_df2.csv"
shutil.copy(csv_files[0], local_csv)

# Download to your system
files.download(local_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>