**Schema Creation:**  
A new schema, `Jeevan`, was created in the `kusha_solutions` Unity Catalog catalog to organize all data tables.

**Volume Creation:**  
A dedicated volume was established within the schema to store raw CSV files.

**Data Generation Using Faker:**  
The Faker Python library was utilized to generate synthetic student-related data, with intentional introduction of null values and duplicate records for subsequent data cleaning exercises.

**CSV File Creation:**  
Four interlinked CSV files (`students.csv`, `courses.csv`, `enrollments.csv`, and `results.csv`) were generated to represent a student database system.

**Data Storage:**  
All CSV files were stored in the Unity Catalog volume, each in its own subfolder (`students/`, `courses/`, `enrollments/`, `results/`) under:  
`/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/`

In [0]:
# Make sure the 'Jeevan' schema is present in the 'kusha_solutions' catalog.
# IF NOT EXISTS turns the statement into a no-op when the schema is already there.
create_schema_sql = """
    CREATE SCHEMA IF NOT EXISTS kusha_solutions.Jeevan
"""
spark.sql(create_schema_sql)

In [0]:
# Make sure the 'My_volume' volume is present in schema 'Jeevan' of catalog 'kusha_solutions'.
# IF NOT EXISTS turns the statement into a no-op when the volume is already there.
create_volume_sql = """
    CREATE VOLUME IF NOT EXISTS kusha_solutions.Jeevan.My_volume
    """
spark.sql(create_volume_sql)

In [0]:
# ======================================================
# STEP 1: Install and import required libraries
# ======================================================
%pip install faker
from faker import Faker  # Library to generate fake data
import pandas as pd      # For data manipulation and CSV writing
import random            # For random selections
import os                # For file path operations

# Initialize Faker instance for generating fake data
fake = Faker()

# Directory inside the Unity Catalog volume that receives the students CSV
volume_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/students/"

# pandas' to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(volume_path, exist_ok=True)

# ======================================================
# STEP 2: Generate fake students data
# ======================================================
# 500 records with sequential IDs 0-499. 'age' and 'city' are each drawn from
# a two-element list [value, None], so roughly half of them are null on purpose.
students = [
    {
        "student_id": i,  # Unique student ID
        "name": fake.name(),  # Randomly generated name
        "email": fake.email(),  # Randomly generated email
        "age": random.choice([random.randint(18, 25), None]),  # Random age or None
        "city": random.choice([fake.city(), None]),  # Random city or None
        "department": random.choice(["CSE", "ECE", "MECH", "CIVIL", "EEE"]),  # Random department
    }
    for i in range(500)
]

# Convert the list of students to a DataFrame
df = pd.DataFrame(students)

# Append a 5% random sample (drawn with replacement) as duplicate rows
df = pd.concat([df, df.sample(frac=0.05, replace=True)], ignore_index=True)

# Define the output CSV file name
file_name = "students.csv"

# Build the full output path; os.path.join avoids the doubled slash that
# f"{volume_path}/{file_name}" produced with the trailing "/" in volume_path.
output_path = os.path.join(volume_path, file_name)

# Save the DataFrame as a CSV file at the specified path (no index column)
df.to_csv(output_path, index=False)

# Print confirmation messages with file path and row count
print(f"✅ File '{file_name}' created successfully at: {output_path}")
print(f"🧾 Total Rows: {len(df)} (includes nulls + duplicates)")

In [0]:
# Load students.csv from the Unity Catalog volume into a Spark DataFrame.
# The 'header' option takes column names from the first row and
# 'inferSchema' asks Spark to infer the column types from the data.
students_csv_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/students/students.csv"
students_df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(students_csv_path)
)

# Render the loaded rows as a table in the notebook
display(students_df)

In [0]:
# ======================================================
# STEP 1: Install and import required libraries
# ======================================================
%pip install faker
from faker import Faker  # Library to generate fake data (not used here, but for consistency)
import pandas as pd      # For data manipulation and CSV writing
import random            # For random selections
import os                # For file path operations

# Initialize Faker instance (not used in this cell, but for consistency)
fake = Faker()

# ======================================================
# STEP 2: Define the path to save the courses CSV file in Unity Catalog Volume
# ======================================================
volume_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/courses/"

# pandas' to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(volume_path, exist_ok=True)

# ======================================================
# STEP 3: Create a list of course dictionaries
# ======================================================
courses = [
    {"course_id": "C101", "course_name": "Python", "credits": 3},
    {"course_id": "C102", "course_name": "Data Science", "credits": 4},
    {"course_id": "C103", "course_name": "AI & ML", "credits": 4},
    {"course_id": "C104", "course_name": "Database Systems", "credits": 3},
    {"course_id": "C105", "course_name": "Computer Networks", "credits": 3},
]

# Convert the list of courses to a DataFrame
df = pd.DataFrame(courses)

# Add ~10% duplicate rows to the DataFrame.
# sample(frac=0.1) rounds frac * len(df) to a whole number of rows, and with
# only 5 courses that is round(0.5) == 0, i.e. no duplicates at all. Compute
# the row count explicitly and force at least one duplicate.
duplicate_fraction = 0.1
num_duplicates = max(1, round(len(df) * duplicate_fraction))
df = pd.concat([df, df.sample(n=num_duplicates, replace=True)], ignore_index=True)

# Define the output CSV file name
file_name = "courses.csv"

# Build the full output path; os.path.join avoids the doubled slash that
# f"{volume_path}/{file_name}" produced with the trailing "/" in volume_path.
output_path = os.path.join(volume_path, file_name)

# Save the DataFrame as a CSV file at the specified path (no index column)
df.to_csv(output_path, index=False)

# Print confirmation messages with file path and row count.
# The course list contains no null values, so only duplicates are mentioned.
print(f"✅ File '{file_name}' created successfully at: {output_path}")
print(f"🧾 Total Rows: {len(df)} (includes duplicates)")

In [0]:
# ======================================================
# STEP 1: Import required libraries for data generation and file operations
# ======================================================
from faker import Faker
import pandas as pd
import random
import os

fake = Faker()

# ======================================================
# STEP 2: Set the path where the enrollments CSV will be saved
# ======================================================
volume_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/enrollments"
os.makedirs(volume_path, exist_ok=True)  # Create the directory if it doesn't exist

# ======================================================
# STEP 3: Load existing students and courses data from CSV files
# ======================================================
students_df = pd.read_csv("/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/students/students.csv")
courses_df = pd.read_csv("/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/courses/courses.csv")

# ======================================================
# STEP 4: Generate fake enrollments data with unique enrollment IDs
# ======================================================

# NOTE(review): range(1, 550) yields 549 enrollments (IDs 1-549); confirm
# whether 550 rows were intended.
enrollment_ids = range(1, 550)
num_enrollments = len(enrollment_ids)

# Draw every foreign key up front with one vectorized sample per table instead
# of calling DataFrame.sample(1) twice per row. Sampling is with replacement,
# so each enrollment still picks its student and course independently from all
# rows of the source files (including their duplicated rows).
student_ids = students_df["student_id"].sample(n=num_enrollments, replace=True).tolist()
course_ids = courses_df["course_id"].sample(n=num_enrollments, replace=True).tolist()

enrollments = [
    {
        "enrollment_id": enrollment_id,  # Unique sequential enrollment ID
        "student_id": student_id,  # Reference to student
        "course_id": course_id,  # Reference to course
        "enroll_date": fake.date_this_year().strftime("%Y-%m-%d"),  # Random enrollment date in current year
        "status": random.choice(["Active", "Completed", "Dropped", None]),  # Random status, some nulls
    }
    for enrollment_id, student_id, course_id in zip(enrollment_ids, student_ids, course_ids)
]

df = pd.DataFrame(enrollments)

# Add 5% duplicate rows to simulate real-world data issues
df = pd.concat([df, df.sample(frac=0.05, replace=True)], ignore_index=True)

# ======================================================
# STEP 5: Save the generated enrollments data as a CSV file
# ======================================================
file_name = "enrollments.csv"
output_path = os.path.join(volume_path, file_name)

df.to_csv(output_path, index=False)  # Write DataFrame to CSV (no index column)
print(f"✅ CSV '{file_name}' created successfully at: {output_path}")
print(f"🧾 Total Rows: {len(df)} (includes nulls + duplicates)")

In [0]:
# Load enrollments.csv from the Unity Catalog volume into a Spark DataFrame.
# The 'header' option takes column names from the first row and
# 'inferSchema' asks Spark to infer the column types from the data.
enrollments_csv_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/enrollments/enrollments.csv"
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(enrollments_csv_path)
)

# Render the loaded rows as a table in the notebook
display(df)

In [0]:
# Load courses.csv from the Unity Catalog volume into a Spark DataFrame.
# The 'header' option takes column names from the first row and
# 'inferSchema' asks Spark to infer the column types from the data.
courses_csv_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/courses/courses.csv"
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(courses_csv_path)
)

# Render the loaded rows as a table in the notebook
display(df)

In [0]:
# ======================================================
# STEP 1: Import libraries needed for data generation and file operations
# ======================================================
%pip install faker
from faker import Faker
import pandas as pd
import random
import os

fake = Faker()

# ======================================================
# STEP 2: Define the path where the results CSV will be saved in the Unity Catalog volume
# ======================================================
volume_path = "/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/results"

# pandas' to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(volume_path, exist_ok=True)

# ======================================================
# STEP 3: Read the enrollments CSV file to get existing enrollment records
# ======================================================
enrollments_df = pd.read_csv("/Volumes/kusha_solutions/jeevan/my_volume/csv_data/raw/enrollments/enrollments.csv")

# ======================================================
# STEP 4: Generate a fake results dataset linked to enrollments
# ======================================================
num_results = 400  # Number of result records to generate

# Pick num_results distinct rows of the enrollments file (no replacement).
# That file itself contains duplicated rows, so the same enrollment_id can
# still show up more than once here.
sampled_enrollments = enrollments_df.sample(num_results, replace=False)

# One result per sampled enrollment, numbered from 1. 'marks' is drawn from
# [value, None] and 'grade' from a list that includes None, so both columns
# contain nulls on purpose; marks and grade are chosen independently.
results = [
    {
        "result_id": result_id,  # Sequential unique result ID
        "enrollment_id": row.enrollment_id,  # Reference to enrollment
        "marks": random.choice([round(random.uniform(30, 100), 2), None]),  # Random marks or None
        "grade": random.choice(["A", "B", "C", "D", "F", None]),  # Random grade or None
    }
    for result_id, row in enumerate(sampled_enrollments.itertuples(), start=1)
]

df = pd.DataFrame(results)

# Add 10% duplicate rows to simulate real-world data issues
df = pd.concat([df, df.sample(frac=0.1, replace=True)], ignore_index=True)

# ======================================================
# STEP 5: Save the generated results data as a CSV file in the specified volume path
# ======================================================
file_name = "results.csv"
output_path = os.path.join(volume_path, file_name)

df.to_csv(output_path, index=False)  # Write DataFrame to CSV (no index column)

# Print confirmation messages with file path and row count
print(f"✅ CSV '{file_name}' created successfully at: {output_path}")
print(f"🧾 Total Rows: {len(df)} (includes nulls + duplicates)")