In [0]:
import requests
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

In [0]:
def read_file(fileformat,path,**options):
    """Load ``path`` through the notebook's ``spark`` session.

    Parameters
    ----------
    fileformat : str
        Data source format name handed to ``spark.read.format``.
    path : str
        Location passed to ``load``.
    **options
        Reader options forwarded unchanged to ``DataFrameReader.options``.

    Returns
    -------
    The DataFrame produced by ``load``.
    """
    reader = spark.read.format(fileformat)
    configured_reader = reader.options(**options)
    return configured_reader.load(path)

In [0]:
# Raw customer extract; the first CSV line is treated as the header row.
df_customer = spark.read.options(header=True).csv("dbfs:/FileStore/Customer.csv")

In [0]:
# Recursively remove the 'sliver' output folder before it is rewritten below.
dbutils.fs.rm("dbfs:/FileStore/Arch/sliver", recurse=True)

Out[50]: True

In [0]:
# Raw product extract; the first CSV line is treated as the header row.
df_product = spark.read.options(header=True).csv("dbfs:/FileStore/Products.csv")

In [0]:
def write_file(df,Type,path,mode="overwrite"):
    """Persist ``df`` to ``path`` in the given format.

    Parameters
    ----------
    df
        DataFrame to write; only its ``write`` attribute is used.
    Type : str
        Output format name handed to ``DataFrameWriter.format``.
    path : str
        Destination passed to ``save``.
    mode : str, optional
        Save mode handed to ``DataFrameWriter.mode``. Defaults to
        ``"overwrite"``, which is what this helper always used before the
        parameter existed, so existing three-argument calls are unaffected.

    Returns
    -------
    None
    """
    df.write.format(Type).mode(mode).save(path)

In [0]:
# Land the raw products in the Bronze layer. The overwrite mode keeps this cell
# re-runnable: without it the writer uses its default error-if-exists mode and
# fails as soon as the Bronze folder exists from a previous run.
df_product.write.mode("overwrite").parquet('dbfs:/FileStore/Arch/Bronze/product')

In [0]:
# Land the raw customers in the Bronze layer. The overwrite mode keeps this cell
# re-runnable: without it the writer uses its default error-if-exists mode and
# fails as soon as the Bronze folder exists from a previous run.
df_customer.write.mode("overwrite").parquet('dbfs:/FileStore/Arch/Bronze/Customer')

In [0]:
# Read the Bronze product data back from parquet.
df_b_p = spark.read.format("parquet").load('dbfs:/FileStore/Arch/Bronze/product')

In [0]:
# Read the Bronze customer data back from parquet.
df_b_c = spark.read.format("parquet").load('dbfs:/FileStore/Arch/Bronze/Customer')

In [0]:
def null_per(df):
    """Report, per column, the share of rows that are null or an empty string.

    Parameters
    ----------
    df
        DataFrame to profile.

    Returns
    -------
    A DataFrame with one row per column of ``df`` and two string columns:
    ``Column`` (the column name) and ``Null_Percentage`` (formatted as
    ``"<float> %"``). Rows are ordered by the numeric percentage, highest
    first. When ``df`` has no rows every column is reported as ``"0.0 %"``
    instead of dividing by zero.
    """
    result_schema = StructType([
        StructField("Column", StringType(), True),
        StructField("Null_Percentage", StringType(), True),
    ])
    # Temporary numeric column: sorting on the formatted string would be
    # lexicographic ("9.0 %" would rank above "10.0 %").
    sortable_schema = StructType(
        result_schema.fields + [StructField("_pct", DoubleType(), True)]
    )

    column_names = df.columns

    # One aggregation returns the row count plus every per-column count,
    # instead of one filter/count job and one df.count() per column.
    stat_exprs = [F.count(F.lit(1)).alias("_rows")]
    stat_exprs += [
        F.count(
            F.when(F.col(name).isNull() | (F.col(name) == ''), F.lit(1))
        ).alias(f"_c{position}")
        for position, name in enumerate(column_names)
    ]
    stats = df.agg(*stat_exprs).first()
    total_rows = stats[0]

    report_rows = []
    for position, name in enumerate(column_names):
        missing = stats[position + 1]
        percentage = missing * 100 / total_rows if total_rows else 0.0
        report_rows.append((f'{name}', f"{percentage} %", float(percentage)))

    return (
        spark.createDataFrame(report_rows, schema=sortable_schema)
        .orderBy(F.desc("_pct"))
        .drop("_pct")
    )

In [0]:
# Null / empty-string profile of the Bronze customer data.
display(null_per(df_b_c))

Column,Null_Percentage
CustomerID,0.0 %
ProductID (FK),0.0 %
Address,0.0 %
Phone,0.0 %
Email,0.0 %
Name,0.0 %


In [0]:
# Null / empty-string profile of the Bronze product data.
display(null_per(df_b_p))

Column,Null_Percentage
ProductID,0.0 %
Stock,0.0 %
Price,0.0 %
Description,0.0 %
ProductName,0.0 %


In [0]:
def select_columns(df,*colmns):
    """Project ``df`` onto the given columns.

    Parameters
    ----------
    df
        Object exposing ``select``.
    *colmns
        Columns forwarded positionally to ``df.select``.

    Returns
    -------
    Whatever ``df.select`` returns.
    """
    projected = df.select(*colmns)
    return projected

In [0]:
# Show the full Bronze product data, then the three columns kept downstream.
display(df_b_p)
df_Product_needed = df_b_p.select('ProductID', 'ProductName', 'Price')
display(df_Product_needed)
# Same projection again, this time through the select_columns helper.
display(select_columns(df_b_p, 'ProductID', 'ProductName', 'Price'))

ProductID,ProductName,Description,Price,Stock
101,Widget A,High-quality widget,19.99,100
102,Widget B,Advanced widget with features,29.99,50
103,Widget C,Budget-friendly widget,14.99,200
104,Widget D,Widget for special use,39.99,25
105,Widget E,Compact and portable widget,24.99,75
106,Widget F,Wireless widget with Bluetooth,49.99,30
107,Widget G,Heavy-duty widget for industrial use,79.99,15
108,Widget H,Miniature pocket-sized widget,9.99,150


ProductID,ProductName,Price
101,Widget A,19.99
102,Widget B,29.99
103,Widget C,14.99
104,Widget D,39.99
105,Widget E,24.99
106,Widget F,49.99
107,Widget G,79.99
108,Widget H,9.99


ProductID,ProductName,Price
101,Widget A,19.99
102,Widget B,29.99
103,Widget C,14.99
104,Widget D,39.99
105,Widget E,24.99
106,Widget F,49.99
107,Widget G,79.99
108,Widget H,9.99


In [0]:
# Persist the trimmed product columns under the 'sliver' folder as parquet.
write_file(
    df_Product_needed,
    Type='parquet',
    path='dbfs:/FileStore/Arch/sliver/product',
)

In [0]:
# Show the full Bronze customer data, then the three columns kept downstream.
display(df_b_c)
df_Customer_needed = df_b_c.select('CustomerID', 'Name', 'ProductID (FK)')
display(df_Customer_needed)

CustomerID,Name,Email,Phone,Address,ProductID (FK)
1,John Doe,johndoe@email.com,123-456-7890,123 Main St,101
2,Jane Smith,janesmith@email.com,987-654-3210,456 Elm St,102
3,Mark Johnson,markjohnson@email.com,555-123-7890,789 Oak St,101
4,Susan Lee,susanlee@email.com,777-888-9999,321 Pine St,104
5,David Wilson,davidwilson@email.com,444-555-6666,555 Birch St,103
6,Linda Brown,lindabrown@email.com,333-222-1111,678 Maple St,106
7,Michael Davis,michaeldavis@email.com,222-333-4444,987 Cedar St,107
8,Sarah White,sarahwhite@email.com,111-555-7777,222 Oak St,108
9,Chris Martin,chrismartin@email.com,777-666-5555,555 Elm St,101
10,Emily Thomas,emilythomas@email.com,444-888-2222,333 Pine St,102


CustomerID,Name,ProductID (FK)
1,John Doe,101
2,Jane Smith,102
3,Mark Johnson,101
4,Susan Lee,104
5,David Wilson,103
6,Linda Brown,106
7,Michael Davis,107
8,Sarah White,108
9,Chris Martin,101
10,Emily Thomas,102


In [0]:
def join_table(df1,df2,df1_common_column,df2_common_column,join_type):
    """Join ``df1`` to ``df2`` on equality of one column from each side.

    Parameters
    ----------
    df1, df2
        Left and right inputs; each is indexed by its key column name.
    df1_common_column, df2_common_column
        Key column name on the left and right side respectively.
    join_type
        Join type forwarded as the third positional argument of ``df1.join``.

    Returns
    -------
    Whatever ``df1.join`` returns.
    """
    left_key = df1[df1_common_column]
    right_key = df2[df2_common_column]
    join_condition = left_key == right_key
    return df1.join(df2, join_condition, join_type)

In [0]:
# Left join: keep every customer and attach the product they reference.
joined_df = join_table(
    df_Customer_needed,
    df_Product_needed,
    "ProductID (FK)",
    "ProductID",
    "left",
)


In [0]:
# Inspect the customer/product join result.
display(joined_df)

CustomerID,Name,ProductID (FK),ProductID,ProductName,Price
1,John Doe,101,101,Widget A,19.99
2,Jane Smith,102,102,Widget B,29.99
3,Mark Johnson,101,101,Widget A,19.99
4,Susan Lee,104,104,Widget D,39.99
5,David Wilson,103,103,Widget C,14.99
6,Linda Brown,106,106,Widget F,49.99
7,Michael Davis,107,107,Widget G,79.99
8,Sarah White,108,108,Widget H,9.99
9,Chris Martin,101,101,Widget A,19.99
10,Emily Thomas,102,102,Widget B,29.99


In [0]:
def group_and_aggregate(df, group_columns, aggregation_columns):
    """Group ``df`` and apply one aggregate per requested column.

    Parameters
    ----------
    df
        Input indexed by column name to build the grouping keys.
    group_columns
        Iterable of column names to group by.
    aggregation_columns
        Iterable of ``(column_name, aggregate_function)`` pairs. Each function
        is called with the column name and its result is aliased
        ``total_<column_name>``.

    Returns
    -------
    The result of ``df.groupBy(...).agg(...)``.
    """
    grouping_keys = []
    for name in group_columns:
        grouping_keys.append(df[name])

    aggregates = []
    for name, agg_fn in aggregation_columns:
        aggregates.append(agg_fn(name).alias(f'total_{name}'))

    return df.groupBy(*grouping_keys).agg(*aggregates)

In [0]:
# Grouping keys and (column, aggregate function) pairs for group_and_aggregate.
group_columns = ['ProductID', 'ProductName']
aggregation_columns = [('Price', sum), ('CustomerID', count)]

In [0]:
# Aggregate the joined data per product via the helper and show the result.
grouped_df = group_and_aggregate(
    df=joined_df,
    group_columns=group_columns,
    aggregation_columns=aggregation_columns,
)
display(grouped_df)

ProductID,ProductName,total_Price,total_CustomerID
104,Widget D,199.95,5
107,Widget G,399.95,5
105,Widget E,99.96,4
106,Widget F,249.95,5
101,Widget A,119.93999999999998,6
103,Widget C,74.95,5
102,Widget B,149.95,5
108,Widget H,49.95,5


In [0]:
# Same per-product aggregation written inline, with explicit output names.
grouped_df = (
    joined_df
    .groupBy('ProductID', 'ProductName')
    .agg(
        sum('Price').alias('total_price'),
        count('CustomerID').alias('total_customers'),
    )
)

In [0]:
# Inspect the inline aggregation result.
display(grouped_df)

ProductID,ProductName,total_price,total_customers
104,Widget D,199.95,5
107,Widget G,399.95,5
105,Widget E,99.96,4
106,Widget F,249.95,5
101,Widget A,119.93999999999998,6
103,Widget C,74.95,5
102,Widget B,149.95,5
108,Widget H,49.95,5


In [0]:
# NOTE(review): redefines read_file from an earlier cell of this notebook.
def read_file(fileformat,path,**options):
    """Load ``path`` through the notebook's ``spark`` session.

    Parameters
    ----------
    fileformat : str
        Data source format name handed to ``spark.read.format``.
    path : str
        Location passed to ``load``.
    **options
        Reader options forwarded unchanged to ``DataFrameReader.options``.

    Returns
    -------
    The DataFrame produced by ``load``.
    """
    reader = spark.read.format(fileformat)
    configured_reader = reader.options(**options)
    return configured_reader.load(path)

In [0]:
# NOTE(review): redefines null_per from an earlier cell of this notebook.
def null_per(df):
    """Report, per column, the share of rows that are null or an empty string.

    Parameters
    ----------
    df
        DataFrame to profile.

    Returns
    -------
    A DataFrame with one row per column of ``df`` and two string columns:
    ``Column`` (the column name) and ``Null_Percentage`` (formatted as
    ``"<float> %"``). Rows are ordered by the numeric percentage, highest
    first. When ``df`` has no rows every column is reported as ``"0.0 %"``
    instead of dividing by zero.
    """
    result_schema = StructType([
        StructField("Column", StringType(), True),
        StructField("Null_Percentage", StringType(), True),
    ])
    # Temporary numeric column: sorting on the formatted string would be
    # lexicographic ("9.0 %" would rank above "10.0 %").
    sortable_schema = StructType(
        result_schema.fields + [StructField("_pct", DoubleType(), True)]
    )

    column_names = df.columns

    # One aggregation returns the row count plus every per-column count,
    # instead of one filter/count job and one df.count() per column.
    stat_exprs = [F.count(F.lit(1)).alias("_rows")]
    stat_exprs += [
        F.count(
            F.when(F.col(name).isNull() | (F.col(name) == ''), F.lit(1))
        ).alias(f"_c{position}")
        for position, name in enumerate(column_names)
    ]
    stats = df.agg(*stat_exprs).first()
    total_rows = stats[0]

    report_rows = []
    for position, name in enumerate(column_names):
        missing = stats[position + 1]
        percentage = missing * 100 / total_rows if total_rows else 0.0
        report_rows.append((f'{name}', f"{percentage} %", float(percentage)))

    return (
        spark.createDataFrame(report_rows, schema=sortable_schema)
        .orderBy(F.desc("_pct"))
        .drop("_pct")
    )

In [0]:
# NOTE(review): redefines select_columns from an earlier cell of this notebook.
def select_columns(df,*colmns):
    """Project ``df`` onto the given columns.

    Parameters
    ----------
    df
        Object exposing ``select``.
    *colmns
        Columns forwarded positionally to ``df.select``.

    Returns
    -------
    Whatever ``df.select`` returns.
    """
    projected = df.select(*colmns)
    return projected

In [0]:
# NOTE(review): redefines join_table from an earlier cell of this notebook.
def join_table(df1,df2,df1_common_column,df2_common_column,join_type):
    """Join ``df1`` to ``df2`` on equality of one column from each side.

    Parameters
    ----------
    df1, df2
        Left and right inputs; each is indexed by its key column name.
    df1_common_column, df2_common_column
        Key column name on the left and right side respectively.
    join_type
        Join type forwarded as the third positional argument of ``df1.join``.

    Returns
    -------
    Whatever ``df1.join`` returns.
    """
    left_key = df1[df1_common_column]
    right_key = df2[df2_common_column]
    join_condition = left_key == right_key
    return df1.join(df2, join_condition, join_type)

In [0]:
# NOTE(review): redefines group_and_aggregate from an earlier cell of this notebook.
def group_and_aggregate(df, group_columns, aggregation_columns):
    """Group ``df`` and apply one aggregate per requested column.

    Parameters
    ----------
    df
        Input indexed by column name to build the grouping keys.
    group_columns
        Iterable of column names to group by.
    aggregation_columns
        Iterable of ``(column_name, aggregate_function)`` pairs. Each function
        is called with the column name and its result is aliased
        ``total_<column_name>``.

    Returns
    -------
    The result of ``df.groupBy(...).agg(...)``.
    """
    grouping_keys = []
    for name in group_columns:
        grouping_keys.append(df[name])

    aggregates = []
    for name, agg_fn in aggregation_columns:
        aggregates.append(agg_fn(name).alias(f'total_{name}'))

    return df.groupBy(*grouping_keys).agg(*aggregates)

In [0]:
# NOTE(review): redefines write_file from an earlier cell of this notebook.
def write_file(df,Type,path,mode="overwrite"):
    """Persist ``df`` to ``path`` in the given format.

    Parameters
    ----------
    df
        DataFrame to write; only its ``write`` attribute is used.
    Type : str
        Output format name handed to ``DataFrameWriter.format``.
    path : str
        Destination passed to ``save``.
    mode : str, optional
        Save mode handed to ``DataFrameWriter.mode``. Defaults to
        ``"overwrite"``, which is what this helper always used before the
        parameter existed, so existing three-argument calls are unaffected.

    Returns
    -------
    None
    """
    df.write.format(Type).mode(mode).save(path)