### Defining Schema Structure in Databricks

In [0]:
%sql
-- Provision the Unity Catalog objects used by the rest of the notebook:
-- catalog -> schema -> volume. Every statement is guarded, so re-running is safe.
CREATE CATALOG IF NOT EXISTS telecom_catalog_assign;
CREATE SCHEMA  IF NOT EXISTS telecom_catalog_assign.landing_zone;
CREATE VOLUME  IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol;

In [0]:
# Create one landing directory per feed inside the volume
# (the tower feed is split into one sub-directory per region).
for landing_dir in (
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/",
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/",
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/",
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/",
):
    dbutils.fs.mkdirs(landing_dir)

In [0]:
# Customer feed: comma-separated, NO header row.
# Row 105 has an empty name and row 106 has a non-numeric age.
customer_csv = "\n".join([
    "101,Arun,31,Chennai,PREPAID",
    "102,Meera,45,Bangalore,POSTPAID",
    "103,Irfan,29,Hyderabad,PREPAID",
    "104,Raj,52,Mumbai,POSTPAID",
    "105,,27,Delhi,PREPAID",
    "106,Sneha,abc,Pune,PREPAID",
])

# Usage feed: tab-separated, first row is the header.
usage_tsv = "\n".join(
    "\t".join(fields)
    for fields in (
        ("customer_id", "voice_mins", "data_mb", "sms_count"),
        ("101", "320", "1500", "20"),
        ("102", "120", "4000", "5"),
        ("103", "540", "600", "52"),
        ("104", "45", "200", "2"),
        ("105", "0", "0", "0"),
    )
)

# Tower log feed: pipe-separated, first row is the header.
tower_logs_region1 = "\n".join(
    "|".join(fields)
    for fields in (
        ("event_id", "customer_id", "tower_id", "signal_strength", "timestamp"),
        ("5001", "101", "TWR01", "-80", "2025-01-10 10:21:54"),
        ("5004", "104", "TWR05", "-75", "2025-01-10 11:01:12"),
    )
)

In [0]:
# (destination file, payload) pairs; the third argument to put() is the overwrite flag.
# NOTE(review): the region2 file is written with the region1 sample rows — confirm this is intended.
landing_files = (
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv", customer_csv),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv", usage_tsv),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv", tower_logs_region1),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/tower_logs_region2.csv", tower_logs_region1),
)
for landing_path, payload in landing_files:
    dbutils.fs.put(landing_path, payload, True)

### Read Operations

In [0]:
# Read every region directory under tower/ through a wildcard path.
# The tower logs are pipe-delimited, so the separator must be given explicitly;
# with the default ',' each row would be loaded as a single unparsed column.
tower_multiple_path = spark.read.csv(
    path="/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/*",
    sep="|",
    inferSchema=True,
    header=True,
)
tower_multiple_path.show(100, truncate=False)

In [0]:
# Read the tower logs from an explicit list of region directories,
# keeping only files whose name matches the glob filter.
tower_region_dirs = [
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/",
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/",
]
tower_df_recur = spark.read.csv(
    path=tower_region_dirs,
    sep='|',
    header=True,
    inferSchema=True,
    pathGlobFilter="tower_logs_region*.csv",
)
tower_df_recur.show(truncate=False)

In [0]:
# Walk tower/ recursively and keep only files matching the glob filter.
# The tower logs are pipe-delimited, so sep='|' is required; without it every
# row is loaded as one column named after the whole header line.
df1_multiple_files = spark.read.csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/",
    sep="|",
    header=True,
    inferSchema=True,
    recursiveFileLookup=True,
    pathGlobFilter="tower_logs_region1*.csv",
)
df1_multiple_files.show(truncate=False)

In [0]:
# Headerless read of the customer file: Spark assigns default column names
# and infers the column types from the data.
customer_reader = spark.read.format("csv").option("inferSchema", "true")
customer_df = customer_reader.load(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv"
)
customer_df.show()

In [0]:
# The customer file has no header row, so header must be "false": with header="true"
# the first record (customer 101) is consumed as column names and lost.
# Column names are assigned afterwards with toDF().
customer_df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="false")
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv")
    .toDF("id", "name", "age", "city", "plan")
)
customer_df.show()

In [0]:
# Tab-separated usage feed: the first row supplies the column names, types are inferred.
usage_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .option("sep", "\t")
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv")
)
usage_df.show()

In [0]:
# Column names of the usage feed, in file order.
usage_schema = ["customer_id","voice_mins","data_mb","sms_count"]

# A schema has to be supplied through DataFrameReader.schema(); passing it as
# options(schema=...) only sets an unrecognised option and the schema is never applied.
# schema() takes a StructType or a DDL string, so build the DDL from the name list.
# Every column in the sample feed holds whole numbers, hence INT.
usage_schema_ddl = ", ".join(f"{column_name} INT" for column_name in usage_schema)

usage_df = (
    spark.read.format("csv")
    .options(header="true", sep="\t")  # header row is still present in the file and must be skipped
    .schema(usage_schema_ddl)
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv")
)
usage_df.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema for the pipe-delimited tower log feed (all columns nullable).
tower_schema = StructType([
    StructField("event_id", IntegerType(), True),
    StructField("customer_id", StringType(), True),
    StructField("tower_id", StringType(), True),
    StructField("signal_strength", StringType(), True),
    StructField("timestamp", StringType(), True)
])

# The schema must be passed through DataFrameReader.schema(); given as
# options(schema=...) it is silently ignored and every column is read as a string.
# NOTE(review): with the schema applied, event_id is an integer. Targets written
# earlier with a string event_id may need to be recreated — confirm.
tower_df = (
    spark.read.format("csv")
    .schema(tower_schema)
    .options(header="true", sep="|")
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv")
)
tower_df.show()

In [0]:
# CSV targets, one write mode per feed.
# Customers: collapsed to a single partition, replaces any previous output.
(
    customer_df.coalesce(1)
    .write.format("csv")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata/")
)
# Usage: appended to whatever is already in the target directory.
(
    usage_df.write.format("csv")
    .mode("append")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata/")
)
# Tower logs: replaced, written with a header row.
(
    tower_df_recur.write.format("csv")
    .mode("overwrite")
    .option('header', 'true')
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata/")
)

In [0]:
# Read back the tower CSV target. It is written with the default ',' delimiter,
# so it must not be read with sep='|' (that collapses each row into one column).
spark.read.option("header", "true").csv("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata/").show(5)

In [0]:
# JSON targets, one write mode per feed.
# Customers: single partition, replaces any previous output.
(
    customer_df.coalesce(1)
    .write.format("json")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_json/")
)
# Usage: single partition, snappy-compressed, appended to the existing output.
(
    usage_df.coalesce(1)
    .write.format("json")
    .mode("append")
    .option('compression', 'snappy')
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata_json/")
)
# Tower logs: replaces any previous output.
(
    tower_df_recur.write.format("json")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_json/")
)

In [0]:
# Read back the tower JSON target. JSON records carry their own field names,
# so the CSV-only "header" option does not apply and has been dropped.
spark.read.json("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_json/").show(5)

In [0]:
# Parquet targets, one write mode per feed.
# Customers: gzip-compressed, replaces any previous output.
(
    customer_df.write.format("parquet")
    .mode("overwrite")
    .option("compression", "gzip")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_parquet/")
)
# Usage: mode "error" raises if the target path already exists, so this
# statement fails when the cell is run a second time.
(
    usage_df.write.format("parquet")
    .mode("error")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata_parquet/")
)
# Tower logs: gzip-compressed, replaces any previous output.
(
    tower_df.write.format("parquet")
    .mode("overwrite")
    .option("compression", "gzip")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_parquet/")
)


In [0]:
# Read back the tower Parquet target and print it without truncating cell values.
tower_parquet_check = spark.read.parquet("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_parquet/")
tower_parquet_check.show(truncate=False)

In [0]:
# ORC targets: customers and tower logs replace previous output, usage is appended.
(
    customer_df.write.format("orc")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_orc/")
)
(
    usage_df.write.format("orc")
    .mode("append")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata_orc/")
)
(
    tower_df.write.format("orc")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_orc/")
)

In [0]:
 # Read back the tower ORC target to verify the write.
 spark.read.orc("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_orc/").show()

In [0]:
# Customers -> Delta files at a volume path, replacing any previous contents.
customer_df.write.save(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_delta/",
    format="delta",
    mode="overwrite",
)

In [0]:
# Usage -> Delta files at a volume path, appended to the existing contents.
usage_df.write.save(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata_delta/",
    format="delta",
    mode="append",
)

In [0]:
# Tower logs -> Delta files at a volume path, replacing any previous contents.
(
    tower_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/tower_targetdata_delta/")
)

In [0]:
# Read back the customer Delta target and preview the first five rows.
customer_delta_check = spark.read.load(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_delta/",
    format="delta",
)
customer_delta_check.show(5)

In [0]:
# Persist the customers as a table (replacing it if present), then preview three rows.
customer_df.write.mode('overwrite').saveAsTable("default.customertbl")
spark.table('default.customertbl').show(3)

In [0]:
# Persist the usage and tower data as tables (replacing them if present)
# and preview each one straight after the write.
usage_df.write.saveAsTable("default.usagetable",mode='overwrite')
spark.read.table('default.usagetable').show(3)
tower_df.write.saveAsTable("default.towertbl",mode='overwrite')
# Read back the table that was just written: the name is `towertbl`
# (the previous `tower_tbl` does not exist and raised a table-not-found error).
spark.read.table('default.towertbl').show(3)

In [0]:
%sql
-- Remove the tables created by saveAsTable so they can be recreated with explicit DDL.
-- IF EXISTS keeps the cell re-runnable when a table has already been dropped.
DROP TABLE IF EXISTS default.usagetable;
DROP TABLE IF EXISTS default.towertbl;
DROP TABLE IF EXISTS default.customertbl;

In [0]:
%sql
-- Explicit DDL for the three target tables that insertInto() loads by column position.
CREATE TABLE IF NOT EXISTS default.customertbl (
  id INT,
  name STRING,
  age STRING,
  city STRING,
  plan STRING
);
CREATE TABLE IF NOT EXISTS default.towertbl (
  event_id STRING,
  customer_id STRING,
  tower_id STRING,
  signal_strength STRING,
  timestamp STRING
);
CREATE TABLE IF NOT EXISTS default.usagetable (
  customer_id INT,
  voice_mins INT,
  data_mb INT,
  sms_count INT
);


In [0]:
# Load each DataFrame into its pre-created table, replacing existing rows,
# and preview the table right after each load.
for source_df, table_name in (
    (customer_df, "customertbl"),
    (usage_df, "usagetable"),
    (tower_df, "towertbl"),
):
    source_df.write.insertInto(table_name, overwrite=True)
    spark.read.table(table_name).show(3)

In [0]:
# Customers -> XML, one <customer> element per row, replacing any previous output.
(
    customer_df.write
    .mode('overwrite')
    .option('rowTag', 'customer')
    .xml("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/cust_targetdata_xml/")
)

In [0]:
# Usage -> XML, one <usage> element per row, appended to the existing output.
(
    usage_df.write
    .mode('append')
    .option('rowTag', 'usage')
    .xml("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/target/usage_targetdata_xml/")
)