### Defining Schema Structure in Databricks

In [0]:
%sql
-- Provision the catalog -> schema -> volume hierarchy used as the landing area.
-- Each statement is guarded with IF NOT EXISTS so the cell can be re-run safely.
CREATE CATALOG IF NOT EXISTS telecom_catalog_assign;
CREATE SCHEMA  IF NOT EXISTS telecom_catalog_assign.landing_zone;
CREATE VOLUME  IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol;

In [0]:
# Folder layout inside the landing volume: one folder per feed, and one
# sub-folder per region for the tower logs.
customer_dir = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/"
usage_dir = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/"
tower_region1_dir = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/"
tower_region2_dir = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/"

# Create the folders in the same order as before.
dbutils.fs.mkdirs(customer_dir)
dbutils.fs.mkdirs(usage_dir)
dbutils.fs.mkdirs(tower_region1_dir)
dbutils.fs.mkdirs(tower_region2_dir)

In [0]:
# Inline sample payloads for the landing files. Each payload is built from a
# list of rows joined with newlines, with no trailing newline.

# Customer feed: comma-separated and WITHOUT a header line.
# Row 105 has an empty second field and row 106 has a non-numeric third field.
customer_csv = "\n".join([
    "101,Arun,31,Chennai,PREPAID",
    "102,Meera,45,Bangalore,POSTPAID",
    "103,Irfan,29,Hyderabad,PREPAID",
    "104,Raj,52,Mumbai,POSTPAID",
    "105,,27,Delhi,PREPAID",
    "106,Sneha,abc,Pune,PREPAID",
])

# Usage feed: tab-separated, first line is the header.
usage_tsv = "\n".join([
    "customer_id\tvoice_mins\tdata_mb\tsms_count",
    "101\t320\t1500\t20",
    "102\t120\t4000\t5",
    "103\t540\t600\t52",
    "104\t45\t200\t2",
    "105\t0\t0\t0",
])

# Tower log feed: pipe-separated, first line is the header.
tower_logs_region1 = "\n".join([
    "event_id|customer_id|tower_id|signal_strength|timestamp",
    "5001|101|TWR01|-80|2025-01-10 10:21:54",
    "5004|104|TWR05|-75|2025-01-10 11:01:12",
])

In [0]:
# Target files inside the landing volume, one per feed.
customer_file = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv"
usage_file = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv"
tower_region1_file = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv"
tower_region2_file = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/tower_logs_region2.csv"

# Third positional argument True = overwrite an existing file, so the cell can be re-run.
dbutils.fs.put(customer_file, customer_csv, True)
# The usage payload is tab-separated even though the file name ends in .csv.
dbutils.fs.put(usage_file, usage_tsv, True)
dbutils.fs.put(tower_region1_file, tower_logs_region1, True)
# NOTE(review): region2 is seeded with the region1 payload — confirm this duplication is intended.
dbutils.fs.put(tower_region2_file, tower_logs_region1, True)

### Read Operations

In [0]:
# Read the tower logs through a wildcard path: the glob matches every entry
# directly under tower/ (the region folders), so all regions load in one pass.
tower_glob = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/*"
tower_multiple_path = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(tower_glob)
)
tower_multiple_path.show(100, truncate=False)

In [0]:
# Read from an explicit list of region folders; pathGlobFilter restricts the
# load to files whose names match the tower-log pattern.
region_dirs = [
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/",
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region2/",
]
df1_multiple_files = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        pathGlobFilter="tower_logs_region*.csv",
    )
    .csv(region_dirs)
)
df1_multiple_files.show(truncate=False)

In [0]:
# Start at the tower/ root and descend into sub-folders (recursiveFileLookup),
# keeping only files that match the region1 file-name pattern.
tower_root = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/"
df1_multiple_files = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        recursiveFileLookup=True,
        pathGlobFilter="tower_logs_region1*.csv",
    )
    .csv(tower_root)
)
df1_multiple_files.show(truncate=False)

In [0]:
# Load the customer file with inferred column types. No header option is set,
# so every line is read as data and Spark assigns its default column names.
customer_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv"
customer_reader = spark.read.format("csv").option("inferSchema", "true")
customer_df = customer_reader.load(customer_path)
customer_df.show()

In [0]:
# Load the customer file and assign readable column names.
#
# customer_csv.csv is written without a header line, so header must be "false".
# With header="true" the first record (customer 101) was consumed as column
# names and silently dropped from the DataFrame before toDF() renamed the columns.
customer_columns = ["id", "name", "age", "city", "plan"]
customer_df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="false")
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv")
    .toDF(*customer_columns)  # positional rename of the default column names
)
customer_df.show()

In [0]:
# Load the tab-separated usage file; the first line supplies the column names
# and the column types are inferred from the data.
usage_path = "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv"
usage_df = (
    spark.read.format("csv")
    .option("sep", "\t")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(usage_path)
)
usage_df.show()

In [0]:
# Load the usage file with an explicit schema instead of inference.
#
# A schema must be supplied through DataFrameReader.schema(); passing it as a
# "schema" entry in .options(...) only records a string option the CSV source
# does not use, so the intended schema was never applied. A bare list of column
# names is also not a schema, so the names are turned into a DDL string here.
usage_schema = ["customer_id","voice_mins","data_mb","sms_count"]
# Every usage column in the sample file holds whole numbers, hence INT throughout.
usage_schema_ddl = ", ".join(f"{column_name} INT" for column_name in usage_schema)
usage_df = (
    spark.read.format("csv")
    .options(header="true", sep="\t")  # header line is skipped; types come from the schema
    .schema(usage_schema_ddl)
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv")
)
usage_df.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema for the pipe-separated tower log files.
# NOTE(review): customer_id, signal_strength and timestamp are kept as strings,
# exactly as originally declared — confirm whether IntegerType / TimestampType
# was intended for them.
tower_schema = StructType([
    StructField("event_id", IntegerType(), True),
    StructField("customer_id", StringType(), True),
    StructField("tower_id", StringType(), True),
    StructField("signal_strength", StringType(), True),
    StructField("timestamp", StringType(), True)
])

# The StructType has to go through DataFrameReader.schema(). Passed as a
# "schema" entry in .options(...) it is not applied, and with inference off
# every column (including event_id) came back as a string.
# inferSchema is omitted because an explicit schema makes it irrelevant.
tower_df = (
    spark.read.format("csv")
    .options(header="true", sep="|")
    .schema(tower_schema)
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/region1/tower_logs_region1.csv")
)
tower_df.show()