In [0]:
# List the entries directly under dbfs:/databricks-datasets/ and render them as a table.
datasets_listing = dbutils.fs.ls("dbfs:/databricks-datasets/")
display(datasets_listing)

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0


In [0]:
# List the files in the "sandwich" CSV directory to find the dataset file name.
sandwich_listing = dbutils.fs.ls("dbfs:/databricks-datasets/Rdatasets/data-001/csv/sandwich/")
display(sandwich_listing)


path,name,size,modificationTime
dbfs:/databricks-datasets/Rdatasets/data-001/csv/sandwich/PublicSchools.csv,PublicSchools.csv,1074,1416619991000


In [0]:
# Read the CSV using its first line as the header and let Spark infer column types.
df_preview = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("dbfs:/databricks-datasets/Rdatasets/data-001/csv/sandwich/PublicSchools.csv")
)

# Show the schema that Spark detected.
df_preview.printSchema()

# Show the first few rows.
df_preview.show(n=5)


root
 |-- _c0: string (nullable = true)
 |-- Expenditure: string (nullable = true)
 |-- Income: integer (nullable = true)

+----------+-----------+------+
|       _c0|Expenditure|Income|
+----------+-----------+------+
|   Alabama|        275|  6247|
|    Alaska|        821| 10851|
|   Arizona|        339|  7374|
|  Arkansas|        275|  6183|
|California|        387|  8850|
+----------+-----------+------+
only showing top 5 rows



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema: one string column followed by two integer columns, all nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("State", StringType()),
            ("Expenditure", IntegerType()),
            ("Income", IntegerType()),
        )
    ]
)


In [0]:
# Read the same CSV again, this time applying the explicit `schema` instead of inference.
# NOTE(review): schema inference on this file reported Expenditure as string, so the
# file presumably contains at least one non-numeric value in that column; confirm how
# that value is read once the column is declared as IntegerType.
df = (
    spark.read.format("csv")
    .options(header="true")
    .schema(schema)
    .load("dbfs:/databricks-datasets/Rdatasets/data-001/csv/sandwich/PublicSchools.csv")
)

# Check the declared column names and types, then preview the data.
df.printSchema()
df.show(n=5)

root
 |-- State: string (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Income: integer (nullable = true)

+----------+-----------+------+
|     State|Expenditure|Income|
+----------+-----------+------+
|   Alabama|        275|  6247|
|    Alaska|        821| 10851|
|   Arizona|        339|  7374|
|  Arkansas|        275|  6183|
|California|        387|  8850|
+----------+-----------+------+
only showing top 5 rows



In [0]:
# Sample rows kept entirely as strings so that non-numeric text can be placed
# in the columns that `schema` declares as integers.
corrupted_data = [
    ("Texas", "ERROR", "20000"),     # non-numeric Expenditure
    ("New York", "350", "INVALID"),  # non-numeric Income
    ("Arizon4", "400", "7000"),      # both numeric fields are valid integers
]

# Same column names as `schema`, but every column is a nullable string.
corrupted_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("State", "Expenditure", "Income")
    ]
)

corrupted_df = spark.createDataFrame(corrupted_data, schema=corrupted_schema)

# Replace any earlier output; Spark writes a directory of part files at this path.
corrupted_df.write.csv("dbfs:/tmp/corrupted_schools.csv", mode="overwrite", header="true")


In [0]:
# PERMISSIVE mode: every row is kept; in the recorded output the fields that
# do not fit `schema` appear as null.
df_permissive = (
    spark.read.format("csv")
    .options(header="true", mode="PERMISSIVE")
    .schema(schema)
    .load("dbfs:/tmp/corrupted_schools.csv")
)

print("PERMISSIVE MODE")
df_permissive.show()


PERMISSIVE MODE
+--------+-----------+------+
|   State|Expenditure|Income|
+--------+-----------+------+
|New York|        350|  null|
|   Texas|       null| 20000|
| Arizon4|        400|  7000|
+--------+-----------+------+



In [0]:
# DROPMALFORMED mode: in the recorded output only the row whose fields all fit
# `schema` remains.
df_dropmalformed = (
    spark.read.format("csv")
    .options(header="true", mode="DROPMALFORMED")
    .schema(schema)
    .load("dbfs:/tmp/corrupted_schools.csv")
)

print("DROPMALFORMED MODE")
df_dropmalformed.show()

DROPMALFORMED MODE
+-------+-----------+------+
|  State|Expenditure|Income|
+-------+-----------+------+
|Arizon4|        400|  7000|
+-------+-----------+------+



In [0]:
# FAILFAST mode: reading the corrupted file is expected to raise; the error is
# caught and printed so the notebook keeps running.
try:
    df_failfast = (
        spark.read.format("csv")
        .options(header="true", mode="FAILFAST")
        .schema(schema)
        .load("dbfs:/tmp/corrupted_schools.csv")
    )

    print(" FAILFAST MODE")
    # In the recorded run the failure surfaces at this call (showString), after
    # the banner above has been printed, so it must stay inside the try block.
    df_failfast.show()
except Exception as err:
    print(f" Błąd w trybie FAILFAST: {err}")

 FAILFAST MODE
 Błąd w trybie FAILFAST: An error occurred while calling o5344.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 129.0 failed 1 times, most recent failure: Lost task 0.0 in stage 129.0 (TID 340) (ip-10-172-162-99.us-west-2.compute.internal executor driver): com.databricks.sql.io.FileReadException: Error while reading file dbfs:/tmp/corrupted_schools.csv/part-00005-tid-7231755892523415623-cdb6ce25-50bf-4638-b6f3-ebdfa6ae420f-329-1-c000.csv.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.logFileNameAndThrow(FileScanRDD.scala:704)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:673)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:796)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.$anonfun$hasNext$1(FileScanRDD

In [0]:
# Persist the typed DataFrame as Parquet and as JSON, replacing any earlier output.
df.write.parquet("dbfs:/tmp/schools_parquet", mode="overwrite")
df.write.json("dbfs:/tmp/schools_json", mode="overwrite")


In [0]:
# Load the Parquet copy back and preview the first rows.
df_parquet = spark.read.format("parquet").load("dbfs:/tmp/schools_parquet")
print("Dane Parquet:")
df_parquet.show(n=5)

Dane Parquet:
+----------+-----------+------+
|     State|Expenditure|Income|
+----------+-----------+------+
|   Alabama|        275|  6247|
|    Alaska|        821| 10851|
|   Arizona|        339|  7374|
|  Arkansas|        275|  6183|
|California|        387|  8850|
+----------+-----------+------+
only showing top 5 rows



In [0]:
# Load the JSON copy back (no schema is supplied) and preview the first rows.
# In the recorded output the columns come back ordered Expenditure, Income, State.
df_json = spark.read.format("json").load("dbfs:/tmp/schools_json")
print("Dane JSON:")
df_json.show(n=5)

Dane JSON:
+-----------+------+----------+
|Expenditure|Income|     State|
+-----------+------+----------+
|        275|  6247|   Alabama|
|        821| 10851|    Alaska|
|        339|  7374|   Arizona|
|        275|  6183|  Arkansas|
|        387|  8850|California|
+-----------+------+----------+
only showing top 5 rows

