In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
#----zadanie 2--------------
# Tworzenie schematu dla pliku movies.csv
movies_schema = StructType([
    StructField("title", StringType(), True),            
    StructField("year", IntegerType(), True),          
    StructField("genre", StringType(), True),           
    StructField("country", StringType(), True),          
    StructField("date_published", StringType(), True),   
    StructField("duration", IntegerType(), True),        
    StructField("director", StringType(), True),       
    StructField("avg_vote", DoubleType(), True)         
])

# Ścieżka do pliku CSV w Databricks
filePath = "dbfs:/FileStore/tables/Files/movies.csv"

# Wczytanie danych do DataFrame przy użyciu wcześniej zdefiniowanego schematu
moviesDf = spark.read.format("csv") \
            .option("header","true") \
            .schema(movies_schema) \
            .load(filePath)

# Wyświetlenie danych
display(moviesDf)

# zadanie 3
corrupted_file_path = "dbfs:/FileStore/tables/Files/movies_corrupted.csv"

corrupted_data = """title,year,genre,country,date_published,duration,director,avg_vote
Inception,2010,Sci-Fi,USA,2010-07-16,148,Christopher Nolan,8.8
Titanic,1997,Drama,USA,1997-12-19,195,James Cameron,7.8
Fake Movie,ERROR,Action,ERROR,2024-01-01,WRONG,Unknown,BAD_DATA
"""

# Zapisujemy błędne dane do pliku w Databricks
dbutils.fs.put(corrupted_file_path, corrupted_data, True)

# Tryb PERMISSIVE (domyślny) - błędne dane zamienia na NULL
df_permissive = spark.read.format("csv") \
    .option("header", "true") \
    .schema(movies_schema) \
    .option("mode", "PERMISSIVE") \
    .load(corrupted_file_path)

display(df_permissive)

# Tryb DROPMALFORMED - usuwa całe błędne wiersze
df_dropmalformed = spark.read.format("csv") \
    .option("header", "true") \
    .schema(movies_schema) \
    .option("mode", "DROPMALFORMED") \
    .load(corrupted_file_path)

display(df_dropmalformed)

# Tryb FAILFAST - zatrzymuje się na błędzie
df_failfast = spark.read.format("csv") \
        .option("header", "true") \
        .schema(movies_schema) \
        .option("mode", "FAILFAST") \
        .load(corrupted_file_path)


# zadanie 4
# Ścieżka do zapisania pliku Parquet
parquet_path = "dbfs:/FileStore/tables/Files/movies_parquet"

# Zapis DataFrame do Parquet
df_permissive.write.mode("overwrite").parquet(parquet_path)

# Odczytanie pliku Parquet do nowego DataFrame
df_parquet = spark.read.parquet(parquet_path)

# Wyświetlenie danych
display(df_parquet)

#-------------------------
# Ścieżka do zapisania pliku JSON
json_path = "dbfs:/FileStore/tables/Files/movies_json"

# Zapis DataFrame do JSON
df_permissive.write.mode("overwrite").json(json_path)

# Odczytanie pliku JSON do nowego DataFrame
df_json = spark.read.json(json_path)

# Wyświetlenie danych
display(df_json)





title,year,genre,country,date_published,duration,director,avg_vote
tt0000009,,Miss Jerry,1894,1894-10-09,,45,
tt0000574,,The Story of the Kelly Gang,1906,26.12.1906,,70,
tt0001892,,Den sorte drøm,1911,19.08.1911,,53,
tt0002101,,Cleopatra,1912,13.11.1912,,100,
tt0002130,,L'Inferno,1911,06.03.1911,,68,
tt0002199,,"From the Manger to the Cross or, Jesus of Nazareth",1912,1913,,60,
tt0002423,,Madame DuBarry,1919,26.11.1919,,85,
tt0002445,,Quo Vadis?,1913,01.03.1913,,120,
tt0002452,,Independenta Romaniei,1912,01.09.1912,,120,
tt0002461,,Richard III,1912,15.10.1912,,55,


Wrote 250 bytes.


title,year,genre,country,date_published,duration,director,avg_vote
Inception,2010.0,Sci-Fi,USA,2010-07-16,148.0,Christopher Nolan,8.8
Titanic,1997.0,Drama,USA,1997-12-19,195.0,James Cameron,7.8
Fake Movie,,Action,ERROR,2024-01-01,,Unknown,


title,year,genre,country,date_published,duration,director,avg_vote
Inception,2010,Sci-Fi,USA,2010-07-16,148,Christopher Nolan,8.8
Titanic,1997,Drama,USA,1997-12-19,195,James Cameron,7.8


title,year,genre,country,date_published,duration,director,avg_vote
Inception,2010.0,Sci-Fi,USA,2010-07-16,148.0,Christopher Nolan,8.8
Titanic,1997.0,Drama,USA,1997-12-19,195.0,James Cameron,7.8
Fake Movie,,Action,ERROR,2024-01-01,,Unknown,


avg_vote,country,date_published,director,duration,genre,title,year
8.8,USA,2010-07-16,Christopher Nolan,148.0,Sci-Fi,Inception,2010.0
7.8,USA,1997-12-19,James Cameron,195.0,Drama,Titanic,1997.0
,ERROR,2024-01-01,Unknown,,Action,Fake Movie,
