In [0]:
file_path = "dbfs:/FileStore/tables/ipl_2021_matches.csv"


In [0]:
raw_df = spark.read.option("header", True).csv(file_path)
raw_df.show(5)
raw_df.printSchema()


+----+-----------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|         series_name|match_no|match_type|          match_name|          match_href|match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-----------+--------------------+--------+----------+--------------------+--------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|        T20|Indian Premier Le...|    null|    League|MUMBAI INDIANS vs...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|        T20|Indian Premier Le...|    null|    League|CHENNAI SUPER KIN...|https://www.cricb...|       null|       null|                null|          null|       null|
|2021|        T20|Indian Premier Le...|    null|    League|SUNRISERS HYDERAB...|https://www.cricb...|       null|       null|     

In [0]:
string_schema = """
match_id STRING,
season STRING,
start_date STRING,
venue STRING,
innings STRING,
ball STRING,
batting_team STRING,
bowling_team STRING,
striker STRING,
non_striker STRING,
bowler STRING,
runs_off_bat INT,
extras INT,
wickets INT
"""
ipl_df_string_schema = spark.read \
    .option("header", True) \
    .schema(string_schema) \
    .csv(file_path)

ipl_df_string_schema.show(5)
ipl_df_string_schema.printSchema()


+--------+------+--------------------+-----+-------+--------------------+--------------------+------------+-------+-----------+------+------------+------+-------+
|match_id|season|          start_date|venue|innings|                ball|        batting_team|bowling_team|striker|non_striker|bowler|runs_off_bat|extras|wickets|
+--------+------+--------------------+-----+-------+--------------------+--------------------+------------+-------+-----------+------+------------+------+-------+
|    2021|   T20|Indian Premier Le...| null| League|MUMBAI INDIANS vs...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|Indian Premier Le...| null| League|CHENNAI SUPER KIN...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|Indian Premier Le...| null| League|SUNRISERS HYDERAB...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|India

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

struct_schema = StructType([
    StructField("match_id", StringType(), True),
    StructField("season", StringType(), True),
    StructField("start_date", StringType(), True),
    StructField("venue", StringType(), True),
    StructField("innings", StringType(), True),
    StructField("ball", StringType(), True),
    StructField("batting_team", StringType(), True),
    StructField("bowling_team", StringType(), True),
    StructField("striker", StringType(), True),
    StructField("non_striker", StringType(), True),
    StructField("bowler", StringType(), True),
    StructField("runs_off_bat", IntegerType(), True),
    StructField("extras", IntegerType(), True),
    StructField("wickets", IntegerType(), True)
])

ipl_df_struct_schema = spark.read \
    .option("header", True) \
    .schema(struct_schema) \
    .csv(file_path)

ipl_df_struct_schema.show(5)
ipl_df_struct_schema.printSchema()


+--------+------+--------------------+-----+-------+--------------------+--------------------+------------+-------+-----------+------+------------+------+-------+
|match_id|season|          start_date|venue|innings|                ball|        batting_team|bowling_team|striker|non_striker|bowler|runs_off_bat|extras|wickets|
+--------+------+--------------------+-----+-------+--------------------+--------------------+------------+-------+-----------+------+------------+------+-------+
|    2021|   T20|Indian Premier Le...| null| League|MUMBAI INDIANS vs...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|Indian Premier Le...| null| League|CHENNAI SUPER KIN...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|Indian Premier Le...| null| League|SUNRISERS HYDERAB...|https://www.cricb...|        null|   null|       null|  null|        null|  null|   null|
|    2021|   T20|India

In [0]:
ipl_df_struct_schema.toPandas()


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wickets
0,2021,T20,Indian Premier League 2021,,League,"MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU,...",https://www.cricbuzz.com/cricket-scores/35612/...,,,,,,,
1,2021,T20,Indian Premier League 2021,,League,"CHENNAI SUPER KINGS vs DELHI CAPITALS, 2nd Match",https://www.cricbuzz.com/cricket-scores/35617/...,,,,,,,
2,2021,T20,Indian Premier League 2021,,League,"SUNRISERS HYDERABAD vs KOLKATA KNIGHT RIDERS, ...",https://www.cricbuzz.com/cricket-scores/35618/...,,,,,,,
3,2021,T20,Indian Premier League 2021,,League,"RAJASTHAN ROYALS vs PUNJAB KINGS, 4th Match",https://www.cricbuzz.com/cricket-scores/35622/...,,,,,,,
4,2021,T20,Indian Premier League 2021,,League,"KOLKATA KNIGHT RIDERS vs MUMBAI INDIANS, 5th M...",https://www.cricbuzz.com/cricket-scores/35627/...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2021,T20,Indian Premier League 2021,,League,"ROYAL CHALLENGERS BENGALURU vs DELHI CAPITALS,...",https://www.cricbuzz.com/cricket-scores/37661/...,,,,,,,
58,2021,T20,Indian Premier League 2021,,League,"DELHI CAPITALS vs CHENNAI SUPER KINGS, Qualifi...",https://www.cricbuzz.com/cricket-scores/37666/...,,,,,,,
59,2021,T20,Indian Premier League 2021,,League,ROYAL CHALLENGERS BENGALURU vs KOLKATA KNIGHT ...,https://www.cricbuzz.com/cricket-scores/37671/...,,,,,,,
60,2021,T20,Indian Premier League 2021,,League,"DELHI CAPITALS vs KOLKATA KNIGHT RIDERS, Quali...",https://www.cricbuzz.com/cricket-scores/37672/...,,,,,,,


In [0]:
total_rows = ipl_df_struct_schema.count()
ipl_df_struct_schema.show(total_rows, truncate=False)


+--------+------+--------------------------+-----+-------+----------------------------------------------------------------+-----------------------------------------------------------------------------------------------+------------+-------+-----------+------+------------+------+-------+
|match_id|season|start_date                |venue|innings|ball                                                            |batting_team                                                                                   |bowling_team|striker|non_striker|bowler|runs_off_bat|extras|wickets|
+--------+------+--------------------------+-----+-------+----------------------------------------------------------------+-----------------------------------------------------------------------------------------------+------------+-------+-----------+------+------------+------+-------+
|2021    |T20   |Indian Premier League 2021|null |League |MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match        |https://www.c

In [0]:
from pyspark.sql.functions import col, isnan
ipl_df_struct_schema.select([col(c).isNull().alias(c) for c in ipl_df_struct_schema.columns]).show(5)
ipl_df_struct_schema.filter(col("match_id").isNull()).show()


+--------+------+----------+-----+-------+-----+------------+------------+-------+-----------+------+------------+------+-------+
|match_id|season|start_date|venue|innings| ball|batting_team|bowling_team|striker|non_striker|bowler|runs_off_bat|extras|wickets|
+--------+------+----------+-----+-------+-----+------------+------------+-------+-----------+------+------------+------+-------+
|   false| false|     false| true|  false|false|       false|        true|   true|       true|  true|        true|  true|   true|
|   false| false|     false| true|  false|false|       false|        true|   true|       true|  true|        true|  true|   true|
|   false| false|     false| true|  false|false|       false|        true|   true|       true|  true|        true|  true|   true|
|   false| false|     false| true|  false|false|       false|        true|   true|       true|  true|        true|  true|   true|
|   false| false|     false| true|  false|false|       false|        true|   true|       t

In [0]:
ipl_df_struct_schema.write.mode("overwrite").parquet("dbfs:/FileStore/ipl/cleaned_ipl_2021")
