Initialise The PySpark Environment

In [None]:
# Create and initialise a PySpark session using the PySpark API
import os
from pyspark.sql import SparkSession # type: ignore
from pyspark.sql.types import StringType,IntegerType,StructType,StructField # type: ignore
# Build the session used by every later cell. getOrCreate() hands back the
# already-running session when one exists, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("MyPySparkAutomatic")
    .getOrCreate()
)

# The session is ready: it can now read data, run transformations and write results.
print("SparkSession created successfully!")




Read The CSV Data(DataFrame: flight_df_1)

In [None]:
# Read the CSV file into a Spark DataFrame.
#   header=true      -> the first line of the file supplies the column names
#   inferSchema=true -> Spark makes an extra pass over the data to guess column types
#   mode=PERMISSIVE  -> malformed rows are kept (unparseable fields become null)
#                       instead of failing the whole read
file_path = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial\2015-summary.csv"
flight_df_1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .load(file_path)
)

# Count once and reuse the value: every count() call is a separate Spark job
# that reads the file again.
flight_row_count = flight_df_1.count()

# Show every row of the DataFrame without truncating long values
flight_df_1.show(n=flight_row_count, truncate=False)

# Show the total number of rows in the DataFrame
print("Total number of rows in the DataFrame:", flight_row_count)

# Print the schema of the DataFrame
flight_df_1.printSchema()

Create The Manual Schema(DataFrame: flight_df_2)

In [None]:

# File paths
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_1 = os.path.join(file_path_0, "2015-summary.csv")
file_path_2 = os.path.join(file_path_0, "bad_records")

# Manual schema for the DataFrame.
# The extra nullable string column "_corrupt_record" is where PERMISSIVE mode
# stores the raw text of any row that does not parse against this schema.
my_schema = StructType([
    StructField("COUNTRY_1", StringType(), True),
    StructField("COUNTRY_2", StringType(), True),
    StructField("TOTAL_COUNT", IntegerType(), True),
    StructField("_corrupt_record", StringType(), True),
])

# Read the CSV with the manual schema and capture bad records.
# - columnNameOfCorruptRecord is spelled out so the reader and the schema above
#   agree on the column name regardless of session-level defaults.
# - cache() keeps the parsed rows so the three actions below (count, show,
#   write) can reuse one parse of the file instead of re-reading it each time.
flight_df_2 = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(my_schema)
    .option("mode", "PERMISSIVE")
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .load(file_path_1)
    .cache()
)

# Show the whole DataFrame, including rows flagged as bad records.
# The row count is computed once and reused as the display limit.
flight_row_count_2 = flight_df_2.count()
flight_df_2.show(n=flight_row_count_2, truncate=False)

# Keep only the rows Spark could not parse and write them out as JSON
bad_df = flight_df_2.filter("`_corrupt_record` IS NOT NULL")
bad_df.write.mode("overwrite").json(file_path_2)

# List the contents of the bad records directory
# (`os` is already imported in the first cell, so it is not re-imported here)
print("Contents of the bad records directory:")
if os.path.exists(file_path_2):
    for item in os.listdir(file_path_2):
        print(item)
else:
    print("No bad records directory found.")



How to read a JSON file in PySpark (DataFrame: people_df)

In [None]:
# File paths
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_1 = os.path.join(file_path_0, "2015-summary.csv")
file_path_2 = os.path.join(file_path_0, "bad_records")
file_path_3 = os.path.join(file_path_0, "multiline.json")

# Read the JSON file into a Spark DataFrame.
#   multiLine=true  -> a single JSON record may span several lines
#   mode=PERMISSIVE -> malformed records are kept rather than failing the read
# "header" and "inferSchema" are CSV reader options that the JSON source does
# not use (it infers the schema itself when none is supplied), so they are not
# set here.
people_df = (
    spark.read.format("json")
    .option("multiLine", "true")
    .option("mode", "PERMISSIVE")
    .load(file_path_3)
)

# Count once and reuse the value: each count() call is a separate Spark job.
people_row_count = people_df.count()

# Show every row of the DataFrame without truncating long values
people_df.show(n=people_row_count, truncate=False)

# Show the total number of rows in the DataFrame (displayed as the cell result)
people_row_count



How to read a Parquet file in PySpark (DataFrame: parquet_df)

In [None]:
# File paths
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_4 = os.path.join(file_path_0, "part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet")

# Read the Parquet file into a Spark DataFrame.
# No header/schema options are passed: the reader takes the schema from the file.
parquet_df = spark.read.format("parquet").load(file_path_4)

# Count once and reuse the value: each count() call is a separate Spark job.
parquet_row_count = parquet_df.count()

# Show every row of the DataFrame without truncating long values
parquet_df.show(n=parquet_row_count, truncate=False)

# Show the total number of rows in the DataFrame
print("Total number of rows in the DataFrame:", parquet_row_count)


Finding Metadata Information In Parquet File

In [None]:
########################################################################################################################################################
import pandas as pd
import pyarrow.parquet as pq

# Path to your .parquet file (used by both the pandas read and the metadata
# inspection below, so the two can never point at different files)
file_path = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial\part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet"

try:
    # Read the whole Parquet file into a pandas DataFrame
    df = pd.read_parquet(file_path, engine="pyarrow")

    # ✅ Display all rows and columns (display limits are lifted only inside this block)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)

    # 🔽 Optionally, export to CSV for easier viewing
    #output_csv = file_path.replace(".parquet", ".csv")
    #df.to_csv(output_csv, index=False)
    #print(f"\n✅ Full data exported to: {output_csv}")

except Exception as e:
    # Best-effort display: report the problem and carry on to the metadata section
    print("❌ Error reading Parquet file:")
    print(e)


# Open the Parquet file to inspect its metadata
parquet_file = pq.ParquetFile(file_path)
parquet_metadata = parquet_file.metadata

print("File metadata:")
print(parquet_metadata)

# A Parquet file can legitimately contain zero row groups (e.g. an empty
# dataset); indexing row_group(0) on such a file would raise, so check first.
if parquet_metadata.num_row_groups > 0:
    first_row_group = parquet_metadata.row_group(0)

    print("\nFirst row group metadata:")
    print(first_row_group)

    if first_row_group.num_columns > 0:
        first_column = first_row_group.column(0)

        print("\nFirst column in first row group:")
        print(first_column)

        print("\nColumn statistics:")
        print(first_column.statistics)
    else:
        print("\nThe first row group has no columns.")
else:
    print("\nThe file contains no row groups.")



How to write a DataFrame to disk (DataFrame: parquet_df)

In [None]:
# File paths
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_4 = os.path.join(file_path_0, "part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet")

# Example: write the parquet_df DataFrame to disk in CSV format.
# The data is first redistributed into 2 partitions, then written with one
# sub-directory per DEST_COUNTRY_NAME value; an existing output directory is
# replaced because of mode("overwrite").
repartitioned_output_dir = os.path.join(file_path_0, "parquet_df_write_repartioned_2")
(
    parquet_df.repartition(2)
    .write.format("csv")
    .mode("overwrite")
    .partitionBy("DEST_COUNTRY_NAME")
    .save(repartitioned_output_dir)
)


Implementing Partitioning & Bucketing in Pyspark

In [None]:
# Write parquet_df as CSV, partitioned on disk by DEST_COUNTRY_NAME
# (one sub-directory per distinct value). Only partitioning is done in this
# cell; no bucketing is applied.
#
# The output directory name must not contain ":" - it is a reserved character
# in Windows file names, and these paths are Windows paths - so "_" is used
# as the separator instead.
# NOTE: file_path_0 and parquet_df come from earlier cells; run those first.
partitioned_output_dir = os.path.join(file_path_0, "parquet_df_write_partition_by_Example")
(
    parquet_df.write.format("csv")
    .mode("overwrite")
    .partitionBy("DEST_COUNTRY_NAME")
    .save(partitioned_output_dir)
)
