#### Agenda: To work with multiple file formats and save them into py dataframe ready for consumption

In [21]:
from pyspark.sql import SparkSession
import pandas as pd
import pyspark.pandas as ps

In [3]:
# Spark session
try:
    spark = SparkSession.builder.master("local").appName("PysparkLearning").getOrCreate()
    if spark is not None:
        print("Spark session created successfully")
    else:
        print("Please check your spark session variable")
except Exception as e:
    print("Exception occured during spark session creation with error :"+str(e))

Spark session created successfully


#### Reading csv file

In [15]:
csvFileName = "D:/Pyspark_latest/Testing with multiple file formats/csv/Titanic-Dataset.csv"

#make sure you give the header = True else spark will create default columns
csv_df = spark.read.csv(csvFileName,inferSchema = True ,header=True, sep=",")

In [16]:
csv_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

#### Reading excel file
Spark doesn't have native excel reader , we need to use a seperate maven dependency to read excel files using pandas and convert to spark df

In [29]:
excelFilePath = "D:/Pyspark_latest/Testing with multiple file formats/csv/Titanic-Dataset.xlsx"
excel_df = pd.read_excel(excelFilePath)

# converting to spark df
excel_sp_df = spark.createDataFrame(excel_df)

In [31]:
excel_sp_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|  NaN|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|  NaN|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05|  NaN|       S|
|          6|       0|     3|    Moran, Mr. James|  male| NaN|    0|    0|      

#### Reading parquet file
#### Reference : https://www.databricks.com/glossary/what-is-parquet

In [32]:
pq_filePath = r"D:\Pyspark_latest\Testing with multiple file formats\parquet\titanic.parquet"
pq_df = spark.read.parquet(pq_filePath)

In [35]:
pq_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      