## 1.PySpark Connection part

In [1]:
#Connection part for Pyspark and importing required packages.
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import *

spark = SparkSession.builder.appName("Pyspark_VS_Pandas").getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.driver.memory', '4g'), ('spark.executor.memory', '4g'), ('spark.executor.num','6'), ('spark.network.timeout', '1000000')])

## 2.Reading data into Pyspark dataframe

In [2]:
#Creating Pyspark dataframe so that we apply SQL scripts for our practice
#    1.Read it into Pandas df
#    2.convert into pyspark df by defining datatypes of each columns
#    [we can use spark.read.format("") option, but it requires additional packages installation so skipped this way]


student_dfpd = pd.read_excel(r'Table_Source\Student_Placement_Table.xlsx')
schema = StructType([\
                     StructField("ID",IntegerType(),False),\
                     StructField("Name",StringType(),False),\
                     StructField("Gender",StringType(),False),\
                     StructField("DOB",DateType(),False),\
                     StructField("Location",StringType(),True),\
                     StructField("University",StringType(),False),\
                     StructField("Salary",DoubleType(),False),\
                     StructField("Company",StringType(),False)])

student_dfps = spark.createDataFrame(student_dfpd,schema)

## 3.PySpark DataFrame to TempView + Columns datatype

In [3]:
#A. Create pyspark dataframe into Temporary view for applying SQL scripts
student_dfps.createOrReplaceTempView("Student_Table")

#B. Checking the datatypes of columns
student_dfps.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- Name: string (nullable = false)
 |-- Gender: string (nullable = false)
 |-- DOB: date (nullable = false)
 |-- Location: string (nullable = true)
 |-- University: string (nullable = false)
 |-- Salary: double (nullable = false)
 |-- Company: string (nullable = false)



In [4]:
#print total count of records 
print("Total records of Student_Table = ",student_dfps.count())

#List all the records in table
sql_query = "SELECT * FROM Student_Table"
spark.sql(sql_query).show(30)

Total records of Student_Table =  24
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|102| BBB|     F|1995-09-20|     HYD|      IIIT|76000.2|   Amazon|
|103| CCC|     M|1992-12-31| Chennai|       NIT|49200.5|   Google|
|104| DDD|     F|1990-11-22|  Mumbai|       VIT|54980.6|    Apple|
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
|106| FFF|     M|1994-07-23|     HYD|      IIIT|63100.8|Microsoft|
|107| GGG|     F|1994-10-10|  Mumbai|       VIT|60200.7|   Amazon|
|108| HHH|     M|1990-10-10|Banglore|       NIT|89200.7|    Apple|
|109| III|     F|1994-12-21|     HYD|      IIIT|66980.8|   Google|
|110| JJJ|     M|1990-11-22| Chennai|       NIT|59250.2|   Amazon|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
|112| LLL|     F|1995-09-

# 😎

# 4.SQL Practice starts here

# @##############################################################@

## 4A.Select statement + Alias names + Limit + Count(*)

In [5]:
#Select statement used to select(Print) data 
#We can give perticular column names to print, or use * to print all columns
# Table name = Student_Table
#show(5) for limiting records tobe printed

print("4A1). Print only ID, NAME, GENDER columns")
sql_query="""SELECT ID, NAME, GENDER FROM Student_Table"""
spark.sql(sql_query).show(5)

4A1). Print only ID, NAME, GENDER columns
+---+----+------+
| ID|NAME|GENDER|
+---+----+------+
|101| AAA|     M|
|102| BBB|     F|
|103| CCC|     M|
|104| DDD|     F|
|105| EEE|     M|
+---+----+------+
only showing top 5 rows



In [6]:
#use * for printing all columns data to console, ##show(5) for limiting records tobe printed
print("4A2). Print all columns from table")
sql_query="""SELECT * FROM Student_Table"""
spark.sql(sql_query).show(5)

4A2). Print all columns from table
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|102| BBB|     F|1995-09-20|     HYD|      IIIT|76000.2|   Amazon|
|103| CCC|     M|1992-12-31| Chennai|       NIT|49200.5|   Google|
|104| DDD|     F|1990-11-22|  Mumbai|       VIT|54980.6|    Apple|
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
+---+----+------+----------+--------+----------+-------+---------+
only showing top 5 rows



In [7]:
#Renaming columns with more meaningful names
print("4A3). Alias name for ID, Name columns")
sql_query="""SELECT ID as ID_Number, Name as Name_of_Student FROM Student_Table"""
spark.sql(sql_query).show(5)

4A3). Alias name for ID, Name columns
+---------+---------------+
|ID_Number|Name_of_Student|
+---------+---------------+
|      101|            AAA|
|      102|            BBB|
|      103|            CCC|
|      104|            DDD|
|      105|            EEE|
+---------+---------------+
only showing top 5 rows



In [8]:
#Limiting number of records with LIMIT
print("4A4). Limiting number of records tobe printing on console with Limit by 4")
sql_query="""SELECT * FROM Student_Table LIMIT 4"""
spark.sql(sql_query).show()

4A4). Limiting number of records tobe printing on console with Limit by 4
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|102| BBB|     F|1995-09-20|     HYD|      IIIT|76000.2|   Amazon|
|103| CCC|     M|1992-12-31| Chennai|       NIT|49200.5|   Google|
|104| DDD|     F|1990-11-22|  Mumbai|       VIT|54980.6|    Apple|
+---+----+------+----------+--------+----------+-------+---------+



In [21]:
#Count(*) function used to return total number of records that are matching given criteria
    #if no filter given for count(*), it will print total records in given table

print("4A5). Print total records in given table")
sql_query="""SELECT count(*)as Total_Count FROM Student_Table"""
spark.sql(sql_query).show()

4A5). Print total records in given table
+-----------+
|Total_Count|
+-----------+
|         24|
+-----------+



In [26]:
#Selectng some random text using select statement

print("4A6). Print some sample text using select statement")
sql_query="""SELECT 'Hello I am SQL' as Column_Name """
spark.sql(sql_query).show()

4A6). Print some sample text using select statement
+--------------+
|   Column_Name|
+--------------+
|Hello I am SQL|
+--------------+



## 4B.Distinct statement

In [10]:
#Distinct statement used for listing only distinct(different) values in column or list of columns
print("4B1). Without Distinct statement, it will lsit all reocrds in that column(s)")
sql_query="SELECT Location FROM Student_Table"
spark.sql(sql_query).show(30)

print("4B2). With Distinct statement, it will lsit only distinct reocrds in that column(s)")
sql_query="SELECT distinct Location FROM Student_Table"
spark.sql(sql_query).show(30)

4B1). Without Distinct statement, it will lsit all reocrds in that column(s)
+--------+
|Location|
+--------+
|Banglore|
|     HYD|
| Chennai|
|  Mumbai|
| Chennai|
|     HYD|
|  Mumbai|
|Banglore|
|     HYD|
| Chennai|
|Banglore|
|  Mumbai|
|     HYD|
|Banglore|
|Banglore|
| Chennai|
|Banglore|
|  Mumbai|
|     HYD|
| Chennai|
| Chennai|
|Banglore|
|  Mumbai|
|     HYD|
+--------+

4B2). With Distinct statement, it will lsit only distinct reocrds in that column(s)
+--------+
|Location|
+--------+
| Chennai|
|  Mumbai|
|     HYD|
|Banglore|
+--------+



## 4C.WHERE clause + BETWEEN + LIKE + IN + AND + OR + IS NULL

In [11]:
#Where clause used to filter reocrds based on given condition on columns
#Below query will filter records from location Banglore

print("4C1). Print records only from Banglore location")
sql_query="SELECT * FROM Student_Table WHERE Location = 'Banglore'"
spark.sql(sql_query).show(30)

4C1). Print records only from Banglore location
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|108| HHH|     M|1990-10-10|Banglore|       NIT|89200.7|    Apple|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
|114| NNN|     F|1990-11-29|Banglore|      IIIT|59200.5|   Amazon|
|115| OOO|     M|1995-09-20|Banglore|      IISC|57120.5|   Google|
|117| QQQ|     M|1991-02-10|Banglore|      IISC|60200.7|Microsoft|
|122| VVV|     M|1993-08-19|Banglore|      IISC|57120.5|Microsoft|
+---+----+------+----------+--------+----------+-------+---------+



In [12]:
print("4C2). Print records only ID range from 105 to 109 Inclusive")
sql_query="""SELECT * FROM Student_Table WHERE ID BETWEEN 105 AND 109"""
spark.sql(sql_query).show(30)

4C2). Print records only ID range from 105 to 109 Inclusive
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
|106| FFF|     M|1994-07-23|     HYD|      IIIT|63100.8|Microsoft|
|107| GGG|     F|1994-10-10|  Mumbai|       VIT|60200.7|   Amazon|
|108| HHH|     M|1990-10-10|Banglore|       NIT|89200.7|    Apple|
|109| III|     F|1994-12-21|     HYD|      IIIT|66980.8|   Google|
+---+----+------+----------+--------+----------+-------+---------+



In [13]:
print("4C3). Print records only Company value contains soft")
sql_query="""SELECT * FROM Student_Table WHERE COMPANY LIKE "%soft%" """
spark.sql(sql_query).show(30)

4C3). Print records only Company value contains soft
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
|106| FFF|     M|1994-07-23|     HYD|      IIIT|63100.8|Microsoft|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
|117| QQQ|     M|1991-02-10|Banglore|      IISC|60200.7|Microsoft|
|119| SSS|     M|1992-10-25|     HYD|       VIT|62900.5|Microsoft|
|122| VVV|     M|1993-08-19|Banglore|      IISC|57120.5|Microsoft|
|124| XXX|     M|1991-12-19|     HYD|       IIT|60200.7|Microsoft|
+---+----+------+----------+--------+----------+-------+---------+



In [14]:
print("4C4). Print records only Name in given list(AAA, GGG, KKK)")
sql_query="""SELECT * FROM Student_Table WHERE NAME IN ('AAA', 'GGG', 'KKK') """
spark.sql(sql_query).show(30)

4C4). Print records only Name in given list(AAA, GGG, KKK)
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|107| GGG|     F|1994-10-10|  Mumbai|       VIT|60200.7|   Amazon|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
+---+----+------+----------+--------+----------+-------+---------+



In [15]:
#And should satisfy all conditions
print("4C5). Print records from Banglore location and Microsoft company")
sql_query="""SELECT * FROM Student_Table WHERE (LOCATION ='Banglore' AND COMPANY ='Microsoft') """
spark.sql(sql_query).show(30)

4C5). Print records from Banglore location and Microsoft company
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
|117| QQQ|     M|1991-02-10|Banglore|      IISC|60200.7|Microsoft|
|122| VVV|     M|1993-08-19|Banglore|      IISC|57120.5|Microsoft|
+---+----+------+----------+--------+----------+-------+---------+



In [16]:
#OR should satisfy any one conditions, Either of the condition will meet the output
print("4C6). Print records from Banglore location or Microsoft company")
sql_query="""SELECT * FROM Student_Table WHERE (LOCATION ='Banglore' OR COMPANY ='Microsoft') """
spark.sql(sql_query).show(30)

4C6). Print records from Banglore location or Microsoft company
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
|106| FFF|     M|1994-07-23|     HYD|      IIIT|63100.8|Microsoft|
|108| HHH|     M|1990-10-10|Banglore|       NIT|89200.7|    Apple|
|111| KKK|     M|1994-10-10|Banglore|      IISC|76300.9|Microsoft|
|114| NNN|     F|1990-11-29|Banglore|      IIIT|59200.5|   Amazon|
|115| OOO|     M|1995-09-20|Banglore|      IISC|57120.5|   Google|
|117| QQQ|     M|1991-02-10|Banglore|      IISC|60200.7|Microsoft|
|119| SSS|     M|1992-10-25|     HYD|       VIT|62900.5|Microsoft|
|122| VVV|     M|1993-08-19|Banglore|      IISC|57120.5|Microsoft|
|124| XXX|     M|1991-12-19|     HYD|       IIT|60200.7|Microsoft

In [17]:
#Is Null will filter given column having Null values, #Is Not Null will print having proper value
# Null is nothing but, missing value in any column(Except Primay key column), it will represent with some meaningful value
print("4C7). Print records where University is not null, Here all records will be printed because all rows are having Proper values in University column")
sql_query="""SELECT * FROM Student_Table WHERE University IS NOT NULL"""
spark.sql(sql_query).show(30)

4C7). Print records where University is not null, Here all records will be printed because all rows are having Proper values in University column
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|102| BBB|     F|1995-09-20|     HYD|      IIIT|76000.2|   Amazon|
|103| CCC|     M|1992-12-31| Chennai|       NIT|49200.5|   Google|
|104| DDD|     F|1990-11-22|  Mumbai|       VIT|54980.6|    Apple|
|105| EEE|     M|1993-05-19| Chennai|       IIT|60200.7|Microsoft|
|106| FFF|     M|1994-07-23|     HYD|      IIIT|63100.8|Microsoft|
|107| GGG|     F|1994-10-10|  Mumbai|       VIT|60200.7|   Amazon|
|108| HHH|     M|1990-10-10|Banglore|       NIT|89200.7|    Apple|
|109| III|     F|1994-12-21|     HYD|      IIIT|66980.8|   Google|
|110| JJJ|     M|1990-11-22| Chennai|       NIT|59

# 4D.Order By

In [18]:
#Bydefault it is Accending order, for descending order we have to use DESC keyword
    #Number: 0 to n bydefault, DESC: n to 0
    #Alphabets: A to Z bydefault, DESC: Z to A
print("4D1). Sort by Salary Accending order top 5 records")
sql_query="""SELECT * FROM Student_Table ORDER BY Salary LIMIT 5"""
spark.sql(sql_query).show()


4D1). Sort by Salary Accending order top 5 records
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|103| CCC|     M|1992-12-31| Chennai|       NIT|49200.5|   Google|
|118| RRR|     F|1993-11-10|  Mumbai|       NIT|52900.5|   Google|
|104| DDD|     F|1990-11-22|  Mumbai|       VIT|54980.6|    Apple|
|101| AAA|     M|1994-10-10|Banglore|      IISC|55000.5|Microsoft|
|115| OOO|     M|1995-09-20|Banglore|      IISC|57120.5|   Google|
+---+----+------+----------+--------+----------+-------+---------+



In [19]:
print("4D2). Sort by Name Descending order top 5 records")
sql_query="""SELECT * FROM Student_Table ORDER BY Name DESC LIMIT 5"""
spark.sql(sql_query).show()

4D2). Sort by Name Descending order top 5 records
+---+----+------+----------+--------+----------+-------+---------+
| ID|Name|Gender|       DOB|Location|University| Salary|  Company|
+---+----+------+----------+--------+----------+-------+---------+
|124| XXX|     M|1991-12-19|     HYD|       IIT|60200.7|Microsoft|
|123| WWW|     F|1994-09-14|  Mumbai|       VIT|59050.5|    Apple|
|122| VVV|     M|1993-08-19|Banglore|      IISC|57120.5|Microsoft|
|121| UUU|     F|1990-11-13| Chennai|       NIT|59250.2|   Google|
|120| TTT|     M|1995-09-29| Chennai|      IIIT|57230.5|    Apple|
+---+----+------+----------+--------+----------+-------+---------+



# 4E. Upper() + Lower() + Length()

In [22]:
#Upper()-->Convert given column data into upper case data
#Lower()-->Convert given column data into lower case data
#Length()-->It will print total characters in columns data including spaces

print("4E1). Apply Upper(), Lower(), Length() functions to columns")
sql_query="""SELECT DISTINCT COMPANY,UPPER(COMPANY), LOWER(COMPANY), LENGTH(COMPANY) FROM Student_Table"""
spark.sql(sql_query).show()

4E1). Apply Upper(), Lower(), Length() functions to columns
+---------+--------------+--------------+---------------+
|  COMPANY|upper(COMPANY)|lower(COMPANY)|length(COMPANY)|
+---------+--------------+--------------+---------------+
|Microsoft|     MICROSOFT|     microsoft|              9|
|    Apple|         APPLE|         apple|              5|
|   Amazon|        AMAZON|        amazon|              6|
|   Google|        GOOGLE|        google|              6|
+---------+--------------+--------------+---------------+



# 4F. Concatination(||) + BooleanExpression + TRIM()

In [35]:
#Concatination using ||
    #This will help to club mutiple columns and some text into single column

print("4F1). Concatination using || symbol")
sql_query="""SELECT 'I am ' || Name || ' from ' || University as SelfIntro FROM Student_Table LIMIT 10"""
spark.sql(sql_query).show()

4F1). Concatination using || symbol
+------------------+
|         SelfIntro|
+------------------+
|I am AAA from IISC|
|I am BBB from IIIT|
| I am CCC from NIT|
| I am DDD from VIT|
| I am EEE from IIT|
|I am FFF from IIIT|
| I am GGG from VIT|
| I am HHH from NIT|
|I am III from IIIT|
| I am JJJ from NIT|
+------------------+



In [49]:
#Boolean Expression with some condition:
    #THis will print True or False values 

print("4F2). Boolean Expression with some condition")
sql_query="""SELECT ID, NAME, SALARY, (Salary > 60000) As IsSalaryGraterThan60K FROM Student_Table LIMIT 10"""
spark.sql(sql_query).show()

4F2). Boolean Expression with some condition
+---+----+-------+---------------------+
| ID|NAME| SALARY|IsSalaryGraterThan60K|
+---+----+-------+---------------------+
|101| AAA|55000.5|                false|
|102| BBB|76000.2|                 true|
|103| CCC|49200.5|                false|
|104| DDD|54980.6|                false|
|105| EEE|60200.7|                 true|
|106| FFF|63100.8|                 true|
|107| GGG|60200.7|                 true|
|108| HHH|89200.7|                 true|
|109| III|66980.8|                 true|
|110| JJJ|59250.2|                false|
+---+----+-------+---------------------+



In [57]:
#Trim() function used to remove extra spaces in column's data

print("4F3). Trim() function used to remove extra spaces in column's data")

sql_query="""SELECT 
'   Google    ' AS ExtraSpaces, LENGTH('   Google    ') AS Len_ExtraSpaces,
TRIM('   Google    ') AS TrimApplied, LENGTH(TRIM('   Google    ')) AS Len_TrimApplied
"""
spark.sql(sql_query).show()

4F3). Trim() function used to remove extra spaces in column's data
+-------------+---------------+-----------+---------------+
|  ExtraSpaces|Len_ExtraSpaces|TrimApplied|Len_TrimApplied|
+-------------+---------------+-----------+---------------+
|   Google    |             13|     Google|              6|
+-------------+---------------+-----------+---------------+



In [None]:
print("4).")
sql_query="""SELECT * FROM Student_Table"""
spark.sql(sql_query).show(5)

# Ruff

In [3]:
Student_df = spark.read.format("com.crealytics.spark.excel").option("useHeader","true").option("inferSchema","false").load("Student_Placement_Table.xlsx")

Py4JJavaError: An error occurred while calling o44.load.
: java.lang.ClassNotFoundException: Failed to find data source: com.crealytics.spark.excel. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:657)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:194)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.crealytics.spark.excel.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
	... 13 more


In [38]:
#python dict
dict1 = {"Name": ["A","B","C","D","E","F","G","H","I"],\
         "Weight":[70,61,83,60,92,69,84,71,77],\
         "Address":["HYD","Banglore","Chennai","Mumbai","Banglore","Mumbai","Chennai","Banglore","HYD"],\
         "DOB":["15-01-1990", "19-01-1996", "28-02-1999", "13-06-1989", "15-11-2000", "10-12-1995", "25-11-1998", "15-09-1994", "15-01-1996"],\
         "Batch":[2016, 2017, 2018, 2016, 2016, 2017, 2016, 2018, 2017],\
         "Salary":[51000.00, 46500.50, 52000.00, 51000.00, 52000.00, 75000.60, 64000.50, 52000.00, 46500.50]         
        }

#create pandas df
dfpd = pd.DataFrame(dict1)
dfpd_dtype = {"Name":'str', "Weight":'int64', "Address":'str', "DOB":'datetime64', "Batch":'int64', "Salary":'float64' }
dfpd = dfpd.astype(dfpd_dtype)

In [39]:
#Schema for spark dataframe with Structtype, fields
schema = StructType([\
                     StructField("Name", StringType(), True),\
                     StructField("Weight", IntegerType(), True),\
                     StructField("Address", StringType(), True),\
                     StructField("DOB", DateType(), True),\
                     StructField("Batch", IntegerType(), True),\
                     StructField("Salary", DoubleType(), True)])

#create spark DF by passing pandas df with above schema
dfps = spark.createDataFrame(dfpd, schema)

In [40]:
dfps.show()

+----+------+--------+----------+-----+-------+
|Name|Weight| Address|       DOB|Batch| Salary|
+----+------+--------+----------+-----+-------+
|   A|    70|     HYD|1990-01-15| 2016|51000.0|
|   B|    61|Banglore|1996-01-19| 2017|46500.5|
|   C|    83| Chennai|1999-02-28| 2018|52000.0|
|   D|    60|  Mumbai|1989-06-13| 2016|51000.0|
|   E|    92|Banglore|2000-11-15| 2016|52000.0|
|   F|    69|  Mumbai|1995-10-12| 2017|75000.6|
|   G|    84| Chennai|1998-11-25| 2016|64000.5|
|   H|    71|Banglore|1994-09-15| 2018|52000.0|
|   I|    77|     HYD|1996-01-15| 2017|46500.5|
+----+------+--------+----------+-----+-------+

