# <center> spark-basic </center>

## What can Spark do
* **SQL-like operation. Traditional data analytics.**
* Machine learning. Advanced data analytics.
* Distributed / parallel computing that can handle big data

## The following capabilities are demonstrated in this section
* create dataframe from python data structure
* read/write csv data file
* select & filter
* add and drop columns
* join (all types)
* groupby
* fill null/missing value
* data schema
* change column name

In [1]:
import os
import findspark

# Locate the Spark installation and make PySpark importable.
# .get() is used so that an unset SPARK_HOME does not raise KeyError here;
# when the value is None, findspark.init() searches for Spark on its own.
findspark.init(os.environ.get('SPARK_HOME'))

In [2]:
from pyspark.sql import SparkSession

## Create a spark session

The `SparkSession` is the entry point for all Spark operations.

In [3]:
# Build the session named 'spark', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()
)

## Create DataFrames from Python lists of tuples

In [9]:
def _frame_with_ids(column, values):
    """Build a two-column DataFrame pairing each value with its position.

    Row i holds (i, values[i]); the columns are named "id" and `column`.
    """
    return spark.createDataFrame(list(enumerate(values)), ["id", column])


# Student names; the list position is the student id (0-5).
name = _frame_with_ids("name", ["Alex", "Bob", "Cherry", "Dan", "Ethan", "Flynn"])

# One score table per subject, again indexed by student id.
math = _frame_with_ids("math", [95, 98, 73, 54, 68, 98])

english = _frame_with_ids("english", [90, 80, 85, 68, 65, 97])

chinese = _frame_with_ids("chinese", [79, 89, 86, 57, 86, 99])

physics = _frame_with_ids("physics", [86, 95, 88, 96, 68, 96])

chemistry = _frame_with_ids("chemistry", [67, 71, 85, 68, 95, 95])

history = _frame_with_ids("history", [73, 80, 91, 57, 78, 99])

## Basic Spark operations

In [10]:
# show() without an argument prints the first 20 rows as a text table
name.show()

+---+------+
| id|  name|
+---+------+
|  0|  Alex|
|  1|   Bob|
|  2|Cherry|
|  3|   Dan|
|  4| Ethan|
|  5| Flynn|
+---+------+



In [12]:
# Print only the first 4 rows
english.show(4)

+---+-------+
| id|english|
+---+-------+
|  0|     90|
|  1|     80|
|  2|     85|
|  3|     68|
+---+-------+
only showing top 4 rows



In [11]:
# Print only the first 2 rows
math.show(2)

+---+----+
| id|math|
+---+----+
|  0|  95|
|  1|  98|
+---+----+
only showing top 2 rows



In [14]:
# Left join: keep every student in `name` and attach the math score by id.
result = name.join(math, on='id', how='left')
result.show()

+---+------+----+
| id|  name|math|
+---+------+----+
|  0|  Alex|  95|
|  1|   Bob|  98|
|  2|Cherry|  73|
|  3|   Dan|  54|
|  5| Flynn|  98|
|  4| Ethan|  68|
+---+------+----+



In [15]:
# Build the full score table from scratch so this cell can be re-run safely.
# Starting from the previous `result` would join the same subject columns a
# second time on re-execution and produce duplicate column names.
result = name
for subject_scores in (math, english, chinese, physics, chemistry, history):
    # Left join on the student id keeps every student from `name`.
    result = result.join(subject_scores, on='id', how='left')

In [16]:
# Display the table with all six subject columns joined in
result.show()

+---+------+----+-------+-------+-------+---------+-------+
| id|  name|math|english|chinese|physics|chemistry|history|
+---+------+----+-------+-------+-------+---------+-------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|
|  1|   Bob|  98|     80|     89|     95|       71|     80|
|  2|Cherry|  73|     85|     86|     88|       85|     91|
|  3|   Dan|  54|     68|     57|     96|       68|     57|
|  5| Flynn|  98|     97|     99|     96|       95|     99|
|  4| Ethan|  68|     65|     86|     68|       95|     78|
+---+------+----+-------+-------+-------+---------+-------+



<hr/>

## Joins

![join_types](./pics/join_types.png)

In [17]:
# Biology scores exist only for student ids 0, 1, 3 and 5.
biology = spark.createDataFrame(
    [(0, 89), (1, 87), (3, 88), (5, 95)],
    schema=['id', 'biology'],
)

In [18]:
# Display the biology table
biology.show()

+---+-------+
| id|biology|
+---+-------+
|  0|     89|
|  1|     87|
|  3|     88|
|  5|     95|
+---+-------+



In [19]:
# French scores exist only for student ids 0, 2 and 5.
french = spark.createDataFrame(
    [(0, 75), (2, 86), (5, 99)],
    schema=['id', 'french'],
)

In [20]:
# Display the french table
french.show()

+---+------+
| id|french|
+---+------+
|  0|    75|
|  2|    86|
|  5|    99|
+---+------+



### inner

In [21]:
# Inner join: keep only the ids that appear in both tables
biology.join(french, on='id', how='inner').show()

+---+-------+------+
| id|biology|french|
+---+-------+------+
|  0|     89|    75|
|  5|     95|    99|
+---+-------+------+



### outer, full, fullouter, full_outer

In [22]:
# Full outer join: keep ids from either table; a missing score becomes null
biology.join(french, on='id', how='outer').show()
# Alternative spellings of the same join type:
#biology.join(french, on='id', how='full').show()
#biology.join(french, on='id', how='fullouter').show()
#biology.join(french, on='id', how='full_outer').show()

+---+-------+------+
| id|biology|french|
+---+-------+------+
|  0|     89|    75|
|  1|     87|  null|
|  2|   null|    86|
|  3|     88|  null|
|  5|     95|    99|
+---+-------+------+



### left, leftouter, left_outer

In [23]:
# Left outer join: keep every biology row; french is null where no id matches
biology.join(french, on='id', how='left').show()
# Alternative spellings of the same join type:
#biology.join(french, on='id', how='leftouter').show()
#biology.join(french, on='id', how='left_outer').show()

+---+-------+------+
| id|biology|french|
+---+-------+------+
|  0|     89|    75|
|  1|     87|  null|
|  3|     88|  null|
|  5|     95|    99|
+---+-------+------+



### right, rightouter, right_outer

In [24]:
# Right outer join: keep every french row; biology is null where no id matches
biology.join(french, on='id', how='right').show()
# Alternative spellings of the same join type:
#biology.join(french, on='id', how='rightouter').show()
#biology.join(french, on='id', how='right_outer').show()

+---+-------+------+
| id|biology|french|
+---+-------+------+
|  0|     89|    75|
|  2|   null|    86|
|  5|     95|    99|
+---+-------+------+



### leftanti, left_anti

In [25]:
# Left anti join: keep left-table rows whose id is NOT in the right table;
# only the left table's columns are returned
biology.join(french, on='id', how='leftanti').show()
# Alternative spelling of the same join type:
#biology.join(french, on='id', how='left_anti').show()

+---+-------+
| id|biology|
+---+-------+
|  1|     87|
|  3|     88|
+---+-------+



### leftsemi, left_semi

In [26]:
# Left semi join: keep left-table rows whose id IS in the right table;
# only the left table's columns are returned
biology.join(french, on='id', how='leftsemi').show()
# Alternative spelling of the same join type:
#biology.join(french, on='id', how='left_semi').show()

+---+-------+
| id|biology|
+---+-------+
|  0|     89|
|  5|     95|
+---+-------+



### End of the join section: continuing with the `result` DataFrame

In [28]:
result = result.cache()  # ask Spark to keep the joined DataFrame in memory for the cells below

In [29]:
# Number of rows in the joined table (one per student)
result.count()

6

In [30]:
# Display the cached table; row order is not guaranteed to match earlier output
result.show()

+---+------+----+-------+-------+-------+---------+-------+
| id|  name|math|english|chinese|physics|chemistry|history|
+---+------+----+-------+-------+-------+---------+-------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|
|  5| Flynn|  98|     97|     99|     96|       95|     99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|
|  3|   Dan|  54|     68|     57|     96|       68|     57|
|  2|Cherry|  73|     85|     86|     88|       85|     91|
|  4| Ethan|  68|     65|     86|     68|       95|     78|
+---+------+----+-------+-------+-------+---------+-------+



In [31]:
# Show summary statistics (count, mean, stddev, min, max) for every column
result.describe().show()

+-------+------------------+-----+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|                id| name|              math|           english|           chinese|           physics|         chemistry|           history|
+-------+------------------+-----+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|                 6|    6|                 6|                 6|                 6|                 6|                 6|                 6|
|   mean|               2.5| null|              81.0| 80.83333333333333| 82.66666666666667| 88.16666666666667| 80.16666666666667| 79.66666666666667|
| stddev|1.8708286933869707| null|18.633303518163384|12.480651692386365|14.151560573543353|10.778064142816497|13.182058514005567|14.583095236151571|
|    min|                 0| Alex|                54|                65|                57|               

In [32]:
# Show the same summary statistics, restricted to the listed columns
result.describe(['math', 'english', 'chinese']).show()

+-------+------------------+------------------+------------------+
|summary|              math|           english|           chinese|
+-------+------------------+------------------+------------------+
|  count|                 6|                 6|                 6|
|   mean|              81.0| 80.83333333333333| 82.66666666666667|
| stddev|18.633303518163384|12.480651692386365|14.151560573543353|
|    min|                54|                65|                57|
|    max|                98|                97|                99|
+-------+------------------+------------------+------------------+



In [34]:
# describe() returns an ordinary DataFrame, so its columns can be selected.
summary = result.describe()
summary.select('summary', 'id', 'name', 'math').show()

+-------+------------------+-----+------------------+
|summary|                id| name|              math|
+-------+------------------+-----+------------------+
|  count|                 6|    6|                 6|
|   mean|               2.5| null|              81.0|
| stddev|1.8708286933869707| null|18.633303518163384|
|    min|                 0| Alex|                54|
|    max|                 5|Flynn|                98|
+-------+------------------+-----+------------------+



In [35]:
# Re-create the biology table (ids 0, 1, 3, 5 only) for the joins below
biology = spark.createDataFrame(
    [(0, 89), (1, 87), (3, 88), (5, 95)],
    ["id", "biology"]
)
biology.show()

+---+-------+
| id|biology|
+---+-------+
|  0|     89|
|  1|     87|
|  3|     88|
|  5|     95|
+---+-------+



In [36]:
# Display the six-student table that biology will be joined onto
result.show()

+---+------+----+-------+-------+-------+---------+-------+
| id|  name|math|english|chinese|physics|chemistry|history|
+---+------+----+-------+-------+-------+---------+-------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|
|  5| Flynn|  98|     97|     99|     96|       95|     99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|
|  3|   Dan|  54|     68|     57|     96|       68|     57|
|  2|Cherry|  73|     85|     86|     88|       85|     91|
|  4| Ethan|  68|     65|     86|     68|       95|     78|
+---+------+----+-------+-------+-------+---------+-------+



In [37]:
# Inner join drops the students who have no biology score.
result_with_biology = result.join(biology, on='id', how='inner')
result_with_biology.show()

+---+-----+----+-------+-------+-------+---------+-------+-------+
| id| name|math|english|chinese|physics|chemistry|history|biology|
+---+-----+----+-------+-------+-------+---------+-------+-------+
|  0| Alex|  95|     90|     79|     86|       67|     73|     89|
|  1|  Bob|  98|     80|     89|     95|       71|     80|     87|
|  3|  Dan|  54|     68|     57|     96|       68|     57|     88|
|  5|Flynn|  98|     97|     99|     96|       95|     99|     95|
+---+-----+----+-------+-------+-------+---------+-------+-------+



In [38]:
# Left join keeps all students; a missing biology score shows up as null.
result_with_biology2 = result.join(biology, on='id', how='left')
result_with_biology2.show()

+---+------+----+-------+-------+-------+---------+-------+-------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|
+---+------+----+-------+-------+-------+---------+-------+-------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|
|  2|Cherry|  73|     85|     86|     88|       85|     91|   null|
|  4| Ethan|  68|     65|     86|     68|       95|     78|   null|
+---+------+----+-------+-------+-------+---------+-------+-------+



In [39]:
# Re-create the french table (ids 0, 2, 5 only) for the join below
french = spark.createDataFrame(
    [(0, 75), (2, 86), (5, 99)],
    ["id", "french"]
)
french.show()

+---+------+
| id|french|
+---+------+
|  0|    75|
|  2|    86|
|  5|    99|
+---+------+



In [40]:
# Left join french as well; result3 now has nulls in both biology and french.
result3 = result_with_biology2.join(french, on='id', how='left')
result3.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|
+---+------+----+-------+-------+-------+---------+-------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|  null|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|  null|
|  2|Cherry|  73|     85|     86|     88|       85|     91|   null|    86|
|  4| Ethan|  68|     65|     86|     68|       95|     78|   null|  null|
+---+------+----+-------+-------+-------+---------+-------+-------+------+



In [44]:
# Replace the null scores with 0 (here: in biology and french)
result4 = result3.na.fill(0)
# Alternative spelling: result4 = result3.fillna(0)
result4.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|
+---+------+----+-------+-------+-------+---------+-------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|
+---+------+----+-------+-------+-------+---------+-------+-------+------+



In [48]:
# Replace nulls with 0 only in the listed columns; french keeps its nulls
result4 = result3.na.fill(0, subset=['biology'])
# Alternative spelling: result4 = result3.fillna(0, subset=['biology'])
result4.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|
+---+------+----+-------+-------+-------+---------+-------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|  null|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|  null|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|  null|
+---+------+----+-------+-------+-------+---------+-------+-------+------+



In [51]:
# Drop every row that contains at least one null
result4 = result3.na.drop(how='any')
# Alternative spelling: result4 = result3.dropna(how='any')
result4.show()

+---+-----+----+-------+-------+-------+---------+-------+-------+------+
| id| name|math|english|chinese|physics|chemistry|history|biology|french|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+
|  0| Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5|Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+



In [52]:
# Drop rows whose biology value is null; nulls in other columns are kept
result4 = result3.na.drop(subset=['biology'])
# Alternative spelling: result4 = result3.dropna(subset=['biology'])
result4.show()

+---+-----+----+-------+-------+-------+---------+-------+-------+------+
| id| name|math|english|chinese|physics|chemistry|history|biology|french|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+
|  0| Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  1|  Bob|  98|     80|     89|     95|       71|     80|     87|  null|
|  3|  Dan|  54|     68|     57|     96|       68|     57|     88|  null|
|  5|Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+



In [53]:
# Drop a row only when BOTH biology and french are null
result4 = result3.na.drop(how='all', subset=['biology', 'french'])
# Alternative spelling: result4 = result3.dropna(how='all', subset=['biology', 'french'])
result4.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|
+---+------+----+-------+-------+-------+---------+-------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|  null|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|  null|
|  2|Cherry|  73|     85|     86|     88|       85|     91|   null|    86|
+---+------+----+-------+-------+-------+---------+-------+-------+------+



In [54]:
import pyspark.sql.functions as F

In [58]:
# Treat every missing score as 0 before averaging.
# NOTE: this lowers the average of students who have no biology/french score.
result5 = result3.na.fill(0)

In [59]:
# Columns that contribute to the per-student average. The divisor below is
# derived from this list so the two can never get out of sync.
subject_columns = [
    'math', 'english', 'chinese', 'physics',
    'chemistry', 'history', 'biology', 'french',
]

# Python's built-in sum() adds the Column expressions together, producing a
# single Column that holds each row's total score.
total_score = sum(F.col(subject) for subject in subject_columns)

# Add (or overwrite) the 'average' column: total score / number of subjects.
result5 = result5.withColumn('average', total_score / len(subject_columns))

In [60]:
# Display the table with the new 'average' column
result5.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|average|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|  81.75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|   75.0|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|  74.25|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|   57.5|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+



In [75]:
# Map the average onto a letter grade. Conditions are checked top to bottom
# and the first match wins: >= 90 -> A, >= 80 -> B, >= 60 -> C, else D.
result6 = result5.withColumn(
    'rating',
    F.when(F.col('average') >= 90, 'A')
    .when(F.col('average') >= 80, 'B')
    .when(F.col('average') >= 60, 'C')
    .otherwise('D'),
)

In [87]:
# Display the table with the new 'rating' column
result6.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|  81.75|     B|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|     A|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|   75.0|     C|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|     C|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|  74.25|     C|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|   57.5|     D|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+



In [88]:
# Count how many students received each rating
result6.groupBy('rating').count().show()

+------+-----+
|rating|count|
+------+-----+
|     B|    1|
|     A|    1|
|     C|    3|
|     D|    1|
+------+-----+



In [89]:
# Keep the per-rating counts (columns: rating, count) for the pass-rate cell
group_by_result = result6.groupBy('rating').count()

In [101]:
# Pass rate = students rated anything other than 'D' / all students.
# Both totals come from one aggregation, so only one Spark job runs.
# Counting non-passing groups as 0 (instead of filtering them out first)
# makes 'passed' come back as 0 rather than null when nobody passes.
rating_totals = group_by_result.agg(
    F.sum(
        F.when(F.col('rating') != 'D', F.col('count')).otherwise(0)
    ).alias('passed'),
    F.sum(F.col('count')).alias('total'),
).first()

# 'total' is null when there are no rows at all; report 0.0 in that case
# instead of failing on None / None.
pass_rate = (
    rating_totals['passed'] / rating_totals['total']
    if rating_totals['total']
    else 0.0
)
pass_rate

0.8333333333333334

In [102]:
# Report the pass rate as a percentage rounded to two decimals.
pass_rate_percent = round(pass_rate * 100, 2)
print("pass rate = {}".format(pass_rate_percent))

pass rate = 83.33


### Use the column average to fill nulls

In [113]:
# Average of the french scores that are present, rounded to an integer.
french_avg = round(result3.agg(F.avg('french')).first()[0])
french_avg

87

In [114]:
# Average of the biology scores that are present, rounded to an integer.
biology_avg = round(result3.agg(F.avg('biology')).first()[0])
biology_avg

90

In [110]:
# Convert to a pandas DataFrame for display; the null scores appear as
# missing values and their columns are shown as floats
result3.toPandas()

Unnamed: 0,id,name,math,english,chinese,physics,chemistry,history,biology,french
0,0,Alex,95,90,79,86,67,73,89.0,75.0
1,5,Flynn,98,97,99,96,95,99,95.0,99.0
2,1,Bob,98,80,89,95,71,80,87.0,
3,3,Dan,54,68,57,96,68,57,88.0,
4,2,Cherry,73,85,86,88,85,91,,86.0
5,4,Ethan,68,65,86,68,95,78,,


In [116]:
# A dict maps each column to its own fill value: here the rounded average.
result4_fill_na = result3.na.fill(
    {
        'biology': biology_avg,
        'french': french_avg,
    }
)
result4_fill_na.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|
+---+------+----+-------+-------+-------+---------+-------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|    87|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|    87|
|  2|Cherry|  73|     85|     86|     88|       85|     91|     90|    86|
|  4| Ethan|  68|     65|     86|     68|       95|     78|     90|    87|
+---+------+----+-------+-------+-------+---------+-------+-------+------+



### Spark filters

In [117]:
# Display the rated table that the filters below operate on
result6.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|  81.75|     B|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|     A|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|   75.0|     C|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|     C|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|  74.25|     C|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|   57.5|     D|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+



In [118]:
# filter() with a Column expression: keep only the rows rated 'A'
result6.filter(F.col('rating') == 'A').show()

+---+-----+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id| name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  5|Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|     A|
+---+-----+----+-------+-------+-------+---------+-------+-------+------+-------+------+



In [119]:
# where() with a SQL expression string: keep only rows with math below 60
result6.where("math < 60").show()

+---+----+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id|name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+----+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  3| Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|     C|
+---+----+----+-------+-------+-------+---------+-------+-------+------+-------+------+



#### Save spark dataframe as csv file

In [45]:
# Writes a directory of CSV part files (one per partition), each with a
# header row; an existing directory at this path is overwritten.
(
    result6.write
    .format('csv')
    .option('header', True)
    .mode('overwrite')
    .save('./data/result6/')
)
# Shorthand for the same write:
# result6.write.csv('./data/result6/', header=True, mode='overwrite')

In [47]:
# coalesce(1) merges the data into one partition first, so the output
# directory contains a single CSV part file.
(
    result6.coalesce(1).write
    .format('csv')
    .option('header', True)
    .mode('overwrite')
    .save('./data/result6_coalesce/')
)
# To write 2 part files instead, coalesce to 2 partitions:
# result6.coalesce(2).write.save(
#     './data/result6_coalesce/', format='csv', header=True, mode='overwrite'
# )

In [123]:
# Print the CSV part file(s) written above (shell command; needs `cat`)
!cat ./data/result6_coalesce/*.csv

id,name,math,english,chinese,physics,chemistry,history,biology,french,average,rating
0,Alex,95,90,79,86,67,73,89,75,81.75,B
5,Flynn,98,97,99,96,95,99,95,99,97.25,A
1,Bob,98,80,89,95,71,80,87,0,75.0,C
3,Dan,54,68,57,96,68,57,88,0,61.0,C
2,Cherry,73,85,86,88,85,91,0,86,74.25,C
4,Ethan,68,65,86,68,95,78,0,0,57.5,D


In [51]:
# Read the CSV directory back; the first line is the header and column
# types are inferred from the data.
df = spark.read.csv(
    './data/result6_coalesce/',
    header=True,
    sep=',',
    inferSchema=True,
)

In [7]:
# Print each column's name, inferred type and nullability as a tree
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- english: integer (nullable = true)
 |-- chinese: integer (nullable = true)
 |-- physics: integer (nullable = true)
 |-- chemistry: integer (nullable = true)
 |-- history: integer (nullable = true)
 |-- biology: integer (nullable = true)
 |-- french: integer (nullable = true)
 |-- average: double (nullable = true)
 |-- rating: string (nullable = true)



In [8]:
# Display the data that was read back from CSV
df.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|  81.75|     B|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|     A|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|   75.0|     C|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|     C|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|  74.25|     C|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|   57.5|     D|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+



In [9]:
# The schema as a list of (column name, type name) tuples
df.dtypes

[('id', 'int'),
 ('name', 'string'),
 ('math', 'int'),
 ('english', 'int'),
 ('chinese', 'int'),
 ('physics', 'int'),
 ('chemistry', 'int'),
 ('history', 'int'),
 ('biology', 'int'),
 ('french', 'int'),
 ('average', 'double'),
 ('rating', 'string')]

In [10]:
# Register df under the name 'result' so spark.sql() queries can refer to it
df.createOrReplaceTempView('result')

In [11]:
# spark.sql() returns a DataFrame; here it selects every row and column
sql_df = spark.sql("""SELECT * FROM result""")
sql_df.show()

+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
| id|  name|math|english|chinese|physics|chemistry|history|biology|french|average|rating|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+
|  0|  Alex|  95|     90|     79|     86|       67|     73|     89|    75|  81.75|     B|
|  5| Flynn|  98|     97|     99|     96|       95|     99|     95|    99|  97.25|     A|
|  1|   Bob|  98|     80|     89|     95|       71|     80|     87|     0|   75.0|     C|
|  3|   Dan|  54|     68|     57|     96|       68|     57|     88|     0|   61.0|     C|
|  2|Cherry|  73|     85|     86|     88|       85|     91|      0|    86|  74.25|     C|
|  4| Ethan|  68|     65|     86|     68|       95|     78|      0|     0|   57.5|     D|
+---+------+----+-------+-------+-------+---------+-------+-------+------+-------+------+



In [12]:
# Project three columns from the 'result' view using SQL
spark.sql(
    """SELECT id, name, math
    FROM result
    """
).show()

+---+------+----+
| id|  name|math|
+---+------+----+
|  0|  Alex|  95|
|  5| Flynn|  98|
|  1|   Bob|  98|
|  3|   Dan|  54|
|  2|Cherry|  73|
|  4| Ethan|  68|
+---+------+----+



In [13]:
# Project four columns and keep only the row whose name is 'Alex'
spark.sql(
    """SELECT id, name, math, average
    FROM result
    WHERE name = 'Alex'
    """
).show()

+---+----+----+-------+
| id|name|math|average|
+---+----+----+-------+
|  0|Alex|  95|  81.75|
+---+----+----+-------+

