In [1]:
# findspark.init() runs before `import pyspark`; presumably it makes the local
# Spark installation importable, so keep this ordering — TODO confirm.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's SparkSession (or reuse one that already exists),
# registered under the application name 'Tutorial'.
spark = (
    SparkSession.builder
    .appName('Tutorial')
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [5]:
# Load the test-scores CSV. The first row supplies the column names and Spark
# infers each column's type from the data instead of reading everything as text.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../../Data/Test_Scores.csv')
)
df.show()

+---+---------+-------+-------+
|ACT|FinalExam|QuizAvg|TestAvg|
+---+---------+-------+-------+
| 33|      181|     95|     89|
| 31|      169|     81|     89|
| 21|      176|     65|     68|
| 25|      181|     66|     90|
| 29|      169|     89|     81|
| 24|      103|     61|     57|
| 25|      150|     81|     76|
| 29|      147|     86|     76|
| 36|      181|     98|    102|
| 26|      163|     72|     70|
| 31|      163|     95|     81|
| 29|      147|     65|     67|
| 23|      160|     62|     68|
| 26|      100|     63|     56|
+---+---------+-------+-------+



In [6]:
# Check the concrete class returned by the reader (a Spark DataFrame).
type(df)

pyspark.sql.dataframe.DataFrame

In [8]:
# Preview only the first 5 rows rather than the default listing.
df.show(n=5)

+---+---------+-------+-------+
|ACT|FinalExam|QuizAvg|TestAvg|
+---+---------+-------+-------+
| 33|      181|     95|     89|
| 31|      169|     81|     89|
| 21|      176|     65|     68|
| 25|      181|     66|     90|
| 29|      169|     89|     81|
+---+---------+-------+-------+
only showing top 5 rows



In [9]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- ACT: integer (nullable = true)
 |-- FinalExam: integer (nullable = true)
 |-- QuizAvg: integer (nullable = true)
 |-- TestAvg: integer (nullable = true)



In [12]:
# Project the single ACT column and preview its first 5 rows.
df.select('ACT').show(5)

+---+
|ACT|
+---+
| 33|
| 31|
| 21|
| 25|
| 29|
+---+
only showing top 5 rows



In [18]:
# Column names paired with their type names, as a list of (name, type) tuples.
df.dtypes

[('ACT', 'int'), ('FinalExam', 'int'), ('QuizAvg', 'int'), ('TestAvg', 'int')]

In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+
|summary|               ACT|         FinalExam|           QuizAvg|           TestAvg|
+-------+------------------+------------------+------------------+------------------+
|  count|                14|                14|                14|                14|
|   mean|27.714285714285715|156.42857142857142| 77.07142857142857| 76.42857142857143|
| stddev| 4.158930547232461| 26.11723358454892|13.747527250182324|13.195070674683409|
|    min|                21|               100|                61|                56|
|    max|                36|               181|                98|               102|
+-------+------------------+------------------+------------------+------------------+



In [30]:
# Weighted score: 60% FinalExam + 30% TestAvg + 10% QuizAvg.
# The terms are summed in the same left-to-right order as before; the result is
# a double, so some values show floating-point tails (e.g. 144.79999999999998).
kpi_expr = (
    df['FinalExam'] * 0.6
    + df['TestAvg'] * 0.3
    + df['QuizAvg'] * 0.1
)
df1 = df.withColumn('KPI', kpi_expr)
df1.show()

+---+---------+-------+-------+------------------+
|ACT|FinalExam|QuizAvg|TestAvg|               KPI|
+---+---------+-------+-------+------------------+
| 33|      181|     95|     89|144.79999999999998|
| 31|      169|     81|     89|             136.2|
| 21|      176|     65|     68|             132.5|
| 25|      181|     66|     90|             142.2|
| 29|      169|     89|     81|             134.6|
| 24|      103|     61|     57| 84.99999999999999|
| 25|      150|     81|     76|120.89999999999999|
| 29|      147|     86|     76|             119.6|
| 36|      181|     98|    102|             149.0|
| 26|      163|     72|     70|             126.0|
| 31|      163|     95|     81|             131.6|
| 29|      147|     65|     67|             114.8|
| 23|      160|     62|     68|122.60000000000001|
| 26|      100|     63|     56|              83.1|
+---+---------+-------+-------+------------------+



In [31]:
# drop() returns a new DataFrame without the KPI column. The result is not
# assigned here, so df1 itself still contains KPI afterwards.
df1.drop('KPI')

DataFrame[ACT: int, FinalExam: int, QuizAvg: int, TestAvg: int]

In [32]:
# Show df1 with the KPI column presented under the name 'Grade'.
# Only the displayed result is renamed; df1 is not reassigned.
df1.withColumnRenamed(existing='KPI', new='Grade').show()

+---+---------+-------+-------+------------------+
|ACT|FinalExam|QuizAvg|TestAvg|             Grade|
+---+---------+-------+-------+------------------+
| 33|      181|     95|     89|144.79999999999998|
| 31|      169|     81|     89|             136.2|
| 21|      176|     65|     68|             132.5|
| 25|      181|     66|     90|             142.2|
| 29|      169|     89|     81|             134.6|
| 24|      103|     61|     57| 84.99999999999999|
| 25|      150|     81|     76|120.89999999999999|
| 29|      147|     86|     76|             119.6|
| 36|      181|     98|    102|             149.0|
| 26|      163|     72|     70|             126.0|
| 31|      163|     95|     81|             131.6|
| 29|      147|     65|     67|             114.8|
| 23|      160|     62|     68|122.60000000000001|
| 26|      100|     63|     56|              83.1|
+---+---------+-------+-------+------------------+



In [33]:
# Keep only the rows that have at least 2 non-null values.
# `how` is deliberately omitted: when `thresh` is given it overrides `how`,
# so passing how='any' alongside it had no effect and was misleading.
df.na.drop(thresh=2).show()

+---+---------+-------+-------+
|ACT|FinalExam|QuizAvg|TestAvg|
+---+---------+-------+-------+
| 33|      181|     95|     89|
| 31|      169|     81|     89|
| 21|      176|     65|     68|
| 25|      181|     66|     90|
| 29|      169|     89|     81|
| 24|      103|     61|     57|
| 25|      150|     81|     76|
| 29|      147|     86|     76|
| 36|      181|     98|    102|
| 26|      163|     72|     70|
| 31|      163|     95|     81|
| 29|      147|     65|     67|
| 23|      160|     62|     68|
| 26|      100|     63|     56|
+---+---------+-------+-------+



In [34]:
# Remove every row whose ACT value is null; only the ACT column is checked.

df.dropna(how='any', subset=['ACT']).show()

+---+---------+-------+-------+
|ACT|FinalExam|QuizAvg|TestAvg|
+---+---------+-------+-------+
| 33|      181|     95|     89|
| 31|      169|     81|     89|
| 21|      176|     65|     68|
| 25|      181|     66|     90|
| 29|      169|     89|     81|
| 24|      103|     61|     57|
| 25|      150|     81|     76|
| 29|      147|     86|     76|
| 36|      181|     98|    102|
| 26|      163|     72|     70|
| 31|      163|     95|     81|
| 29|      147|     65|     67|
| 23|      160|     62|     68|
| 26|      100|     63|     56|
+---+---------+-------+-------+



In [35]:
# fillna() requires a replacement value; calling it with no argument raises
# TypeError. 0 is used here as a placeholder fill for the integer columns —
# choose whatever value suits the analysis. The call returns a new DataFrame
# and leaves df unchanged.
df.fillna(0)

DataFrame[ACT: int, FinalExam: int, QuizAvg: int, TestAvg: int]

In [44]:
 # Keep the rows whose ACT is between 30 and 35 inclusive (SQL-expression filter).
 # NOTE(review): this cell is indented by one space — consider removing it.
 df.filter('ACT >= 30 and ACT <= 35').show()

+---+---------+-------+-------+
|ACT|FinalExam|QuizAvg|TestAvg|
+---+---------+-------+-------+
| 33|      181|     95|     89|
| 31|      169|     81|     89|
| 31|      163|     95|     81|
+---+---------+-------+-------+



In [64]:
# Average every numeric column within each ACT group. ACT is numeric too, so
# the result also carries a redundant avg(ACT) column next to the group key.
df.groupBy('ACT').mean().show()

+---+--------+------------------+------------+-----------------+
|ACT|avg(ACT)|    avg(FinalExam)|avg(QuizAvg)|     avg(TestAvg)|
+---+--------+------------------+------------+-----------------+
| 31|    31.0|             166.0|        88.0|             85.0|
| 26|    26.0|             131.5|        67.5|             63.0|
| 23|    23.0|             160.0|        62.0|             68.0|
| 25|    25.0|             165.5|        73.5|             83.0|
| 24|    24.0|             103.0|        61.0|             57.0|
| 29|    29.0|154.33333333333334|        80.0|74.66666666666667|
| 21|    21.0|             176.0|        65.0|             68.0|
| 33|    33.0|             181.0|        95.0|             89.0|
| 36|    36.0|             181.0|        98.0|            102.0|
+---+--------+------------------+------------+-----------------+



In [56]:
# Total of the FinalExam column, requested as a {column: aggregate-function} mapping.
df.agg({'FinalExam': 'sum'}).show()

+--------------+
|sum(FinalExam)|
+--------------+
|          2190|
+--------------+

