
## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which allows you to store data for querying inside of Databricks. This notebook assumes that you already have a file inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# Where the uploaded CSV lives in DBFS, and the reader format used to parse it
file_location = "/FileStore/tables/data/Student_performance_data__.csv"
file_type = "csv"

# Treat the first line as the header row and let Spark infer each column's type.
student_df = (
    spark.read.format(file_type)
    .option("header", True)
    .option("inferSchema", True)
    .load(file_location)
)

# Print the leading rows without truncating long cell values.
student_df.show(truncate=False)

+---------+---+------+---------+-----------------+------------------+--------+--------+---------------+---------------+------+-----+------------+------------------+----------+
|StudentID|Age|Gender|Ethnicity|ParentalEducation|StudyTimeWeekly   |Absences|Tutoring|ParentalSupport|Extracurricular|Sports|Music|Volunteering|GPA               |GradeClass|
+---------+---+------+---------+-----------------+------------------+--------+--------+---------------+---------------+------+-----+------------+------------------+----------+
|1001     |17 |1     |0        |2                |19.833722807854713|7       |1       |2              |0              |0     |1    |0           |2.929195591667681 |2.0       |
|1002     |18 |0     |0        |1                |15.40875605584674 |0       |0       |1              |0              |0     |0    |0           |3.042914833436377 |1.0       |
|1003     |15 |0     |2        |3                |4.21056976881226  |26      |0       |2              |0              |0

In [0]:
# List the column names of the loaded DataFrame.
student_df.columns

Out[2]: ['StudentID',
 'Age',
 'Gender',
 'Ethnicity',
 'ParentalEducation',
 'StudyTimeWeekly',
 'Absences',
 'Tutoring',
 'ParentalSupport',
 'Extracurricular',
 'Sports',
 'Music',
 'Volunteering',
 'GPA',
 'GradeClass']

In [0]:
# Check column types: gives a list of (column name, type name) pairs.
student_df.dtypes

Out[3]: [('StudentID', 'int'),
 ('Age', 'int'),
 ('Gender', 'int'),
 ('Ethnicity', 'int'),
 ('ParentalEducation', 'int'),
 ('StudyTimeWeekly', 'double'),
 ('Absences', 'int'),
 ('Tutoring', 'int'),
 ('ParentalSupport', 'int'),
 ('Extracurricular', 'int'),
 ('Sports', 'int'),
 ('Music', 'int'),
 ('Volunteering', 'int'),
 ('GPA', 'double'),
 ('GradeClass', 'double')]

In [0]:
# Another way to get column types: print the schema as a tree, which also
# shows whether each column is nullable.
student_df.printSchema()

root
 |-- StudentID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: integer (nullable = true)
 |-- ParentalEducation: integer (nullable = true)
 |-- StudyTimeWeekly: double (nullable = true)
 |-- Absences: integer (nullable = true)
 |-- Tutoring: integer (nullable = true)
 |-- ParentalSupport: integer (nullable = true)
 |-- Extracurricular: integer (nullable = true)
 |-- Sports: integer (nullable = true)
 |-- Music: integer (nullable = true)
 |-- Volunteering: integer (nullable = true)
 |-- GPA: double (nullable = true)
 |-- GradeClass: double (nullable = true)



## EXPLORATORY ANALYSIS

In [0]:
# Quick statistical summary of every column: count, mean, standard deviation,
# minimum and maximum.
display(student_df.summary("count", "mean", "stddev", "min", "max"))

summary,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645484949832,0.5108695652173914,0.8775083612040134,1.74623745819398,9.771991919431748,14.54138795986622,0.3014214046822742,2.1220735785953178,0.3833612040133779,0.3035117056856187,0.1969063545150501,0.157190635451505,1.9061863027265409,2.983695652173913
stddev,690.6552444357942,1.1237983798555546,0.4999863617146026,1.028475775850354,1.0004110692382984,5.65277423586026,8.467417379917379,0.4589712494146483,1.1228128542119156,0.4863067551542234,0.4598703750919135,0.3977441129029819,0.3640565177064974,0.9151558203249596,1.233907560226087
min,1001.0,15.0,0.0,0.0,0.0,0.001056538645936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3392.0,18.0,1.0,3.0,4.0,19.97809399526153,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


In [0]:
# Total number of absences across all students (global aggregate, no grouping keys).
student_df.groupBy().sum("Absences").show()

+-------------+
|sum(Absences)|
+-------------+
|        34783|
+-------------+



In [0]:
# Total absences per Gender value (this sums the Absences column; it does not
# count absent students).
student_df.groupBy("Gender").agg({"Absences": "sum"}).show()

+------+-------------+
|Gender|sum(Absences)|
+------+-------------+
|     1|        17987|
|     0|        16796|
+------+-------------+



In [0]:
# Total absences per Age value, rendered with the notebook's rich table display.
display(student_df.groupBy("Age").agg({"Absences": "sum"}))

Age,sum(Absences)
16,8781
15,9167
17,8464
18,8371


In [0]:
# Mean GradeClass per Gender value.
# NOTE(review): GradeClass looks like a categorical code (0.0-4.0) — confirm
# what each code means before reading a higher average as "better".
student_df.groupBy("Gender").agg({"GradeClass": "avg"}).show()

+------+-----------------+
|Gender|  avg(GradeClass)|
+------+-----------------+
|     1|3.011456628477905|
|     0|2.954700854700855|
+------+-----------------+



In [0]:
# Distribution of grades: number of students in each GradeClass,
# largest group first.
(
    student_df
    .groupBy("GradeClass")
    .count()
    .sort("count", ascending=False)
    .show()
)

+----------+-----+
|GradeClass|count|
+----------+-----+
|       4.0| 1211|
|       3.0|  414|
|       2.0|  391|
|       1.0|  269|
|       0.0|  107|
+----------+-----+



In [0]:
# Group by Age and compute several aggregates in one pass, using the
# {column: aggregate function} dictionary form of agg().
(
    student_df
    .groupBy("Age")
    .agg({
        "StudyTimeWeekly": "avg",
        "Volunteering": "sum",
        "Extracurricular": "avg",
    })
    .show()
)

+---+--------------------+-----------------+--------------------+
|Age|avg(StudyTimeWeekly)|sum(Volunteering)|avg(Extracurricular)|
+---+--------------------+-----------------+--------------------+
| 16|   9.852193787506788|               80|   0.403035413153457|
| 15|   9.837933572428415|              102|  0.3904761904761905|
| 17|   9.573587542323894|               97| 0.37649063032367974|
| 18|   9.819002950713351|               97| 0.36254295532646047|
+---+--------------------+-----------------+--------------------+



In [0]:
# Group by Age and compute several aggregates, giving each result column a
# readable alias, then order by average study time (highest first).
#
# Spark's aggregate `sum` is imported under an alias on purpose: a bare `sum`
# is the Python builtin, which cannot aggregate a column name, and importing
# Spark's version unaliased would shadow the builtin for the whole notebook.
from pyspark.sql.functions import avg, sum as spark_sum

# NOTE(review): Volunteering appears to be a 0/1 flag, so its sum is a count of
# volunteering students rather than a duration — confirm the
# "Total Charity Time" label is what is intended.
(
    student_df
    .groupby("Age")
    .agg(
        avg("StudyTimeWeekly").alias("Average Study Time"),
        spark_sum("Volunteering").alias("Total Charity Time"),
        avg("Extracurricular").alias("Average Extracurricular Activities"),
    )
    .orderBy("Average Study Time", ascending=False)
    .show()
)

+---+------------------+------------------+----------------------------------+
|Age|Average Study Time|Total Charity Time|Average Extracurricular Activities|
+---+------------------+------------------+----------------------------------+
| 16| 9.852193787506788|                80|                 0.403035413153457|
| 15| 9.837933572428415|               102|                0.3904761904761905|
| 18| 9.819002950713351|                97|               0.36254295532646047|
| 17| 9.573587542323894|                97|               0.37649063032367974|
+---+------------------+------------------+----------------------------------+



In [0]:
# Frequency of each distinct GPA value, most common first.
# GPA is a double column, so nearly every value forms its own group of one.
(
    student_df
    .groupBy("GPA")
    .count()
    .sort("count", ascending=False)
    .show()
)

+------------------+-----+
|               GPA|count|
+------------------+-----+
|               0.0|   16|
|               4.0|    7|
|3.2839165218973836|    1|
|1.5504493308923313|    1|
| 1.011253194616751|    1|
|1.4444766951837589|    1|
| 2.917010983412451|    1|
|1.1509661314441684|    1|
| 0.741527556169066|    1|
|0.7695898642280781|    1|
|2.4629431182407853|    1|
|1.2656779091994341|    1|
|1.5258378621561266|    1|
| 2.396139095071558|    1|
|0.5177931283748711|    1|
|1.5743060882588613|    1|
|0.9430459197705603|    1|
| 2.512488758731327|    1|
|2.4571875145361584|    1|
|1.7333640465600053|    1|
+------------------+-----+
only showing top 20 rows



In [0]:
from pyspark.sql.functions import when, col

# Categorise students by GPA: add a "GPA_group" column with when/otherwise.
# Rows whose GPA is strictly greater than 3.4 get "passed"; all others get "Failed".
# NOTE(review): the two labels differ in capitalisation ("passed" vs "Failed")
# — confirm that is intended before filtering on this column.
passed_failed = student_df.withColumn(
    "GPA_group",
    when(col("GPA") > 3.4, "passed").otherwise("Failed"),
)

# Show only the identifying columns next to the new label.
passed_failed.select("StudentID", "Age", "GPA", "GPA_group").show()

+---------+---+------------------+---------+
|StudentID|Age|               GPA|GPA_group|
+---------+---+------------------+---------+
|     1001| 17| 2.929195591667681|   Failed|
|     1002| 18| 3.042914833436377|   Failed|
|     1003| 15|0.1126022544661815|   Failed|
|     1004| 17|2.0542181397029484|   Failed|
|     1005| 17|1.2880611817953875|   Failed|
|     1006| 18|3.0841836144863937|   Failed|
|     1007| 15| 2.748237414891583|   Failed|
|     1008| 15| 1.360142712316461|   Failed|
|     1009| 17| 2.896819189513569|   Failed|
|     1010| 16|3.5734742103297656|   passed|
|     1011| 17|2.1471716250185144|   Failed|
|     1012| 17|1.5595945190402773|   Failed|
|     1013| 17| 1.520077814874808|   Failed|
|     1014| 17|1.7515809583340785|   Failed|
|     1015| 18| 2.396788117124796|   Failed|
|     1016| 15|1.3415207165346672|   Failed|
|     1017| 18|2.2321752777159762|   Failed|
|     1018| 18|1.3844041756940335|   Failed|
|     1019| 18|0.4695533233798704|   Failed|
|     1020

In [0]:
# Count the null values in each column.
#
# The cell imports everything it needs so it runs on a fresh kernel. Spark's
# aggregate `sum` is imported under an alias because a bare `sum` is the Python
# builtin, which cannot aggregate a Column expression.
from pyspark.sql.functions import col, when, sum as spark_sum

# For each column: map null -> 1 and non-null -> 0, then add the flags up.
# The result is a single-row DataFrame with the original column names.
null_counts = student_df.select([
    spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in student_df.columns
])

null_counts.show()


+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+
|StudentID|Age|Gender|Ethnicity|ParentalEducation|StudyTimeWeekly|Absences|Tutoring|ParentalSupport|Extracurricular|Sports|Music|Volunteering|GPA|GradeClass|
+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+
|        0|  0|     0|        0|                0|              0|       0|       0|              0|              0|     0|    0|           0|  0|         0|
+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+



In [0]:
# Show null_counts without three of its columns. The result of drop() is only
# displayed, not assigned, so null_counts itself is left unchanged.
null_counts.drop(
    "ParentalEducation",
    "Volunteering",
    "StudyTimeWeekly",
).show()

+---------+---+------+---------+--------+--------+---------------+---------------+------+-----+---+----------+
|StudentID|Age|Gender|Ethnicity|Absences|Tutoring|ParentalSupport|Extracurricular|Sports|Music|GPA|GradeClass|
+---------+---+------+---------+--------+--------+---------------+---------------+------+-----+---+----------+
|        0|  0|     0|        0|       0|       0|              0|              0|     0|    0|  0|         0|
+---------+---+------+---------+--------+--------+---------------+---------------+------+-----+---+----------+



In [0]:
# Confirm that the aggregated null_counts result is itself a Spark DataFrame.
type(null_counts)

Out[63]: pyspark.sql.dataframe.DataFrame