In [57]:
!pip install pyspark



In [70]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Students').getOrCreate()

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window



In [72]:
df = spark.sparkContext.textFile('drive/My Drive/ABD/students.csv')

In [73]:
df = spark.read.csv('drive/My Drive/ABD/students.csv', header = True , inferSchema = True)
df.show()
df.printSchema()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [63]:
# 1. Number of male and female students
df.select('gender').groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|female|  518|
|  male|  482|
+------+-----+



In [64]:
#2.List different 'race/ethnicity
df.select('race/ethnicity').groupBy('race/ethnicity').count().show()

+--------------+-----+
|race/ethnicity|count|
+--------------+-----+
|       group B|  190|
|       group C|  319|
|       group D|  262|
|       group A|   89|
|       group E|  140|
+--------------+-----+



In [65]:
#3. What are different 'parental level of education'?
df.select('parental level of education').distinct().show()

+---------------------------+
|parental level of education|
+---------------------------+
|           some high school|
|         associate's degree|
|                high school|
|          bachelor's degree|
|            master's degree|
|               some college|
+---------------------------+



In [66]:
#4 How many female students scored more than 79 marks in math, whose parental level of education is 'high school'?
df.filter((col('gender')=='female') & (col('math score')>79) & (col('parental level of education')=='high school')).show()

+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|   lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|female|       group B|                high school|standard|                   none|        87|           95|           86|
|female|       group E|                high school|standard|                   none|        99|           93|           90|
|female|       group D|                high school|standard|              completed|        88|           99|          100|
|female|       group B|                high school|standard|                   none|        81|           91|           89|
|female|       group C|                high school|standard|                   none|        81|           84|           82|
+------+

In [95]:
#5. Check whether average maths score of male or female students are high?

female = df.filter(col('gender')=='female')\
.select(avg('math score')).show()

male = df.filter(col('gender')=='male')\
.select(avg('math score')).show()

+------------------+
|   avg(math score)|
+------------------+
|63.633204633204635|
+------------------+

+-----------------+
|  avg(math score)|
+-----------------+
|68.72821576763485|
+-----------------+



In [85]:
# 6  What is average reading score of male and female students?

df.groupBy('gender').agg(avg('reading score')).show()

+------+------------------+
|gender|avg(reading score)|
+------+------------------+
|female| 72.60810810810811|
|  male| 65.47302904564316|
+------+------------------+



In [90]:
#7 Whether students score depends upon 'parental level of education'? Justifyyour answer.
df.groupBy('parental level of education').agg(avg('math score')
,(avg('reading score'))
,(avg('writing score'))).show()





+---------------------------+------------------+------------------+------------------+
|parental level of education|   avg(math score)|avg(reading score)|avg(writing score)|
+---------------------------+------------------+------------------+------------------+
|           some high school|63.497206703910614| 66.93854748603351| 64.88826815642459|
|         associate's degree| 67.88288288288288| 70.92792792792793|  69.8963963963964|
|                high school| 62.13775510204081| 64.70408163265306| 62.44897959183673|
|          bachelor's degree| 69.38983050847457|              73.0| 73.38135593220339|
|            master's degree|  69.7457627118644| 75.37288135593221| 75.67796610169492|
|               some college|  67.1283185840708| 69.46017699115045| 68.84070796460178|
+---------------------------+------------------+------------------+------------------+



In [91]:
#List the records where 'test preparation course' is 'none' and scored more than 70 in maths.
df.filter((col('test preparation course')=='none') & (col('math score')>70)).show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group C|               some college|    standard|                   none|        76|           78|           75|
|female|       group B|         associate's degree|    standard|                   none|        71|           83|           78|
|  male|       group C|                high school|    standard|                   none|        88|     