In [1]:
! pip install pandasql

from random import choice, random, seed

import pandas as pd
from pandasql import sqldf

# Synthetic student roster.
# Builds `student_data`, a DataFrame with one row per fake student and the
# columns Name, GPA, Major and Year, which the SQL cells query through pandasql.

# Fixed seed so that Restart Kernel -> Run All regenerates the same roster
# (and therefore the same query results) every time.
SEED = 42
seed(SEED)

major_choices = ['English', 'Science', 'Mathematics', 'Physics', 'Computer Science', 'History', 'Chemistry', 'Biology', 'Art', 'Sociology']
# 4 is listed twice, so choice() picks year 4 twice as often as any other year.
year_choice = [1, 2, 3, 4, 4]

# Per-major baseline GPA: random() is in [0.0, 1.0), so each baseline is in [2.5, 3.5).
major_gpa_avg = {major: random() + 2.5 for major in major_choices}

n = 100000

# Seven random lowercase letters per student; nothing enforces uniqueness.
names = [''.join([choice('abcdefghijklmnopqrstuvwxyz') for _ in range(7)]) for _ in range(n)]
majors = [choice(major_choices) for _ in range(n)]
# GPA = the major's baseline plus noise in [0.0, 0.25), capped at 4.0.
gpas = [min(major_gpa_avg[m] + random() / 4.0, 4.0) for m in majors]
years = [choice(year_choice) for _ in range(n)]

# Build from a column dict (no single-use zip iterator) and round GPA to two
# decimals as part of construction, so the frame is never mutated afterwards.
student_data = pd.DataFrame(
    {'Name': names, 'GPA': gpas, 'Major': majors, 'Year': years}
).round({'GPA': 2})



In [2]:
# Sort by Major (ascending is the SQL default) and keep the first 20 rows.
statement = (
    "SELECT * FROM student_data "
    "ORDER BY Major Limit 20;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

       Name   GPA Major  Year
0   tbelpfj  3.66   Art     3
1   masjcil  3.51   Art     4
2   onnobnj  3.63   Art     4
3   wokpjhs  3.71   Art     3
4   yatjyac  3.70   Art     3
5   kqsmrkj  3.63   Art     1
6   roxihor  3.60   Art     2
7   axbqudp  3.73   Art     3
8   hyocwwa  3.50   Art     4
9   ufpdmti  3.50   Art     4
10  gysbrtl  3.52   Art     1
11  gmnxmsf  3.52   Art     4
12  uziqldq  3.50   Art     1
13  esthgff  3.72   Art     2
14  kyxgsjg  3.61   Art     3
15  sweugma  3.56   Art     2
16  aqsxodp  3.57   Art     4
17  bnxrhzv  3.50   Art     1
18  ggqqlwt  3.55   Art     3
19  xrmpotd  3.72   Art     1


In [3]:
# Same preview as an ascending sort, but with Major in descending order:
# the first 20 rows now come from the alphabetically last major.
statement = (
    "SELECT * FROM student_data "
    "ORDER BY Major DESC LIMIT 20;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

       Name   GPA      Major  Year
0   hekhpec  2.82  Sociology     4
1   rijpyoy  2.68  Sociology     1
2   plganxy  2.83  Sociology     4
3   avtnuvn  2.70  Sociology     2
4   ljkjann  2.79  Sociology     3
5   gzjetom  2.76  Sociology     3
6   siadggl  2.76  Sociology     2
7   kutonwd  2.78  Sociology     3
8   kfwqbod  2.64  Sociology     4
9   aotlkuz  2.83  Sociology     4
10  wpnhalf  2.65  Sociology     2
11  xyrrqzf  2.78  Sociology     1
12  fgngefr  2.70  Sociology     4
13  ryxjwus  2.80  Sociology     3
14  jdadvdb  2.72  Sociology     4
15  nocyyie  2.71  Sociology     3
16  btnppbj  2.77  Sociology     4
17  nnnpqyn  2.70  Sociology     3
18  nmvanac  2.72  Sociology     2
19  efelehz  2.66  Sociology     4


In [4]:
# Two sort keys: Major ascending first, then Year descending within each major
# (DESC applies only to the column it follows). No LIMIT, so every row is returned.
statement = (
    "SELECT * FROM student_data "
    "ORDER BY Major, Year DESC;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

          Name   GPA      Major  Year
0      masjcil  3.51        Art     4
1      onnobnj  3.63        Art     4
2      hyocwwa  3.50        Art     4
3      ufpdmti  3.50        Art     4
4      gmnxmsf  3.52        Art     4
...        ...   ...        ...   ...
99995  zebzpgp  2.68  Sociology     1
99996  mnanray  2.66  Sociology     1
99997  fqofdsq  2.81  Sociology     1
99998  fshznbd  2.66  Sociology     1
99999  xflluwy  2.63  Sociology     1

[100000 rows x 4 columns]


In [5]:
# One row per major with the mean GPA of its students, exposed as AvgGPA.
statement = (
    "SELECT Major, Avg(GPA) as AvgGPA "
    "FROM student_data "
    "GROUP BY Major;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

              Major    AvgGPA
0               Art  3.611142
1           Biology  3.002875
2         Chemistry  3.001141
3  Computer Science  2.644006
4           English  3.374307
5           History  3.293843
6       Mathematics  3.477675
7           Physics  3.503188
8           Science  2.633813
9         Sociology  2.732133


In [6]:
# Lowest GPA per major, considering only third-year students:
# WHERE filters the rows before GROUP BY forms the per-major groups.
statement = (
    "SELECT Major, MIN(GPA) as MinGPA "
    "FROM student_data "
    "WHERE Year = 3 "
    "GROUP BY Major;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

              Major  MinGPA
0               Art    3.49
1           Biology    2.88
2         Chemistry    2.88
3  Computer Science    2.52
4           English    3.25
5           History    3.17
6       Mathematics    3.35
7           Physics    3.38
8           Science    2.51
9         Sociology    2.61


In [7]:
# Mean GPA for every (Major, Year) pair, restricted to years 3 and above.
statement = (
    "SELECT Major, Year, AVG(GPA) as AvgGPA "
    "FROM student_data "
    "WHERE Year >= 3 "
    "GROUP BY Major, Year;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

               Major  Year    AvgGPA
0                Art     3  3.610145
1                Art     4  3.613953
2            Biology     3  3.001687
3            Biology     4  3.002830
4          Chemistry     3  2.999527
5          Chemistry     4  3.003019
6   Computer Science     3  2.644552
7   Computer Science     4  2.644120
8            English     3  3.376197
9            English     4  3.374515
10           History     3  3.294803
11           History     4  3.292507
12       Mathematics     3  3.478296
13       Mathematics     4  3.475920
14           Physics     3  3.503335
15           Physics     4  3.502725
16           Science     3  2.632500
17           Science     4  2.633963
18         Sociology     3  2.730601
19         Sociology     4  2.732836


In [9]:
# Mean GPA grouped by the first letter of each student's name.
# SUBSTR(Name, 1, 1) takes one character starting at position 1 (SQL string
# positions are 1-based). The aggregate is aliased as AvgGPA, like the other
# AVG queries in this notebook, so the result column is not named after the
# raw expression text "AVG(GPA)".
statement = (
    "SELECT SUBSTR(Name, 1, 1) as FirstLetter, AVG(GPA) as AvgGPA "
    "FROM student_data "
    "GROUP BY FirstLetter;"
)
# pandasql looks up the `student_data` table in the namespace passed as env.
selected_data = sqldf(statement, env=globals())
print(selected_data)

   FirstLetter  AVG(GPA)
0            a  3.121215
1            b  3.124809
2            c  3.138337
3            d  3.136054
4            e  3.119924
5            f  3.121698
6            g  3.120863
7            h  3.130439
8            i  3.132341
9            j  3.121496
10           k  3.118708
11           l  3.125628
12           m  3.123522
13           n  3.126259
14           o  3.135863
15           p  3.120087
16           q  3.133075
17           r  3.124462
18           s  3.129701
19           t  3.120087
20           u  3.126866
21           v  3.122729
22           w  3.122514
23           x  3.126672
24           y  3.122603
25           z  3.133454
