# Pandas Tutorial

## Imports

In [1]:
import numpy as np
import pandas as pd

## Basics

In [3]:
df = pd.read_csv('./dataset/dataset.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,attributes,credits,crn,levels,meeting_days,meeting_instructors,meeting_place,meeting_sched_type,meeting_time,meeting_type,number,section,subject,title
0,0,Lower Division,0.0,56742,Graduate|Professional|Undergraduate,T,Karen Marais,Wilmeth Active Learning Center 1018,Lecture,3:30 pm - 4:20 pm,Class,AAE 20000,1,AAE,Undergraduate Sophomore Seminar
1,1,Lower Division,3.0,67031,Graduate|Professional|Undergraduate,TR,Shaoshuai Mou,Physics Building 114,Lecture,4:30 pm - 5:45 pm,Class,AAE 20300,2,AAE,Aeromechanics I
2,2,Lower Division,3.0,13363,Graduate|Professional|Undergraduate,MWF,Smriti Nandan Paul,Wilmeth Active Learning Center 1055,Lecture,7:30 am - 8:20 am,Class,AAE 20300,3,AAE,Aeromechanics I
3,3,Lower Division,3.0,10002,Graduate|Professional|Undergraduate,MWF,Ritwik Bandyopadhyay,Wetherill Lab of Chemistry 172,Lecture,8:30 am - 9:20 am,Class,AAE 20400,1,AAE,Aeromechanics II
4,4,Lower Division,1.0,10006,Graduate|Professional|Undergraduate,W,Ricardo Jose Gomez|Waterloo Tsutsui,Neil Armstrong Hall of Engr 3106,Laboratory,11:30 am - 1:20 pm,Class,AAE 20401,1,AAE,Aeromechanics II Laboratory


In [4]:
df.columns

Index(['Unnamed: 0', 'attributes', 'credits', 'crn', 'levels', 'meeting_days',
       'meeting_instructors', 'meeting_place', 'meeting_sched_type',
       'meeting_time', 'meeting_type', 'number', 'section', 'subject',
       'title'],
      dtype='object')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16807 entries, 0 to 16806
Data columns (total 15 columns):
Unnamed: 0             16807 non-null int64
attributes             12435 non-null object
credits                16807 non-null float64
crn                    16807 non-null int64
levels                 16807 non-null object
meeting_days           8704 non-null object
meeting_instructors    16353 non-null object
meeting_place          8704 non-null object
meeting_sched_type     16807 non-null object
meeting_time           8704 non-null object
meeting_type           16807 non-null object
number                 16807 non-null object
section                16807 non-null object
subject                16807 non-null object
title                  16807 non-null object
dtypes: float64(1), int64(2), object(12)
memory usage: 1.9+ MB


In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,credits,crn
count,16807.0,16807.0,16807.0
mean,8403.0,1.550782,27950.230559
std,4851.907323,1.550806,17098.064428
min,0.0,0.0,10002.0
25%,4201.5,1.0,15643.5
50%,8403.0,1.0,21214.0
75%,12604.5,3.0,38041.5
max,16806.0,16.0,72311.0


In [12]:
df.credits.min()

0.0

### What is the subject with most sections?

In [10]:
df.subject.value_counts()

BIOL    916
ECE     781
CHM     688
ME      570
MA      567
EPCS    530
ENGL    482
CS      464
PHYS    383
NUR     374
MGMT    326
AT      306
PSY     302
CE      293
SLHS    278
COM     275
AAE     269
CHE     263
EDCI    251
BME     241
STAT    239
TECH    236
ENGR    220
CNIT    210
AD      208
AGRY    201
MSE     199
CGT     196
EAPS    187
EDPS    186
       ... 
ASTR     15
MFET     15
ILS      13
CEM      13
MSL      13
ARAB     12
CLPH     11
NUPH     11
FVS      11
LATN     11
AAS      10
CLCS      9
SFS       8
NS        8
IDIS      8
CAND      6
SYS       6
LALS      6
PTGS      6
SCI       6
GREK      4
HEBR      4
HSOP      4
PTEC      4
ASAM      3
REL       3
JWST      3
GSLA      3
CDIS      1
MARS      1
Name: subject, Length: 137, dtype: int64

### What is hardest class (by credit)?

In [13]:
df.sort_values(by='credits', ascending=False).head()

Unnamed: 0.1,Unnamed: 0,attributes,credits,crn,levels,meeting_days,meeting_instructors,meeting_place,meeting_sched_type,meeting_time,meeting_type,number,section,subject,title
7694,7694,Credit By Exam|Full-Time Privileges|Student Te...,16.0,16555,Graduate|Professional|Undergraduate,,Jennifer Elaine Smith|Kharon D Grimmet,,Experiential,,Class,EDPS 49800,004,EDPS,Spvsd Tch Mld & Intense Dis
7692,7692,Credit By Exam|Full-Time Privileges|Student Te...,16.0,16547,Graduate|Professional|Undergraduate,,Kharon D Grimmet|Jennifer Elaine Smith,,Experiential,,Class,EDPS 49800,002,EDPS,Spvsd Tch Mild Dis
10007,10007,Upper Division|Variable Title,16.0,13624,Graduate|Professional|Undergraduate,,Megan L Purcell,,Experiential,,Class,HDFS 45000,002,HDFS,Supervised Teaching In Inclusive Programs For ...
7485,7485,Credit By Exam|Student Teaching|Upper Division...,16.0,16540,Graduate|Professional|Undergraduate,,Paul A Asunda,,Experiential,,Class,EDCI 49800,G01,EDCI,Supvsd Tch Tech Educ
7477,7477,Upper Division,16.0,68577,Graduate|Professional|Undergraduate,,Jennifer W Barce,,Experiential,,Class,EDCI 49600,003,EDCI,Student Teaching In The Elementary School


### How many classes have sections on saturday?

In [20]:
df[df.meeting_days == 'S'].count()

Unnamed: 0             28
attributes             22
credits                28
crn                    28
levels                 28
meeting_days           28
meeting_instructors    28
meeting_place          28
meeting_sched_type     28
meeting_time           28
meeting_type           28
number                 28
section                28
subject                28
title                  28
dtype: int64

### Which class has the most sections?

In [23]:
df.groupby('number').agg('count').sort_values('title', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,attributes,credits,crn,levels,meeting_days,meeting_instructors,meeting_place,meeting_sched_type,meeting_time,meeting_type,section,subject,title
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CHM 11500,226,226,226,226,226,226,226,226,226,226,226,226,226,226
BIOL 49400,189,189,189,189,189,0,185,0,189,0,189,189,189,189
ENGL 10600,165,165,165,165,165,147,165,147,165,147,165,165,165,165
BIOL 49900,140,140,140,140,140,0,140,0,140,0,140,140,140,140
ME 69900,124,0,124,124,124,0,124,0,124,0,124,124,124,124
ME 69800,123,0,123,123,123,0,123,0,123,0,123,123,123,123
ECE 69200,119,0,119,119,119,0,117,0,119,0,119,119,119,119
ECE 69900,118,0,118,118,118,0,118,0,118,0,118,118,118,118
ECE 69800,116,0,116,116,116,0,116,0,116,0,116,116,116,116
BIOL 29400,114,114,114,114,114,0,112,0,114,0,114,114,114,114


### More questions

* What subjects have classes on saturdays?
* What is the most popular class time?
* I need one credit hour class in CS. What can I pick?
* 

## Additional Resources

* [Pandas Tutorial](https://github.com/Yorko/mlcourse.ai/blob/master/jupyter_english/topic01_pandas_data_analysis/topic1_pandas_data_analysis.ipynb)