In [1]:
from lxml import etree
import re
import traceback

In [2]:
FULL_PAGE = ""
with open('./Class Schedule Listing.html') as f:
    FULL_PAGE = f.read()

In [3]:
parser = etree.HTMLParser()
tree = etree.fromstring(FULL_PAGE, parser)

In [4]:
tbody = tree.xpath('/html/body/div[3]/table[1]/tbody')[0]

In [5]:
class_head = tbody.xpath('tr[1]')[0]
class_body = tbody.xpath('tr[2]')[0]

In [46]:
def parse_class_head(class_head):
    text = class_head.xpath('th/a[1]/text()')[0]
    url = class_head.xpath('th/a[1]/@href')[0]
    
    *title, crn, number, section = text.split('-')
    title = '-'.join(title).strip()
    crn = int(crn)
    number = number.strip()
    section = section.strip()
    
    return {
        'title': title,
        'subject': number.split(' ')[0].strip(),
        'crn': crn,
        'number': number,
        'section': section
    }

In [47]:
def parse_class_body(class_body):
    levels = class_body.xpath('td/span[text()="Levels: "]/following-sibling::text()[1]')[0]
    levels = levels.split(',')
    levels = list(map(lambda x: x.strip(), levels))
    
    try:
        attributes = class_body.xpath('td/span[text()="Attributes: "]/following-sibling::text()[1]')[0]
        attributes = attributes.split(',')
        attributes = list(map(lambda x: x.strip(), attributes))
    except IndexError:
        attributes = []
    
    credits = class_body.xpath("td/text()[contains(., 'Credits')]")[0].strip().split()[0]
    credits
#     assert len(class_body.xpath("td/table/tbody/tr")) == 2
    meeting = class_body.xpath("td/table/tbody/tr[2]")[0]
    meeting_type = meeting.xpath("td[1]/text()")[0]
    
    try:
        meeting_time = meeting.xpath("td[2]/text()")[0]
        meeting_days = meeting.xpath("td[3]/text()")[0]
        meeting_place = meeting.xpath("td[4]/text()")[0]
    except IndexError:
        meeting_time = ""
        meeting_days = ""
        meeting_place = ""

    meeting_sched_type = meeting.xpath("td[6]/text()")[0]
    
    profs = (','.join(meeting.xpath("td[7]/text()"))).replace('(,)', ',').split(',')
    profs = filter(lambda x: len(x) > 0, profs)
    profs = list(map(lambda x: re.sub(r'\s+', ' ', x).strip(), profs))
    
    meeting_instructors = profs
    
    return {
        'levels': '|'.join(levels),
        'attributes': '|'.join(attributes),
        'credits': credits,
        'meeting_type': meeting_type,
        'meeting_time': meeting_time,
        'meeting_days': meeting_days,
        'meeting_place': meeting_place,
        'meeting_sched_type': meeting_sched_type,
        'meeting_instructors': '|'.join(meeting_instructors)
    }

In [48]:
def gen_dataset(tree):
    tbody = tree.xpath('/html/body/div[3]/table[1]/tbody')[0]
    
    data_all = []
    
    size = len(tbody.xpath('tr')) // 2
    for i in range(1, size + 1):
        try:
            class_head = tbody.xpath(f'tr[{i * 2 - 1}]')[0]
            class_body = tbody.xpath(f'tr[{i * 2}]')[0]

            data = parse_class_head(class_head)
            data.update(parse_class_body(class_body))
            data_all.append(data)
        except Exception as e:
            traceback.print_exc()
            print(i * 2 - 1)
            return
        
    return data_all

In [49]:
data = gen_dataset(tree)

In [126]:
a = tbody.xpath('tr[164]')[0]
# parse_class_body(a)
# parse_class_head(a)
print(etree.tostring(a, pretty_print=True).decode())

<tr>
<td class="dddefault">
<span class="fieldlabeltext">Associated Term: </span>Fall 2019 
<br/>
<span class="fieldlabeltext">Registration Dates: </span>Mar 18, 2019 to Aug 25, 2019 
<br/>
<span class="fieldlabeltext">Levels: </span>Graduate, Professional, Undergraduate 
<br/>
<span class="fieldlabeltext">Attributes: </span>Upper Division 
<br/>
<br/>
West Lafayette Campus
<br/>
Laboratory Schedule Type
<br/>
       0.000 Credits
<br/>
<a href="https://selfservice.mypurdue.purdue.edu/prod/bwckctlg.p_display_courses?term_in=202010&amp;one_subj=AAE&amp;sel_crse_strt=41800&amp;sel_crse_end=41800&amp;sel_subj=&amp;sel_levl=&amp;sel_schd=&amp;sel_coll=&amp;sel_divs=&amp;sel_dept=&amp;sel_attr=">View Catalog Entry</a>
<br/>
<a href="https://selfservice.mypurdue.purdue.edu/prod/bwckbook.site?p_term_in=202010&amp;p_subj_in=AAE&amp;p_crse_numb_in=41800&amp;p_seq_in=002"><b>Course Materials</b><br/>

</a>
<br/>
<br/>
<table class="datadisplaytable" summary="This table lists the scheduled meetin

In [10]:
import pandas as pd

In [50]:
df = pd.DataFrame(data)

In [18]:
df.meeting_time.value_counts()

                       8103
1:30 pm - 3:20 pm       512
9:30 am - 10:20 am      462
1:30 pm - 2:20 pm       450
3:30 pm - 5:20 pm       441
10:30 am - 11:20 am     440
12:30 pm - 1:20 pm      428
11:30 am - 12:20 pm     414
3:30 pm - 4:20 pm       399
9:30 am - 11:20 am      381
2:30 pm - 3:20 pm       368
8:30 am - 9:20 am       348
11:30 am - 1:20 pm      340
4:30 pm - 5:20 pm       256
10:30 am - 11:45 am     232
1:30 pm - 2:45 pm       225
12:00 pm - 1:15 pm      225
11:30 am - 2:20 pm      202
9:00 am - 10:15 am      188
10:30 am - 12:20 pm     158
7:30 am - 9:20 am       158
3:00 pm - 4:15 pm       157
2:30 pm - 5:20 pm       155
7:30 am - 8:20 am       135
8:30 am - 11:20 am      134
4:30 pm - 5:45 pm       116
7:30 am - 10:20 am      106
8:30 am - 10:20 am      103
2:50 pm - 5:40 pm        84
1:30 pm - 4:20 pm        65
                       ... 
6:00 pm - 7:30 pm         1
9:30 am - 11:00 am        1
5:00 pm - 5:20 pm         1
2:30 pm - 4:45 pm         1
12:00 pm - 9:50 pm  

In [51]:
df.to_csv('./dataset.csv')

In [114]:
agg = df.groupby('meeting_place').agg('count')

In [117]:
agg.sort_values('title', ascending=False)

Unnamed: 0_level_0,attributes,credits,crn,levels,meeting_days,meeting_instructors,meeting_sched_type,meeting_time,meeting_type,number,section,title
meeting_place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,8103,8103,8103,8103,8103,8103,8103,8103,8103,8103,8103,8103
Neil Armstrong Hall of Engr 1098C,180,180,180,180,180,180,180,180,180,180,180,180
Neil Armstrong Hall of Engr 1101,168,168,168,168,168,168,168,168,168,168,168,168
Neil Armstrong Hall of Engr 1098B,144,144,144,144,144,144,144,144,144,144,144,144
Off Campus TRVLTIME,142,142,142,142,142,142,142,142,142,142,142,142
Electrical Engineering Bldg 013,101,101,101,101,101,101,101,101,101,101,101,101
Off Campus AIRCRAFT,70,70,70,70,70,70,70,70,70,70,70,70
Beering Hall of Lib Arts & Ed B291,50,50,50,50,50,50,50,50,50,50,50,50
Brown Laboratory of Chemistry 3104,46,46,46,46,46,46,46,46,46,46,46,46
Wetherill Lab of Chemistry 362,44,44,44,44,44,44,44,44,44,44,44,44


In [19]:
df1 = pd.read_csv('./dataset.csv')

In [36]:
def lol(s):
    try:
        return json.loads(s)
    except Exception:
        print(s)

In [37]:
df1.meeting_instructors1 = df1.meeting_instructors.apply(lol)

['Karen Marais']
['Shaoshuai Mou']
['Smriti Nandan Paul']
['Ritwik Bandyopadhyay']
['Ricardo Jose Gomez', 'Waterloo Tsutsui']
['Eli Vincent Sitchin', 'Waterloo Tsutsui']
['Ricardo Jose Gomez', 'Waterloo Tsutsui']
['Federico Rios Tascon', 'Waterloo Tsutsui']
['Ishan T Karnik', 'Waterloo Tsutsui']
['Waterloo Tsutsui']
['Ricardo Jose Gomez', 'Waterloo Tsutsui']
['Ishan T Karnik', 'Waterloo Tsutsui']
['Timothee Louis Pourpoint']
['Ali Khalid Raz']
['Tyson Joelle McFall']
['Tyson Joelle McFall']
['Karen Marais']
['Arthur E Frazho']
['Dengfeng Sun']
['Sergey O Macheret']
['Joseph S Jewell']
['Wenjie Cai', 'Sally Pm Bane']
['Bang-shiuh Chen', 'Sally Pm Bane']
['Wenjie Cai', 'Sally Pm Bane']
['Wenjie Cai', 'Sally Pm Bane']
['Nathan Daniel Ballintyn', 'Sally Pm Bane']
['Prashanth Bangalore venkatesh', 'Sally Pm Bane']
['Nathan Daniel Ballintyn', 'Sally Pm Bane']
['Sally Pm Bane']
['Wenjie Cai', 'Sally Pm Bane']
['Wenjie Cai', 'Sally Pm Bane']
['Nathan Daniel Ballintyn', 'Sally Pm Bane']
['Bang-

  """Entry point for launching an IPython kernel.


In [22]:
df.meeting_instructors

['Karen Marais']

In [28]:
df.meeting_instructors.apply(len).idxmax()

14301

In [32]:
df.iloc[14301].meeting_instructors

['Jasmine D Gonzalvo',
 'Off Campus Instructor',
 'Alex N Isaacs',
 'Emily N Israel',
 'Nicole C Jasperson',
 'Nira N Kadakia',
 'Ashley H Meredith',
 'Monica L Miller',
 'Mary Elizabeth Nolan',
 'Carol A Ott',
 'Kimberly S Plake',
 'Steven Alan Scott']

In [35]:
df.credits.apply(float).max()

16.0

In [38]:
df

Unnamed: 0,attributes,credits,crn,levels,meeting_days,meeting_instructors,meeting_place,meeting_sched_type,meeting_time,meeting_type,number,section,title
0,[Lower Division],0.000,56742,"[Graduate, Professional, Undergraduate]",T,[Karen Marais],Wilmeth Active Learning Center 1018,Lecture,3:30 pm - 4:20 pm,Class,AAE 20000,001,Undergraduate Sophomore Seminar
1,[Lower Division],3.000,67031,"[Graduate, Professional, Undergraduate]",TR,[Shaoshuai Mou],Physics Building 114,Lecture,4:30 pm - 5:45 pm,Class,AAE 20300,002,Aeromechanics I
2,[Lower Division],3.000,13363,"[Graduate, Professional, Undergraduate]",MWF,[Smriti Nandan Paul],Wilmeth Active Learning Center 1055,Lecture,7:30 am - 8:20 am,Class,AAE 20300,003,Aeromechanics I
3,[Lower Division],3.000,10002,"[Graduate, Professional, Undergraduate]",MWF,[Ritwik Bandyopadhyay],Wetherill Lab of Chemistry 172,Lecture,8:30 am - 9:20 am,Class,AAE 20400,001,Aeromechanics II
4,[Lower Division],1.000,10006,"[Graduate, Professional, Undergraduate]",W,"[Ricardo Jose Gomez, Waterloo Tsutsui]",Neil Armstrong Hall of Engr 3106,Laboratory,11:30 am - 1:20 pm,Class,AAE 20401,001,Aeromechanics II Laboratory
5,[Lower Division],1.000,10004,"[Graduate, Professional, Undergraduate]",R,"[Eli Vincent Sitchin, Waterloo Tsutsui]",Neil Armstrong Hall of Engr 3106,Laboratory,3:30 pm - 5:20 pm,Class,AAE 20401,002,Aeromechanics II Laboratory
6,[Lower Division],1.000,10003,"[Graduate, Professional, Undergraduate]",T,"[Ricardo Jose Gomez, Waterloo Tsutsui]",Neil Armstrong Hall of Engr 3106,Laboratory,7:30 am - 9:20 am,Class,AAE 20401,003,Aeromechanics II Laboratory
7,[Lower Division],1.000,10005,"[Graduate, Professional, Undergraduate]",T,"[Federico Rios Tascon, Waterloo Tsutsui]",Neil Armstrong Hall of Engr 3106,Laboratory,1:30 pm - 3:20 pm,Class,AAE 20401,004,Aeromechanics II Laboratory
8,[Lower Division],1.000,14924,"[Graduate, Professional, Undergraduate]",R,"[Ishan T Karnik, Waterloo Tsutsui]",Neil Armstrong Hall of Engr 3106,Laboratory,1:30 pm - 3:20 pm,Class,AAE 20401,006,Aeromechanics II Laboratory
9,[Lower Division],0.000,14343,"[Graduate, Professional, Undergraduate]",M,[Waterloo Tsutsui],Neil Armstrong Hall of Engr 1010,Laboratory Preparation,11:30 am - 12:20 pm,Class,AAE 20401,007,Aeromechanics II Laboratory
