# Create the university network from student and lecturer data

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import datetime
import network_creation_functions as ncf
from importlib import reload

In [2]:
src = '../data/raw/Grunddaten-Simulation'

In [14]:
! ls ../data/raw/Grunddaten-Simulation

 Bedienstete_mit_DV_an_Org.csv	 Prüfungstermine.csv
 Lehrende.csv			 Räume.csv
 LV.csv				 Studiendaten.csv
 LV.xlsx			 Studierende_pro_LV.csv
 Prüfungen.csv			'Termine_mit_LV Bezug.csv'


In [44]:
semester_start = ncf.semester_start
semester_end = ncf.semester_end

## Load the relevant data

In [114]:
studies = pd.read_csv(join(src, 'Studiendaten.csv'))
studies = studies.rename(columns={
    'ST_PERSON_NR':'student_id',
    'STUDIENIDENTIFIKATOR':'study_id',
    'STUDIENBEZEICHNUNG':'study_name',
    'SEMESTERANZAHL':'term_number'
})


students = pd.read_csv(join(src, 'Studierende_pro_LV.csv'))
students = students.rename(columns={
    'ST_PERSON_NR':'student_id',
    'STP_SP_NR':'lecture_id',
    'LV_GRP_NR':'group_id'})
N = len(students['student_id'].unique())
students = students[students['student_id'].isin(studies['student_id'])]
print('there were {} unique sutdents in WiSe 2019/20'\
      .format(N))
print("dropped {} students because they don't have an assigned study"\
      .format(N - len(students['student_id'].unique())))


# all students should have a study and all students in the "study" data set 
# should exist in the "students" data set
assert len(set(students['student_id'].unique())\
           .difference(set(studies['student_id'].unique()))) == 0

lectures = pd.read_excel(join(src, 'LV.xlsx')) # throws a parser error if read as csv
lectures = lectures.rename(columns={
    'STP_SP_NR':'lecture_id',
    'STP_SP_TITEL':'lecture_name_ger',
    'STP_SP_TITEL_ENGL':'lecture_name'
})
N = len(lectures)
lectures = lectures[lectures['lecture_id'].isin(students['lecture_id'])]
print('{}/{} available lectures were visited by the students'\
      .format(len(lectures), N))

lecturers = pd.read_csv(join(src, 'Lehrende.csv'))
lecturers = lecturers.rename(columns={
    'PERSON_NR':'lecturer_id',
    'STP_SP_NR':'lecture_id',
    'LV_GRP_NR':'group_id'
})
N = len(lecturers)
lecturers = lecturers[lecturers['lecture_id'].isin(students['lecture_id'])]
print('{}/{} lecturers are active in courses that are visited by students'\
      .format(len(lecturers), N))


organisations = pd.read_csv(join(src, 'Bedienstete_mit_DV_an_Org.csv'))
organisations = organisations.rename(columns={
    'PERSON_NR':'lecturer_id',
    'ORG_NR':'organisation_id',
    'TUG_NEW.PUORG.GETNAME(A.ORG_NR)':'organisation_name'
})
# some lecturers have more than one affiliation. We keep a random affiliation
# and drop the others to keep things simple
#organisations = organisations.sample(frac=1, random_state=42)
#organisations = organisations.drop_duplicates(subset=['lecturer_id'])
print('lecturers are from {} organisations'\
      .format(len(organisations['organisation_id'].unique())))


# mapping of lecture group IDs to lecture IDs
groups = students[['lecture_id', 'group_id']].drop_duplicates().copy()
print('there are {} unique groups for the {} lectures'\
      .format(len(groups['group_id'].unique()), 
              len(groups['lecture_id'].unique())))

assert len(groups) == len(groups['group_id'].unique())


semester_start = '2019-10-01'
semester_end = '2020-02-28'
dates = pd.read_csv(join(src, 'Termine_mit_LV Bezug.csv'),
                parse_dates=['DATUM_AM', 'ZEIT_VON', 'ZEIT_BIS'], dayfirst=True)
dates = dates.rename(columns={
    'RES_NR':'room_id',
    'DATUM_AM':'date',
    'ZEIT_VON':'start_time',
    'ZEIT_BIS':'end_time',
    'STP_SP_NR':'lecture_id',
    'LV_GRP_NR':'group_id'
})
dates['start_time'] = dates['start_time'].apply(lambda x: x.time())
dates['end_time'] = dates['end_time'].apply(lambda x: x.time())

assert len(dates) == len(dates[dates['date'] >= pd.to_datetime(semester_start)])
assert len(dates) == len(dates[dates['date'] <= pd.to_datetime(semester_end)])

N = len(dates)
dates = dates[dates['lecture_id'].isin(lectures['lecture_id'])]
print('{}/{} dates are associated with lectures that are visited by the students'\
      .format(len(dates), N))

# note: manually filled in room information from rooms at KFU
rooms = pd.read_csv(join('../data/cleaned', 'Räume_cleaned.csv'), 
                    usecols=['RES_NR', 'RAUM_CODE', 'RAUM_SITZPLAETZE',
                            'QUADRATMETER', 'RAUM_GEBAEUDE_BEREICH_NAME'])
rooms = rooms.rename(columns={
    'RES_NR':'room_id',
    'RAUM_CODE':'room_code',
    'RAUM_SITZPLAETZE':'seats',
    'QUADRATMETER':'area',
    'RAUM_GEBAEUDE_BEREICH_NAME':'campus'
    
})
N = len(rooms)
rooms = rooms[rooms['room_id'].isin(dates['room_id'])]
print('{}/{} rooms are associated with lectures that are visited by the students'\
     .format(len(rooms), N))

there were 12711 unique sutdents in WiSe 2019/20
dropped 92 students because they don't have an assigned study
1801/6334 available lectures were visited by the students
10823/13212 lecturers are active in courses that are visited by students
lecturers are from 228 organisations
there are 2702 unique groups for the 1801 lectures
26010/29147 dates are associated with lectures that are visited by the students
381/2288 rooms are associated with lectures that are visited by the students


In [9]:
organisations.head(2)

Unnamed: 0,lecturer_id,organisation_id,organisation_name
0,12973,2913,Institut f�r Health Care Engineering mit Europ...
1,234,677,Institut f�r St�dtebau


In [10]:
students.head(3)

Unnamed: 0,student_id,lecture_id,group_id
0,183662,223024,255170
1,172878,226073,254793
2,193960,226598,260636


In [11]:
lectures.head(3)

Unnamed: 0,lecture_id,STP_SP_LVNR,SJ_NAME,SEMESTER_KB,lecture_name_ger,lecture_name,STP_SP_SST,STP_LV_ART_KURZ,STP_LV_ART_NAME,BETREUENDE_ORG_NR,BETREUENDE_ORG_NAME,Unnamed: 11
0,223321,710.095,2019/20,W,Bildverstehen,Image Understanding,1,KU,Konstruktions�bung,2376,Institut f�r Maschinelles Sehen und Darstellen,
1,223319,710.085,2019/20,W,AK Computer Vision,Selected Topics Computer Vision,1,KU,Konstruktions�bung,2376,Institut f�r Maschinelles Sehen und Darstellen,
2,225639,373.381,2019/20,W,Buchhaltung und Bilanzierung,Accounting and Balancing,1,UE,�bung,11072,Institut f�r Betriebswirtschaftslehre und Betr...,


In [12]:
lecturers.head(3)

Unnamed: 0,lecturer_id,lecture_id,group_id
1,23030,221435,258144
2,71025,228594,263699
3,109917,225445,257591


In [13]:
studies.head(3)

Unnamed: 0,student_id,study_id,study_name,term_number
0,163460,UB 198 407 417 01,Bachelorstudium Lehramt Sek (AB); UF Englisch;...,7
1,208510,UB 198 410 411 01,Bachelorstudium Lehramt Sek (AB); UF Geographi...,1
2,187361,UB 198 420 414 01,Bachelorstudium Lehramt Sek (AB); UF Mathemati...,6


In [14]:
groups.head(3)

Unnamed: 0,lecture_id,group_id
0,223024,255170
1,226073,254793
2,226598,260636


In [24]:
dates.head(3)

Unnamed: 0,room_id,date,start_time,end_time,lecture_id,group_id
0,27716,2019-10-25,10:15:00,11:00:00,227979,269598
1,27716,2020-01-31,10:15:00,11:00:00,227979,269598
4,1406,2019-12-17,09:00:00,12:00:00,228962,264420


In [116]:
rooms.head(3)

Unnamed: 0,room_id,room_code,seats,area,campus
10,5543,HFEG109,92.0,9,Inffeldgasse
12,1142,NA01158F,49.0,23,Alte Technik
21,485,PH02112,47.0,60,Neue Technik


## Create a sample network from a subset of students

In [50]:
# Bachelorstudium Molekularbiologie
sample_study_id = 'UF 033 665'

# Bachelorstudium Maschinenbau
sample_study_id = 'UF 033 245'

# Bachelorstudium Informatik
sample_study_id = 'UF 033 521'

In [74]:
studies.head(2)

Unnamed: 0,student_id,study_id,study_name,term_number
0,163460,UB 198 407 417 01,Bachelorstudium Lehramt Sek (AB); UF Englisch;...,7
1,208510,UB 198 410 411 01,Bachelorstudium Lehramt Sek (AB); UF Geographi...,1


In [91]:
sample_day = '2019-10-28' # a monday early in the semester
sample_dates = dates[dates['date'] == pd.to_datetime(sample_day)]

sample_student_ids, sample_students, sample_lecture_ids, sample_group_ids, \
    sample_lecturers, sample_lecturer_ids = \
    ncf.get_study_data(sample_study_id, studies, students, lecturers, groups)
print()

# Note: these are the lectures and groups that students of the sample study
# participated in. They are not necessarily part of the sample study plan

# lectures of the sample study that took place on the sample day
print('{} lectures took place on {}'\
      .format(len(sample_dates['lecture_id'].unique()), sample_day))

sample_lectures_on_sample_day_ids = set(sample_lecture_ids)\
    .intersection(set(sample_dates['lecture_id']))
print('{} of the lectures on {} were from the sample study'\
      .format(len(sample_lectures_on_sample_day_ids), sample_day))
print()

# students of the sample study that attended lectures on the sample day
students_at_uni_on_sample_day_ids = \
    sample_students[sample_students['lecture_id']\
    .isin(sample_lectures_on_sample_day_ids)]['student_id']
print('{} of the sample students attended lectures on {}'\
     .format(len(students_at_uni_on_sample_day_ids), sample_day))
print()


# groups of the sample study that took place on the sample day
print('{} groups took place on {}'\
      .format(len(sample_dates['group_id'].unique()), sample_day))

sample_groups_on_sample_day_ids = set(sample_group_ids)\
    .intersection(set(sample_dates['group_id']))
print('{} of the groups on {} were from the sample study'\
      .format(len(sample_groups_on_sample_day_ids), sample_day))
print()

# lecturers that taught groups of the sample study that took place on the
# sample day
print('{} lecturers were teaching on {}'\
      .format(len(lecturers[lecturers['group_id']\
        .isin(sample_dates['group_id'])]['lecturer_id'].unique()), sample_day))

lecturers_lecturing_on_sample_day_ids = \
    sample_lecturers[sample_lecturers['group_id']\
    .isin(sample_groups_on_sample_day_ids)]['lecturer_id'].unique()
print('{} of the lecturers were teaching groups visited by the sample students on {}'\
     .format(len(lecturers_lecturing_on_sample_day_ids), sample_day))


data for study UF 033 521 (Bachelorstudium; Informatik)
	the study has 904/12619 students
	the students participate in 472/1801 available lectures
	the lectures have 912 groups and the sample students participate in 637 of them
	the groups are taught by 652/1520 of the available lecturers

298 lectures took place on 2019-10-28
100 of the lectures on 2019-10-28 were from the sample study

1722 of the sample students attended lectures on 2019-10-28

358 groups took place on 2019-10-28
113 of the groups on 2019-10-28 were from the sample study

467 lecturers were teaching on 2019-10-28
195 of the lecturers were teaching groups visited by the sample students on 2019-10-28


In [55]:
reload(ncf)

plot_dst = '../plots/sample_networks'
network_dst = '../data/networks/sample_networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[0:7]


contact_map = {
    'student_lecturer_group':'far', 
    'student_student_group':'far',
}

for sample_day in sample_days: 
    G = ncf.create_single_day_network(sample_students, sample_lecturers, studies,
                                  organisations, dates, sample_day)
    ncf.draw_uni_network(G, sample_students, sample_lecturers, sample_day, 
                         sample_study_id, plot_dst)
    
G = ncf.create_network(sample_students, sample_lecturers, studies, organisations, 
                              dates, sample_days)

G.remove_nodes_from(list(nx.isolates(G)))
ncf.map_contacts(G, contact_map)
nx.readwrite.gpickle.write_gpickle(G, join(network_dst,'{}_{}_to_{}.bz2'\
        .format(sample_study_id.replace(' ', '-'),
                str(sample_days[0].date()), 
                str(sample_days[-1].date()))), protocol=4)

Tuesday
Wednesday
Thursday
Friday
Saturday
Sunday
Monday


<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

## Create networks for all studies individually

In [67]:
students_per_study = studies[['study_name', 'study_id', 'student_id']]\
    .groupby(['study_name', 'study_id'])\
    .count()\
    .sort_values(by=['student_id'], ascending=False)\
    .reset_index()\
    .rename(columns={'student_id':'student_count'})

students_per_study = students_per_study[students_per_study['student_count'] >= 20]
students_per_study.head(10)

Unnamed: 0,study_name,study_id,student_count
0,Bachelorstudium; Informatik,UF 033 521,1023
1,Bachelorstudium; Maschinenbau,UF 033 245,967
2,Bachelorstudium; Biomedical Engineering,UF 033 253,862
3,Bachelorstudium; Molekularbiologie,UB 033 665,854
4,Bachelorstudium; Bauingenieurwiss.u.Wirtschaft...,UF 033 264,849
5,Bachelorstudium; Architektur,UF 033 243,808
6,Bachelorstudium; Elektrotechnik,UF 033 235,766
7,Bachelorstudium; Wirtschaftsingenieurwesen-Mas...,UF 033 282,738
8,Bachelorstudium; Physik,UF 033 678,661
9,Bachelorstudium; Information and Computer Engi...,UF 033 211,653


In [55]:
reload(ncf)

plot_dst = '../plots/sample_networks'
network_dst = '../data/networks/sample_networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[0:7]

contact_map = {
    'student_lecturer_group':'far', 
    'student_student_group':'far',
}

for study in students_per_study['study_id']:
    for sample_day in sample_days: 
        G = ncf.create_single_day_network(sample_students, sample_lecturers, studies,
                                      organisations, dates, sample_day)
        ncf.draw_uni_network(G, sample_students, sample_lecturers, sample_day, 
                             sample_study_id, plot_dst)

    G = ncf.create_network(sample_students, sample_lecturers, studies, organisations, 
                                  dates, sample_days)

    G.remove_nodes_from(list(nx.isolates(G)))
    ncf.map_contacts(G, contact_map)
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,'{}_{}_to_{}.bz2'\
            .format(sample_study_id.replace(' ', '-'),
                    str(sample_days[0].date()), 
                    str(sample_days[-1].date()))), protocol=4)

Tuesday
Wednesday
Thursday
Friday
Saturday
Sunday
Monday


<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

## Create full university network


In [117]:
reload(ncf)

plot_dst = '../plots/sample_networks'
network_dst = '../data/networks/sample_networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[0:7]


contact_map = {
    'student_lecturer_group':'far', 
    'student_student_group':'far',
}

# create the full contact network for the whole university
G = ncf.create_network(students, lecturers, studies, organisations, 
                              dates, sample_days)

# remove disconnected
N = len(G.nodes())
G.remove_nodes_from(list(nx.isolates(G)))
print('removed {} disconnected nodes'.format(N - len(G.nodes())))

# map link types to contact strengths
ncf.map_contacts(G, contact_map)

# save the graph
nx.readwrite.gpickle.write_gpickle(G, join(network_dst,'university_{}_to_{}.bz2'\
        .format(str(sample_days[0].date()), 
                str(sample_days[-1].date()))), protocol=4)

removed 1785 disconnected nodes


### Network statistics

In [124]:
N_students = len([n for n in G.nodes(data='type') if n[1] == 'unistudent'])
N_lecturers = len([n for n in G.nodes(data='type') if n[1] == 'lecturer'])

In [126]:
N_students

11494

In [127]:
N_lecturers

839

In [128]:
N_students / (N_students + N_lecturers)

0.9319711343549826

In [129]:
N_lecturers / (N_students + N_lecturers)

0.06802886564501744

## Create university networks with reduced lecture sizes

In [144]:
reload(ncf)
fractions = [0.25, 0.5, 1.0]
network_dst = '../data/networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[0:7]

contact_map = {
    'student_lecturer_group':'far', 
    'student_student_group':'far',
}

for frac in fractions:

    # create the full contact network for the whole university with a given
    # fraction of studence in presence
    G = ncf.create_network(students, lecturers, studies, organisations, 
                                  dates, sample_days, half=True, frac=frac)

    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'.format(N - len(G.nodes())))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)

removed 1785 disconnected nodes
removed 1785 disconnected nodes
removed 1785 disconnected nodes


### Network statistics

In [133]:
for frac in fractions:
    G = nx.readwrite.gpickle.read_gpickle(join(network_dst, 
        'university_{}_to_{}_fraction-{}.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)))
    
    N_students = len([n for n in G.nodes(data='type') if n[1] == 'unistudent'])
    N_lecturers = len([n for n in G.nodes(data='type') if n[1] == 'lecturer'])
    
    print('lecturers: {}, students: {}'.format(N_lecturers, N_students))

lecturers: 829, students: 1085
lecturers: 829, students: 1079
lecturers: 830, students: 1079


In [128]:
N_students / (N_students + N_lecturers)

0.9319711343549826

In [129]:
N_lecturers / (N_students + N_lecturers)

0.06802886564501744