# Create the university network from student and lecturer data

In [31]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import os
import datetime
import string
import network_creation_functions as ncf
from importlib import reload


# parallelisation functionality
from multiprocess import Pool
import psutil
from tqdm import tqdm

In [32]:
# relative path of the directory holding the cleaned CSV tables read below
src = '../../data/clean'

In [33]:
# constants defined in network_creation_functions, bound to short local names
semester_start = ncf.semester_start # date of the semester start
semester_end = ncf.semester_end # date of the semester end
study_map = ncf.study_map # mapping of studies to degree levels

## Load the relevant data

In [34]:
# plain tables: one CSV file each, read without any special handling
courses = pd.read_csv(join(src, 'courses.csv'))
enrollment = pd.read_csv(join(src, 'enrollment.csv'))
groups = pd.read_csv(join(src, 'groups.csv'))
lecturers = pd.read_csv(join(src, 'lecturers.csv'))
rooms = pd.read_csv(join(src, 'rooms.csv'))
students = pd.read_csv(join(src, 'students.csv'))
supervision = pd.read_csv(join(src, 'supervision.csv'))

# event tables: the "date" column is parsed into datetimes while loading
event_dates = pd.read_csv(join(src, 'event_dates.csv'), parse_dates=["date"])

# participation and supervision records: additionally drop exact duplicate
# rows right after loading
event_participation = pd.read_csv(
    join(src, 'event_participation.csv'), parse_dates=["date"]
).drop_duplicates()
event_supervision = pd.read_csv(
    join(src, 'event_supervision.csv'), parse_dates=["date"]
).drop_duplicates()

In [35]:
# map hashed student and lecturer IDs to shorter IDs to save space
student_keys = students["student_id"].unique()
student_keys = {hashed_id: f"s{i}" for i, hashed_id in enumerate(student_keys)}

lecturer_keys = lecturers["lecturer_id"].unique()
lecturer_keys = {hashed_id: f"s{i}" for i, hashed_id in enumerate(lecturer_keys)}

enrollment["student_id"] = enrollment["student_id"].map(student_keys)
students["student_id"] = students["student_id"].map(student_keys)
event_participation["student_id"] = event_participation["student_id"].map(student_keys)
supervision["lecturer_id"] = supervision["lecturer_id"].map(lecturer_keys)
lecturers["lecturer_id"] = lecturers["lecturer_id"].map(lecturer_keys)
event_supervision["lecturer_id"] = event_supervision["lecturer_id"].map(lecturer_keys)

## Create university networks with reduced lecture sizes

### All students

#### Lecture and exam network

In [37]:
# re-import the helper module so that edits to it are picked up
reload(ncf)
fractions = ['overbooked', 1.0, 0.5, 0.25]
network_dst = '../../data/networks'
# every distinct event date as a pandas timestamp, in ascending order
all_days = sorted(pd.to_datetime(sd) for sd in event_dates['date'].unique())
verbose = False

contact_map = {
    'student_lecturer': 'far',
    'student_student': 'far',
    'lecturer_lecturer': 'far',
}

for frac in fractions:

    # create the full contact network for the whole university with a given
    # fraction of students in presence
    G = ncf.create_network(
        students, lecturers, event_dates, rooms, all_days,
        event_participation, event_supervision,
        frac=frac, verbose=verbose,
    )

    # isolated nodes and small components are only counted here, they are
    # deliberately left in the saved network
    N = sum(1 for _node in nx.isolates(G))
    print(f"{N} isolated nodes")

    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    N = len(Gcc)
    print(f"{N} network components")

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map, N_days=128)

    # save the graph; the file name encodes the covered period and fraction
    first_day = str(all_days[0].date())
    last_day = str(all_days[-1].date())
    fname = 'university_{}_to_{}_fraction-{}_all.bz2'\
        .format(first_day, last_day, frac)
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst, fname),
        protocol=4)

17 isolated nodes
19 network components
29 isolated nodes
31 network components
141 isolated nodes
143 network components
270 isolated nodes
272 network components


In [46]:
# display the node labels of the graph currently bound to G
G.nodes()

NodeView(('s2187', 's5595', 's10333', 's333', 's5730', 's6614', 's3974', 's2027', 's3469', 's4810', 's4678', 's5922', 's3904', 's12591', 's7758', 's9874', 's10272', 's11491', 's4007', 's2976', 's5057', 's6037', 's9592', 's4433', 's11598', 's9742', 's11159', 's1825', 's2101', 's513', 's9672', 's2954', 's12313', 's3275', 's4048', 's8730', 's3552', 's9224', 's4655', 's7404', 's1314', 's8659', 's9492', 's9684', 's11646', 's13029', 's4589', 's4076', 's2830', 's8724', 's10544', 's7036', 's7942', 's9119', 's5292', 's2233', 's10456', 's5876', 's9629', 's10067', 's9871', 's9949', 's10760', 's2668', 's3158', 's10676', 's6887', 's5023', 's3396', 's5816', 's11564', 's9214', 's873', 's4848', 's9131', 's2711', 's12753', 's11234', 's7408', 's5296', 's8008', 's6195', 's1460', 's4154', 's13246', 's8293', 's4785', 's6314', 's2956', 's4401', 's12607', 's9541', 's7050', 's8622', 's6940', 's8501', 's116', 's4037', 's1791', 's2990', 's4727', 's9670', 's11225', 's7806', 's11474', 's1473', 's2540', 's11101', 

In [185]:
# one independent sub-network per day for the days numbered 1 to 3: select
# the keyed edges whose 'day' attribute matches and copy the edge-induced
# subgraph so it no longer is a view on G
all_edges = G.edges(keys=True, data='day')
day_connections = {
    day_index: G.edge_subgraph(
        [(u, v, key) for u, v, key, day in all_edges if day == day_index]
    ).copy()
    for day_index in range(1, 4)
}

In [186]:
# attribute dictionaries of all edges in the day-1 sub-network
areas = [attrs for _u, _v, attrs in day_connections[1].edges(data=True)]

In [187]:
# show the attributes of the first edge as an example
areas[0]

{'link_type': 'student_student',
 'event_type': 'VO',
 'date': '2019-10-02 00:00:00',
 'day': 1,
 'weekday': 4,
 'duration': 105,
 'contact_type': 'far'}

#### Only exams and labs network

In [None]:
# NOTE(review): this cell uses table names (dates, studies, organisations,
# estudents, electurers, exams, edates) that are not defined in the part of
# the notebook visible here, and a create_network() call that differs from
# the one in the "Lecture and exam network" cell for all students -- confirm
# that it matches the current data and helper module before running it.
reload(ncf)
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

contact_map = {
    'student_lecturer':'far', 
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the full contact network for the whole university with a given
    # fraction of students in presence
    G = ncf.create_network(students, lecturers, studies, organisations, groups,
                           dates, rooms, sample_days, estudents, electurers,
                           exams, edates, frac=frac)
    
    # remove all edges that are not associated with exams or with lecture types
    # that require presence
    # see https://mibla-archiv.tugraz.at/08_09/Stk_5/06102008_LV_Typen.pdf
    presence_lecture_types = [
        "LU", # Laboruebung (laboratory exercise)
        "KU", # Konstruktionsuebung (design exercise)
        "EX", # Exkursion (field trip)
    ]
    # Address every edge by its key: the network is edge-keyed (see the
    # keys=True access elsewhere in this notebook), and passing plain (u, v)
    # pairs to remove_edges_from() would drop one arbitrary parallel edge
    # between u and v instead of the edge that actually matched the filter.
    edges_to_remove = [
        (u, v, key)
        for u, v, key, attrs in G.edges(keys=True, data=True)
        if attrs["event_type"] != "exam"
        and attrs["lecture_type"] not in presence_lecture_types
    ]
    G.remove_edges_from(edges_to_remove)
    
    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print(f'removed {N - len(G.nodes())} disconnected nodes')
    
    # only report the number of components, all of them are kept
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    print(f'The graph now has {len(Gcc)} components')

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_all_exams.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)

### Separate data into TU and NaWi

In [126]:
# split the student table by study label: 't' rows go to the TU subset,
# 'n' rows to the NaWi subset
study_label = students['study_label']
TU_students = students.loc[study_label == 't']
NaWi_students = students.loc[study_label == 'n']

# report the number of distinct student IDs in each subset
n_TU = len(TU_students['student_id'].unique())
n_NaWi = len(NaWi_students['student_id'].unique())
print('there are {} TU students'.format(n_TU))
print('there are {} NaWi students'.format(n_NaWi))

there are 8660 TU students
there are 3298 NaWi students


In [127]:
# same split by study label for the exam-student table
exam_study_label = estudents['study_label']
TU_estudents = estudents.loc[exam_study_label == 't']
NaWi_estudents = estudents.loc[exam_study_label == 'n']

# report the number of distinct student IDs in each subset
n_TU = len(TU_estudents['student_id'].unique())
n_NaWi = len(NaWi_estudents['student_id'].unique())
print('there are {} TU exam students'.format(n_TU))
print('there are {} NaWi exam students'.format(n_NaWi))

there are 8035 TU exam students
there are 3030 NaWi exam students


In [128]:
# keep the study rows whose student appears in the respective student subset
in_TU = studies['student_id'].isin(TU_students['student_id'])
in_NaWi = studies['student_id'].isin(NaWi_students['student_id'])
TU_studies = studies.loc[in_TU]
NaWi_studies = studies.loc[in_NaWi]

# report the number of distinct study IDs; the two filters are independent,
# so a study ID may be counted in both subsets
n_TU = len(TU_studies['study_id'].unique())
n_NaWi = len(NaWi_studies['study_id'].unique())
print('there are {} TU-only studies'.format(n_TU))
print('there are {} NaWi-only studies'.format(n_NaWi))

there are 91 TU-only studies
there are 82 NaWi-only studies


In [129]:
# keep the lectures that occur in the respective student subset
in_TU = lectures['lecture_id'].isin(TU_students['lecture_id'])
in_NaWi = lectures['lecture_id'].isin(NaWi_students['lecture_id'])
TU_lectures = lectures.loc[in_TU]
NaWi_lectures = lectures.loc[in_NaWi]

# report the number of distinct lecture IDs; the two filters are independent,
# so a lecture may be counted in both subsets
n_TU = len(TU_lectures['lecture_id'].unique())
n_NaWi = len(NaWi_lectures['lecture_id'].unique())
print('there are {} TU-only lectures'.format(n_TU))
print('there are {} NaWi-only lectures'.format(n_NaWi))

there are 1453 TU-only lectures
there are 790 NaWi-only lectures


In [130]:
# keep the lecturer rows whose lecture is part of the respective lecture subset
in_TU = lecturers['lecture_id'].isin(TU_lectures['lecture_id'])
in_NaWi = lecturers['lecture_id'].isin(NaWi_lectures['lecture_id'])
TU_lecturers = lecturers.loc[in_TU]
NaWi_lecturers = lecturers.loc[in_NaWi]

# report the number of distinct lecturer IDs; the two filters are
# independent, so a lecturer may be counted in both subsets
n_TU = len(TU_lecturers['lecturer_id'].unique())
n_NaWi = len(NaWi_lecturers['lecturer_id'].unique())
print('there are {} TU-only lecturers'.format(n_TU))
print('there are {} NaWi-only lecturers'.format(n_NaWi))

there are 1376 TU-only lecturers
there are 901 NaWi-only lecturers


In [131]:
# keep the organisation rows whose lecturer is part of the respective
# lecturer subset
in_TU = organisations['lecturer_id'].isin(TU_lecturers['lecturer_id'])
in_NaWi = organisations['lecturer_id'].isin(NaWi_lecturers['lecturer_id'])
TU_organisations = organisations.loc[in_TU]
NaWi_organisations = organisations.loc[in_NaWi]

# report the number of distinct organisation IDs; the two filters are
# independent, so an organisation may be counted in both subsets
n_TU = len(TU_organisations['organisation_id'].unique())
n_NaWi = len(NaWi_organisations['organisation_id'].unique())
print('there are {} TU-only organisations'.format(n_TU))
print('there are {} NaWi-only organisations'.format(n_NaWi))

there are 121 TU-only organisations
there are 102 NaWi-only organisations


In [132]:
# keep the lecture dates that belong to a lecture of the respective subset
in_TU = dates['lecture_id'].isin(TU_lectures['lecture_id'])
in_NaWi = dates['lecture_id'].isin(NaWi_lectures['lecture_id'])
TU_dates = dates.loc[in_TU]
NaWi_dates = dates.loc[in_NaWi]

# report the number of rows (not of distinct IDs) in each subset
n_TU = len(TU_dates)
n_NaWi = len(NaWi_dates)
print('there are {} TU-only dates'.format(n_TU))
print('there are {} NaWi-only dates'.format(n_NaWi))

there are 22436 TU-only dates
there are 14035 NaWi-only dates


In [133]:
# keep the exam dates whose exam occurs in the respective exam-student subset
in_TU = edates['exam_id'].isin(TU_estudents['exam_id'])
in_NaWi = edates['exam_id'].isin(NaWi_estudents['exam_id'])
TU_edates = edates.loc[in_TU]
NaWi_edates = edates.loc[in_NaWi]

# report the number of rows (not of distinct IDs) in each subset
n_TU = len(TU_edates)
n_NaWi = len(NaWi_edates)
print('there are {} TU-only exam dates'.format(n_TU))
print('there are {} NaWi-only exam dates'.format(n_NaWi))

there are 4308 TU-only exam dates
there are 1686 NaWi-only exam dates


In [134]:
# keep the rooms that are referenced by a lecture date of the respective subset
in_TU = rooms['room_id'].isin(TU_dates['room_id'])
in_NaWi = rooms['room_id'].isin(NaWi_dates['room_id'])
TU_rooms = rooms.loc[in_TU]
NaWi_rooms = rooms.loc[in_NaWi]

# report the number of rows in each subset; the two filters are independent,
# so a room may be counted in both subsets
n_TU = len(TU_rooms)
n_NaWi = len(NaWi_rooms)
print('there are {} TU-only rooms'.format(n_TU))
print('there are {} NaWi-only rooms'.format(n_NaWi))

there are 352 TU-only rooms
there are 239 NaWi-only rooms


In [135]:
# keep the groups that occur in the respective student subset
in_TU = groups['group_id'].isin(TU_students['group_id'])
in_NaWi = groups['group_id'].isin(NaWi_students['group_id'])
TU_groups = groups.loc[in_TU]
NaWi_groups = groups.loc[in_NaWi]

# report the number of distinct group IDs in each subset
n_TU = len(TU_groups['group_id'].unique())
n_NaWi = len(NaWi_groups['group_id'].unique())
print('there are {} TU-only groups'.format(n_TU))
print('there are {} NaWi-only groups'.format(n_NaWi))

there are 1850 TU-only groups
there are 866 NaWi-only groups


In [146]:
# one row per distinct (exam_id, lecture_id) pair found in the exam dates
TU_exams = TU_edates[['exam_id', 'lecture_id']].drop_duplicates()
NaWi_exams = NaWi_edates[['exam_id', 'lecture_id']].drop_duplicates()

print('there are {} TU-only exams'.format(len(TU_exams)))
print('there are {} NaWi-only exams'.format(len(NaWi_exams)))

# Attach the lecture type to every exam. DataFrame.join() aligns on the
# index, so all three tables are temporarily indexed by lecture_id; the
# statement order matters because `lectures` is re-indexed in place.
lectures = lectures.set_index("lecture_id")
TU_exams = TU_exams.set_index("lecture_id")
NaWi_exams = NaWi_exams.set_index("lecture_id")
TU_exams = TU_exams.join(lectures["lecture_type"]).reset_index()
NaWi_exams = NaWi_exams.join(lectures["lecture_type"]).reset_index()
# turn lecture_id back into a regular column of the lectures table
lectures = lectures.reset_index()

there are 4233 TU-only exams
there are 1673 NaWi-only exams


In [147]:
# exam -> lecturer assignment, taken as an independent copy of the two
# relevant columns of the exam date tables
exam_lecturer_columns = ['exam_id', 'lecturer_id']
TU_electurers = TU_edates.loc[:, exam_lecturer_columns].copy()
NaWi_electurers = NaWi_edates.loc[:, exam_lecturer_columns].copy()

# report the number of distinct lecturer IDs in each subset
n_TU = len(TU_electurers['lecturer_id'].unique())
n_NaWi = len(NaWi_electurers['lecturer_id'].unique())
print('there are {} TU-only exam lecturers'.format(n_TU))
print('there are {} NaWi-only exam lecturers'.format(n_NaWi))

there are 789 TU-only exam lecturers
there are 366 NaWi-only exam lecturers


### Create TU student only networks

#### Lecture and exam network

In [None]:
# re-import the helper module so that edits to it are picked up
reload(ncf)
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'
# every distinct lecture date as a pandas timestamp, in ascending order
all_days = sorted(pd.to_datetime(sd) for sd in dates['date'].unique())
# the networks are only built for this slice of seven consecutive entries
sample_days = all_days[13:20]

contact_map = {
    'student_lecturer': 'far',
    'student_student': 'far',
    'lecturer_lecturer': 'far',
}

for frac in fractions:

    # create the contact network from the TU subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(
        TU_students, TU_lecturers, TU_studies,
        TU_organisations, TU_groups, TU_dates,
        TU_rooms, sample_days, TU_estudents,
        TU_electurers, TU_exams, TU_edates,
        frac=frac,
    )

    # drop nodes without any edge
    n_before = G.number_of_nodes()
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'\
          .format(n_before - G.number_of_nodes()))

    # keep only biggest connected component
    n_before = G.number_of_nodes()
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    G = G.subgraph(Gcc[0])
    print('removed {} nodes which were not in the biggest component'\
          .format(n_before - G.number_of_nodes()))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph; the file name encodes the covered period and fraction
    first_day = str(sample_days[0].date())
    last_day = str(sample_days[-1].date())
    fname = 'university_{}_to_{}_fraction-{}_TU.bz2'\
        .format(first_day, last_day, frac)
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst, fname),
        protocol=4)

#### Only exams and labs network

In [None]:
# Build the TU-only network restricted to exams and to lecture types that
# require presence, once per presence fraction.
reload(ncf)
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

contact_map = {
    'student_lecturer':'far', 
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the contact network from the TU subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(TU_students, TU_lecturers, TU_studies, 
                           TU_organisations, TU_groups, TU_dates, 
                           TU_rooms, sample_days, TU_estudents, 
                           TU_electurers, TU_exams, TU_edates,
                           frac=frac)
    
    # remove all edges that are not associated with exams or with lecture types
    # that require presence
    # see https://mibla-archiv.tugraz.at/08_09/Stk_5/06102008_LV_Typen.pdf
    presence_lecture_types = [
        "LU", # Laboruebung (laboratory exercise)
        "KU", # Konstruktionsuebung (design exercise)
        "EX", # Exkursion (field trip)
    ]
    # Address every edge by its key: the network is edge-keyed (see the
    # keys=True access elsewhere in this notebook), and passing plain (u, v)
    # pairs to remove_edges_from() would drop one arbitrary parallel edge
    # between u and v instead of the edge that actually matched the filter.
    edges_to_remove = [
        (u, v, key)
        for u, v, key, attrs in G.edges(keys=True, data=True)
        if attrs["event_type"] != "exam"
        and attrs["lecture_type"] not in presence_lecture_types
    ]
    G.remove_edges_from(edges_to_remove)
    
    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print(f'removed {N - len(G.nodes())} disconnected nodes')
    
    # only report the number of components, all of them are kept
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    print(f'The graph now has {len(Gcc)} components')

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_TU_exams.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)

### Create NaWi student only networks

#### Lecture and exam network

In [None]:
# re-import the helper module so that edits to it are picked up
reload(ncf)
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'
# every distinct lecture date as a pandas timestamp, in ascending order
all_days = sorted(pd.to_datetime(sd) for sd in dates['date'].unique())
# the networks are only built for this slice of seven consecutive entries
sample_days = all_days[13:20]

contact_map = {
    'student_lecturer': 'far',
    'student_student': 'far',
    'lecturer_lecturer': 'far',
}

for frac in fractions:

    # create the contact network from the NaWi subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(
        NaWi_students, NaWi_lecturers, NaWi_studies,
        NaWi_organisations, NaWi_groups, NaWi_dates,
        NaWi_rooms, sample_days, NaWi_estudents,
        NaWi_electurers, NaWi_exams, NaWi_edates,
        frac=frac,
    )

    # drop nodes without any edge
    n_before = G.number_of_nodes()
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'\
          .format(n_before - G.number_of_nodes()))

    # keep only biggest connected component
    n_before = G.number_of_nodes()
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    G = G.subgraph(Gcc[0])
    print('removed {} nodes which were not in the biggest component'\
          .format(n_before - G.number_of_nodes()))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph; the file name encodes the covered period and fraction
    first_day = str(sample_days[0].date())
    last_day = str(sample_days[-1].date())
    fname = 'university_{}_to_{}_fraction-{}_NaWi.bz2'\
        .format(first_day, last_day, frac)
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst, fname),
        protocol=4)

#### Only exams and labs network

In [None]:
# Build the NaWi-only network restricted to exams and to lecture types that
# require presence, once per presence fraction.
reload(ncf)
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

contact_map = {
    'student_lecturer':'far', 
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the contact network from the NaWi subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(NaWi_students, NaWi_lecturers, NaWi_studies,
                           NaWi_organisations, NaWi_groups, NaWi_dates,
                           NaWi_rooms, sample_days, NaWi_estudents,
                           NaWi_electurers, NaWi_exams, NaWi_edates,
                           frac=frac)
    
    # remove all edges that are not associated with exams or with lecture types
    # that require presence
    # see https://mibla-archiv.tugraz.at/08_09/Stk_5/06102008_LV_Typen.pdf
    presence_lecture_types = [
        "LU", # Laboruebung (laboratory exercise)
        "KU", # Konstruktionsuebung (design exercise)
        "EX", # Exkursion (field trip)
    ]
    # Address every edge by its key: the network is edge-keyed (see the
    # keys=True access elsewhere in this notebook), and passing plain (u, v)
    # pairs to remove_edges_from() would drop one arbitrary parallel edge
    # between u and v instead of the edge that actually matched the filter.
    edges_to_remove = [
        (u, v, key)
        for u, v, key, attrs in G.edges(keys=True, data=True)
        if attrs["event_type"] != "exam"
        and attrs["lecture_type"] not in presence_lecture_types
    ]
    G.remove_edges_from(edges_to_remove)
    
    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print(f'removed {N - len(G.nodes())} disconnected nodes')
    
    # only report the number of components, all of them are kept
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    print(f'The graph now has {len(Gcc)} components')

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_NaWi_exams.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)