# Create the university network from student and lecturer data

In [2]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
import datetime
import string
import network_creation_functions as ncf
from importlib import reload

In [3]:
# directory holding the preprocessed CSV tables that are read in below
src = '../data/processed'

In [4]:
# constants taken over from the network_creation_functions module (ncf)
semester_start = ncf.semester_start # date of the semester start
semester_end = ncf.semester_end # date of the semester end
study_map = ncf.study_map # mapping of studies to degree levels

## Load the relevant data

In [5]:
# Read all preprocessed tables from the source directory. The first nine
# tables describe lectures and their participants, the last four (prefixed
# with "e" / "exam") describe exams.
students = pd.read_csv(join(src, 'students_processed.csv'))
lecturers = pd.read_csv(join(src, 'lecturers_processed.csv'))
lectures = pd.read_csv(join(src, 'lectures_processed.csv'))
studies = pd.read_csv(join(src, 'studies_processed.csv'))
organisations = pd.read_csv(join(src, 'organisations_processed.csv'))
groups = pd.read_csv(join(src, 'groups_processed.csv'))
dates = pd.read_csv(join(src, 'dates_processed.csv'))
rooms = pd.read_csv(join(src, 'rooms_processed.csv'))
estudents = pd.read_csv(join(src, 'exam_students_processed.csv'))
# NOTE(review): the file name previously lacked the ".csv" extension that
# every other table carries -- confirm against the files in ../data/processed
electurers = pd.read_csv(join(src, 'exam_lecturers_processed.csv'))
exams = pd.read_csv(join(src, 'exams.csv'))
edates = pd.read_csv(join(src, 'exam_dates.csv'))

In [11]:
# mean area per seat, computed only over rooms for which both the number of
# seats and the area are known
clean_rooms = rooms.dropna(subset=['seats', 'area']).assign(
    area_per_seat=lambda known: known['area'] / known['seats'])
clean_rooms['area_per_seat'].mean()

1.9061659968248592

In [9]:
# number of distinct student IDs (a missing value would count as one entry)
students['student_id'].nunique(dropna=False)

11861

In [10]:
# number of distinct lecture IDs (a missing value would count as one entry)
lectures['lecture_id'].nunique(dropna=False)

1623

In [11]:
# number of distinct lecturer IDs (a missing value would count as one entry)
lecturers['lecturer_id'].nunique(dropna=False)

1475

In [12]:
# number of distinct exam IDs (a missing value would count as one entry)
exams['exam_id'].nunique(dropna=False)

5557

In [13]:
# number of rows in the lecture date table
dates.shape[0]

24983

In [16]:
# chronologically ordered timestamps of every distinct day in the date table
all_days = sorted(pd.to_datetime(day) for day in dates['date'].unique())
# the seven days at positions 13 to 19 form the sampling window
sample_days = all_days[13:20]
sample_days

[Timestamp('2019-10-16 00:00:00'),
 Timestamp('2019-10-17 00:00:00'),
 Timestamp('2019-10-18 00:00:00'),
 Timestamp('2019-10-19 00:00:00'),
 Timestamp('2019-10-21 00:00:00'),
 Timestamp('2019-10-22 00:00:00'),
 Timestamp('2019-10-23 00:00:00')]

## Create university networks with reduced lecture sizes

### All students

In [None]:
# Build, prune and store one contact network of the whole university for
# every considered fraction of students in presence.
reload(ncf)  # re-import the helper module so that recent edits are used
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'

# chronologically sorted days; the networks cover the days at positions 13-19
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

# all three link types are mapped to the same contact strength
contact_map = {
    'student_lecturer':'far',
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the full contact network for the whole university with a given
    # fraction of students in presence
    G = ncf.create_network(students, lecturers, studies, organisations, groups,
                           dates, rooms, sample_days, estudents, electurers,
                           exams, edates, frac=frac)

    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'.format(N - len(G.nodes())))

    # keep only biggest connected component. Graph.subgraph() only returns a
    # read-only view that stays bound to the parent graph, so it is copied
    # into an independent graph: otherwise the pickle written below would
    # contain the complete parent graph and the stored object would be a view
    # instead of a plain graph.
    N = len(G.nodes())
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    G = G.subgraph(Gcc[0]).copy()
    print('removed {} nodes which were not in the biggest component'\
          .format(N - len(G.nodes())))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph; the file name encodes the first and last sampled day
    # and the fraction of students in presence
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_all.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)

### Separate data into TU and NaWi

In [120]:
# split the student table by study label: 't' rows are reported as TU
# students, 'n' rows as NaWi students
TU_students = students.loc[students['study_label'].eq('t')]
NaWi_students = students.loc[students['study_label'].eq('n')]

print('there are {} TU students'.format(
    TU_students['student_id'].nunique(dropna=False)))
print('there are {} NaWi students'.format(
    NaWi_students['student_id'].nunique(dropna=False)))

there are 8660 TU students
there are 3298 NaWi students


In [123]:
# split the exam student table by study label in the same way as the
# lecture student table ('t' -> TU, 'n' -> NaWi)
TU_estudents = estudents.loc[estudents['study_label'].eq('t')]
NaWi_estudents = estudents.loc[estudents['study_label'].eq('n')]

print('there are {} TU exam students'.format(
    TU_estudents['student_id'].nunique(dropna=False)))
print('there are {} NaWi exam students'.format(
    NaWi_estudents['student_id'].nunique(dropna=False)))

there are 8035 TU exam students
there are 3030 NaWi exam students


In [124]:
# keep the study rows whose student_id occurs among the TU / NaWi students
TU_studies = studies.loc[
    studies['student_id'].isin(TU_students['student_id'])]
NaWi_studies = studies.loc[
    studies['student_id'].isin(NaWi_students['student_id'])]

print('there are {} TU-only studies'.format(
    TU_studies['study_id'].nunique(dropna=False)))
print('there are {} NaWi-only studies'.format(
    NaWi_studies['study_id'].nunique(dropna=False)))

there are 91 TU-only studies
there are 82 NaWi-only studies


In [125]:
# keep the lectures whose lecture_id occurs in the TU / NaWi student table
TU_lectures = lectures.loc[
    lectures['lecture_id'].isin(TU_students['lecture_id'])]
NaWi_lectures = lectures.loc[
    lectures['lecture_id'].isin(NaWi_students['lecture_id'])]

print('there are {} TU-only lectures'.format(
    TU_lectures['lecture_id'].nunique(dropna=False)))
print('there are {} NaWi-only lectures'.format(
    NaWi_lectures['lecture_id'].nunique(dropna=False)))

there are 1453 TU-only lectures
there are 790 NaWi-only lectures


In [126]:
# keep the lecturer rows that belong to one of the TU / NaWi lectures
TU_lecturers = lecturers.loc[
    lecturers['lecture_id'].isin(TU_lectures['lecture_id'])]
NaWi_lecturers = lecturers.loc[
    lecturers['lecture_id'].isin(NaWi_lectures['lecture_id'])]

print('there are {} TU-only lecturers'.format(
    TU_lecturers['lecturer_id'].nunique(dropna=False)))
print('there are {} NaWi-only lecturers'.format(
    NaWi_lecturers['lecturer_id'].nunique(dropna=False)))

there are 1376 TU-only lecturers
there are 901 NaWi-only lecturers


In [127]:
# keep the organisation rows whose lecturer_id occurs among the TU / NaWi
# lecturers
TU_organisations = organisations.loc[
    organisations['lecturer_id'].isin(TU_lecturers['lecturer_id'])]
NaWi_organisations = organisations.loc[
    organisations['lecturer_id'].isin(NaWi_lecturers['lecturer_id'])]

print('there are {} TU-only organisations'.format(
    TU_organisations['organisation_id'].nunique(dropna=False)))
print('there are {} NaWi-only organisations'.format(
    NaWi_organisations['organisation_id'].nunique(dropna=False)))

there are 121 TU-only organisations
there are 102 NaWi-only organisations


In [128]:
# keep the lecture dates that belong to one of the TU / NaWi lectures
TU_dates = dates.loc[dates['lecture_id'].isin(TU_lectures['lecture_id'])]
NaWi_dates = dates.loc[dates['lecture_id'].isin(NaWi_lectures['lecture_id'])]

print('there are {} TU-only dates'.format(TU_dates.shape[0]))
print('there are {} NaWi-only dates'.format(NaWi_dates.shape[0]))

there are 22436 TU-only dates
there are 14035 NaWi-only dates


In [129]:
# keep the exam dates whose exam_id occurs in the TU / NaWi exam student table
TU_edates = edates.loc[edates['exam_id'].isin(TU_estudents['exam_id'])]
NaWi_edates = edates.loc[edates['exam_id'].isin(NaWi_estudents['exam_id'])]

print('there are {} TU-only exam dates'.format(TU_edates.shape[0]))
print('there are {} NaWi-only exam dates'.format(NaWi_edates.shape[0]))

there are 4308 TU-only exam dates
there are 1686 NaWi-only exam dates


In [130]:
# keep the rooms that are referenced by at least one TU / NaWi lecture date
TU_rooms = rooms.loc[rooms['room_id'].isin(TU_dates['room_id'])]
NaWi_rooms = rooms.loc[rooms['room_id'].isin(NaWi_dates['room_id'])]

print('there are {} TU-only rooms'.format(TU_rooms.shape[0]))
print('there are {} NaWi-only rooms'.format(NaWi_rooms.shape[0]))

there are 352 TU-only rooms
there are 239 NaWi-only rooms


In [131]:
# keep the groups whose group_id occurs in the TU / NaWi student table
TU_groups = groups.loc[groups['group_id'].isin(TU_students['group_id'])]
NaWi_groups = groups.loc[groups['group_id'].isin(NaWi_students['group_id'])]

print('there are {} TU-only groups'.format(
    TU_groups['group_id'].nunique(dropna=False)))
print('there are {} NaWi-only groups'.format(
    NaWi_groups['group_id'].nunique(dropna=False)))

there are 1850 TU-only groups
there are 866 NaWi-only groups


In [133]:
# derive the exam tables from the exam dates: one row for every distinct
# (exam_id, lecture_id) combination
TU_exams = TU_edates.loc[:, ['exam_id', 'lecture_id']].drop_duplicates()
NaWi_exams = NaWi_edates.loc[:, ['exam_id', 'lecture_id']].drop_duplicates()

print('there are {} TU-only exams'.format(TU_exams.shape[0]))
print('there are {} NaWi-only exams'.format(NaWi_exams.shape[0]))

there are 4233 TU-only exams
there are 1673 NaWi-only exams


In [134]:
# derive the exam lecturer tables from the exam dates as independent copies
# of the exam_id and lecturer_id columns
TU_electurers = TU_edates.loc[:, ['exam_id', 'lecturer_id']].copy()
NaWi_electurers = NaWi_edates.loc[:, ['exam_id', 'lecturer_id']].copy()

print('there are {} TU-only exam lecturers'.format(
    TU_electurers['lecturer_id'].nunique(dropna=False)))
print('there are {} NaWi-only exam lecturers'.format(
    NaWi_electurers['lecturer_id'].nunique(dropna=False)))

there are 789 TU-only exam lecturers
there are 366 NaWi-only exam lecturers


### Create TU student only networks

In [None]:
# Build, prune and store one contact network restricted to the TU tables
# for every considered fraction of students in presence.
reload(ncf)  # re-import the helper module so that recent edits are used
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'

# chronologically sorted days; the networks cover the days at positions 13-19
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

# all three link types are mapped to the same contact strength
contact_map = {
    'student_lecturer':'far',
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the contact network from the TU subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(TU_students, TU_lecturers, TU_studies,
                           TU_organisations, TU_groups, TU_dates,
                           TU_rooms, sample_days, TU_estudents,
                           TU_electurers, TU_exams, TU_edates,
                           frac=frac)

    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'.format(N - len(G.nodes())))

    # keep only biggest connected component. Graph.subgraph() only returns a
    # read-only view that stays bound to the parent graph, so it is copied
    # into an independent graph: otherwise the pickle written below would
    # contain the complete parent graph and the stored object would be a view
    # instead of a plain graph.
    N = len(G.nodes())
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    G = G.subgraph(Gcc[0]).copy()
    print('removed {} nodes which were not in the biggest component'\
          .format(N - len(G.nodes())))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph; the file name encodes the first and last sampled day
    # and the fraction of students in presence
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_TU.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)

### Create NaWi student only networks

In [None]:
# Build, prune and store one contact network restricted to the NaWi tables
# for every considered fraction of students in presence.
reload(ncf)  # re-import the helper module so that recent edits are used
fractions = [0.25, 0.5, 1.0, 'overbooked']
network_dst = '../data/networks'

# chronologically sorted days; the networks cover the days at positions 13-19
all_days = list(dates['date'].unique())
all_days = [pd.to_datetime(sd) for sd in all_days]
all_days.sort()
sample_days = all_days[13:20]

# all three link types are mapped to the same contact strength
contact_map = {
    'student_lecturer':'far',
    'student_student':'far',
    'lecturer_lecturer':'far',
}

for frac in fractions:

    # create the contact network from the NaWi subsets of all tables with a
    # given fraction of students in presence
    G = ncf.create_network(NaWi_students, NaWi_lecturers, NaWi_studies,
                           NaWi_organisations, NaWi_groups, NaWi_dates,
                           NaWi_rooms, sample_days, NaWi_estudents,
                           NaWi_electurers, NaWi_exams, NaWi_edates,
                           frac=frac)

    # remove disconnected
    N = len(G.nodes())
    G.remove_nodes_from(list(nx.isolates(G)))
    print('removed {} disconnected nodes'.format(N - len(G.nodes())))

    # keep only biggest connected component. Graph.subgraph() only returns a
    # read-only view that stays bound to the parent graph, so it is copied
    # into an independent graph: otherwise the pickle written below would
    # contain the complete parent graph and the stored object would be a view
    # instead of a plain graph.
    N = len(G.nodes())
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
    G = G.subgraph(Gcc[0]).copy()
    print('removed {} nodes which were not in the biggest component'\
          .format(N - len(G.nodes())))

    # map link types to contact strengths
    ncf.map_contacts(G, contact_map)

    # save the graph; the file name encodes the first and last sampled day
    # and the fraction of students in presence
    nx.readwrite.gpickle.write_gpickle(G, join(network_dst,
        'university_{}_to_{}_fraction-{}_NaWi.bz2'\
        .format(str(sample_days[0].date()), str(sample_days[-1].date()), frac)),
        protocol=4)