Sedaj, ko imamo podatke o študentih, nas zanima še kaj o predmetih, predvsem pass rate čez leta za vsak predmet. Spisali bomo funkcijo, ki iz dane tabele "obrne" podatke iz `student_id X semester` (celica vsebuje vse predmete študenta v semestru) v `subject_id X semester` (celica vsebuje vse študente na predmetu v semestru). Druga funkcija bo naredila isto, le da bo vrnila samo št. študentov.

In [5]:
import pandas as pd
import ast

students = pd.read_csv("./students_no_empty_rows.csv")
subjects = pd.read_csv("./subjects_3.csv")
subjects_name = subjects.iloc[:, :2]

def transform_to_subject_centric(students, subjects_name):
    # slovar ki bo hranil pare subject_id : seznam vseh student_id, ki so imeli ta predmet (po letih in semestrih)
    subject_data = {}

    year_semester_columns = students.columns[1:]

    for index, student in students.iterrows():
        student_id = student['student_id']

        for column in year_semester_columns:
            if not pd.isna(student[column]):
                subjects = ast.literal_eval(student[column])

                for subject_id in subjects:
                    if (subject_id[:3] == '632') or (subject_id[:3] == '637'):
                        if subject_id not in subject_data:
                            subject_data[subject_id] = {col: [] for col in year_semester_columns}

                        subject_data[subject_id][column].append(student_id)

    # iz slovarja naredimo data frame, ki bo neurejen, zato ga še posortiramo in dodamo imena predmetov
    subject_centric_df = pd.DataFrame.from_dict(subject_data, orient='index')
    subject_centric_df = subject_centric_df.sort_index()
    subject_centric_df = subject_centric_df.reset_index().rename(columns={'index': 'subject_id'})
    
    # dodamo imena predmetov
    subject_centric_df = pd.merge(subject_centric_df, subjects_name, on='subject_id', how='left')

    columns_order = ['subject_id', 'subject_name'] + [col for col in subject_centric_df.columns[1:-1]]
    subject_centric_df = subject_centric_df[columns_order]
    subject_centric_df = subject_centric_df.set_index('subject_id')

    return subject_centric_df


def count_students_per_subject(subjects_centric_view):
    subjects_count = subjects_centric_view.copy()
    for index, row in subjects_centric_view.iterrows():
        for col in subjects_centric_view.columns[2:]:
            subjects_count.at[index, col] = len(ast.literal_eval(row[col])) if ((pd.notna(row[col])) and (row[col] != [])) else 0
    return subjects_count

In [29]:
transform_to_subject_centric(students, subjects_name).to_csv("subject_centric_view.csv", index=True)
subjects_centric = pd.read_csv("./subject_centric_view.csv")
# shranimo in preberemo csv, da funkcija count_students_per_subjects pravilno dela
count_students_per_subject(subjects_centric).to_csv("subject_centric_count.csv", index=False)

Zdaj se lahko lotimo računanja prepustnosti po predmetih. Ker imamo podatke le o študentih, ki so bili vpisani leta 2019 ali kasneje, to pomeni, da za leto 2019 lahko preverimo prepustnost pri predmetih 1. letnika. Iz tabele `subjects_3.csv` bomo vzeli podatke iz vrstice `student_year`. Značke nam povejo, v katerem letniku programa se ta predmet izvaja. Spisali bomo funkcijo, ki bo vzela najvišjo številko, saj so nekateri predmeti hkrati obvezni in izbirni. Potem bomo izračunali prepustnost od leta `2019 + najvišji letnik izvajanja` naprej. 

In [7]:
import re

def extract_numbers_from_list(numbers_list):
    digits = [digit for s in numbers_list for digit in re.findall(r'\d+', s)]
    return ''.join(digits) if digits else None


def highest_number(numbers_list):
    if numbers_list is None:
        return 0
    else:
        numbers_string = str(numbers_list)
        digits = [int(digit) for digit in numbers_string]
        return str(max(digits))

In [30]:
subjects_years = pd.read_csv("./subjects_3.csv")
subjects_years = subjects_years.iloc[:, [0, 3, 6]]

subjects_years['student_year'] = subjects_years['student_year'].apply(extract_numbers_from_list)
subjects_years['student_year'] = subjects_years['student_year'].apply(highest_number)
subjects_years['subject_semester'] = subjects_years['subject_semester'].apply(lambda x: str(x))
subjects_years.head(10)

Unnamed: 0,subject_id,subject_semester,student_year
0,63202,1,1
1,63203,1,1
2,63204,1,1
3,63205,1,1
4,63207,2,1
5,63208,1,2
6,63209,2,1
7,63212,2,1
8,63213,1,2
9,63214,1,3


In [28]:
# mergamo podatke o letniku in semestru v subjects_centric_years tabelo
subjects_centric_years = pd.read_csv("./subject_centric_view.csv")
subjects_centric_years.head(10)
subject_centric_years = pd.merge(subjects_centric_years, subjects_years, on='subject_id', how='left')
subject_centric_years = subject_centric_years.set_index('subject_id')

# pogrunti neki da se znebim semestrov in dvojnih stolpcev in samo pustim letnico


subject_centric_years.to_csv("./subject_centric_view.csv")
subject_centric_years.head(5)

Unnamed: 0_level_0,subject_name,19/20_1,19/20_2,20/21_1,20/21_2,21/22_1,21/22_2,22/23_1,22/23_2,23/24_1,23/24_2,24/25_1,24/25_2,subject_semester_x,student_year_x,subject_semester_y,student_year_y,subject_semester,student_year
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
63202,Osnove matematične analize,"[63190045, 63190046, 63190050, 63190052, 63190...",[],"[63190012, 63190026, 63190046, 63190059, 63190...",[],"[63190009, 63190012, 63190091, 63190243, 63200...",[],"[63190091, 63190106, 63190243, 63200034, 63200...",[],"[63190243, 63210138, 63210156, 63210165, 63210...",[],"[63190243, 63210167, 63210238, 63220003, 63220...",[],1.0,1.0,1.0,1.0,1,1
63203,Diskretne strukture,"[63190045, 63190046, 63190050, 63190052, 63190...",[],"[63190012, 63190026, 63190046, 63190091, 63190...",[],"[63190009, 63190012, 63190243, 63200027, 63200...",[],"[63200034, 63200345, 63200488, 63210021, 63210...",[],"[63210085, 63210167, 63220007, 63220019, 63220...",[],"[63210085, 63220003, 63230014, 63230032, 63230...",[],1.0,1.0,1.0,1.0,1,1
63204,Osnove digitalnih vezij,"[63190001, 63190002, 63190003, 63190004, 63190...",[],"[63190012, 63190026, 63190046, 63190050, 63190...",[],"[63190009, 63190114, 63200011, 63200027, 63200...",[],"[63210028, 63210085, 63210128, 63210186, 63210...",[],"[63210085, 63220007, 63220019, 63220092, 63220...",[],"[63220003, 63230070, 63230158, 63230174, 63230...",[],1.0,1.0,1.0,1.0,1,1
63205,Fizika,"[63190045, 63190046, 63190050, 63190052, 63190...",[],"[63190012, 63190026, 63190091, 63190103, 63190...",[],"[63190009, 63190032, 63190067, 63190091, 63190...",[],"[63200034, 63200128, 63200248, 63210114, 63210...",[],"[63200034, 63210138, 63210243, 63210301, 63210...",[],"[63210243, 63220003, 63220172, 63220246, 63220...",[],1.0,1.0,1.0,1.0,1,1
63207,Linearna algebra,[],"[63190045, 63190046, 63190050, 63190052, 63190...",[],"[63190012, 63190026, 63190046, 63190050, 63190...",[],"[63190009, 63190091, 63190127, 63190243, 63190...",[],"[63190106, 63190127, 63200034, 63200074, 63200...",[],"[63190127, 63210085, 63210138, 63210156, 63210...",[],"[63190127, 63210085, 63210200, 63210238, 63210...",2.0,1.0,2.0,1.0,2,1


In [None]:
def combine_year_columns(table):
    combined = []

    for index, row in table.iterrows():
        for year in [2019, 2024]:
            sub_index = int((str(year))[2:]) - 18 + table['subject_semester']
            combined[str(year)] = table[sub_index]
    
    return pd.DataFrame(combined)

Zdaj spišemo funkcijo, ki bo vzela tabelo `subject_centric_view` in izračunala, koliko študentov je vsako leto opravilo predmet.

In [None]:
def pass_rate(subject_centric):
    subject_pass_rate = subject_centric.copy()

    for index, row in subject_pass_rate.iterrows():
        year_start = 2019 + row['student_year']
        year_end = 2023

        for year in [year_start, year_end]:
            first_group = row[str(year)]
            second_group = row[str(year+1)]
            if first_group != "" and pd.notna(first_group) and second_group != "" and pd.notna(second_group):
                total = len(first_group)
                overlap = len(set(first_group).intersection(set(second_group)))
                subject_pass_rate[str(year)] = f"{(total - overlap)/total}%"

    return subject_pass_rate


SyntaxError: invalid syntax (2232198471.py, line 3)

______________________________________________
TODO
- izračunaj pass rate (upostevi se letnik v katerem se izvaja (ce je v 3. letniku, lahko sele 2021 gledamo, ker imamo samo ljudi od 2019 naprej))
- spliti schedule.ipynb in student_ids.ipynb
- keri predmeti iz subjects.csv manjkaj
- keri predmeti nimajo imena --> a jih brišemo?
- lahko bi še napisala funkcijo ki bi povedala, a je študent kadarkoli ponavljal, in koliko jih je zaključilo (če so imeli dip sem pa recimo, da so diplomirali)