Sedaj, ko imamo podatke o študentih, nas zanimajo še predmeti, predvsem prepustnost (pass rate) vsakega predmeta po letih. Napisali bomo funkcijo, ki dano tabelo "obrne" iz oblike `student_id X semester` (celica vsebuje vse predmete študenta v semestru) v obliko `subject_id X semester` (celica vsebuje vse študente na predmetu v semestru). Druga funkcija bo naredila enako, le da bo namesto seznamov vrnila samo število študentov.

In [None]:
import pandas as pd
import ast

# Student table; transform_to_subject_centric reads its 'student_id' column and
# treats every column after the first as a year/semester column.
students = pd.read_csv("./checkpoint/students_no_empty_rows.csv")
# Subject catalogue.
subjects = pd.read_csv("./subjects_3.csv")
# Keep only the first two columns (presumably subject_id and subject_name, since
# the frame is later merged on 'subject_id' and supplies 'subject_name' --
# verify against the CSV).
subjects_name = subjects.iloc[:, :2]

def transform_to_subject_centric(students, subjects_name):
    """Pivot a student-centric enrolment table into a subject-centric one.

    Args:
        students: DataFrame whose first column is ``student_id`` and whose
            remaining columns are year/semester columns. Every non-missing
            cell is the string form of a list of subject ids (it is parsed
            with ``ast.literal_eval``).
        subjects_name: DataFrame with at least the columns ``subject_id`` and
            ``subject_name``. Any additional columns are ignored.

    Returns:
        DataFrame indexed by ``subject_id`` (sorted), with a ``subject_name``
        column followed by the year/semester columns of ``students``. Each
        year/semester cell holds the list of student ids that had the subject
        in that semester (an empty list when nobody did). Only subject ids
        starting with ``632`` or ``637`` are kept. Subjects that do not occur
        in ``subjects_name`` keep a missing name (left merge).
    """
    year_semester_columns = list(students.columns[1:])

    # subject_id -> {year/semester column -> list of student ids}
    subject_data = {}

    for _, student in students.iterrows():
        student_id = student['student_id']

        for column in year_semester_columns:
            cell = student[column]
            if pd.isna(cell):
                continue

            for subject_id in ast.literal_eval(cell):
                if not subject_id.startswith(('632', '637')):
                    continue
                if subject_id not in subject_data:
                    subject_data[subject_id] = {col: [] for col in year_semester_columns}
                subject_data[subject_id][column].append(student_id)

    # The dict has no particular order, so sort by subject id before exposing
    # the id as a regular column for the merge.
    subject_centric_df = pd.DataFrame.from_dict(subject_data, orient='index')
    subject_centric_df = subject_centric_df.sort_index()
    subject_centric_df = subject_centric_df.reset_index().rename(columns={'index': 'subject_id'})

    # Attach the subject names. Only the two needed columns take part in the
    # merge, so extra columns in ``subjects_name`` cannot leak into the result.
    names = subjects_name[['subject_id', 'subject_name']]
    subject_centric_df = pd.merge(subject_centric_df, names, on='subject_id', how='left')

    # Put the name right after the id; the order is built from column labels
    # rather than positions so it does not depend on where the merge put them.
    leading = ['subject_id', 'subject_name']
    columns_order = leading + [col for col in subject_centric_df.columns if col not in leading]
    subject_centric_df = subject_centric_df[columns_order]

    return subject_centric_df.set_index('subject_id')


def _count_students(cell):
    """Return how many student ids a single table cell holds."""
    if isinstance(cell, str):
        # Cell read back from CSV: the list is stored as its string form.
        return len(ast.literal_eval(cell))
    if isinstance(cell, (list, tuple, set)):
        # Cell still holds the in-memory collection.
        return len(cell)
    if pd.isna(cell):
        return 0
    raise TypeError(f"cannot count students in cell of type {type(cell).__name__}")


def count_students_per_subject(subjects_centric_view):
    """Replace every student list in the subject-centric view by its length.

    Args:
        subjects_centric_view: DataFrame whose first two columns are left
            untouched (subject id and name when the view is read from CSV)
            and whose remaining columns hold student lists. A cell may be an
            actual list, the string form of a list (CSV round trip), or
            missing.

    Returns:
        A copy of the input in which every column after the first two holds
        the number of students as an integer; missing and empty cells give 0.
        The input frame is not modified.
    """
    subjects_count = subjects_centric_view.copy()
    for col in subjects_centric_view.columns[2:]:
        subjects_count[col] = subjects_centric_view[col].map(_count_students)
    return subjects_count

In [None]:
# Build the subject-centric view and store it as a checkpoint; subject_id is
# written out as the index column.
transform_to_subject_centric(students, subjects_name).to_csv("./checkpoint/subject_centric_view.csv", index=True)
subjects_centric = pd.read_csv("./checkpoint/subject_centric_view.csv")
# The view is saved and read back so that count_students_per_subject works
# correctly: it parses each cell with ast.literal_eval, i.e. it expects the
# string form the lists take after the CSV round trip.
count_students_per_subject(subjects_centric).to_csv("./checkpoint/subject_centric_count.csv", index=False)

Zdaj se lahko lotimo računanja prepustnosti po predmetih. Ker imamo podatke le o študentih, ki so bili vpisani leta 2019 ali kasneje, lahko za leto 2019 preverimo prepustnost samo pri predmetih 1. letnika. Iz tabele `subjects_3.csv` bomo vzeli podatke iz stolpca `student_year`. Značke nam povejo, v katerem letniku programa se predmet izvaja. Napisali bomo funkcijo, ki bo vzela najvišjo številko, saj so nekateri predmeti hkrati obvezni in izbirni. Nato bomo prepustnost izračunali od leta `2018 + najvišji letnik izvajanja` naprej (za predmete 1. letnika torej od leta 2019).

In [28]:
import re

def extract_numbers_from_list(numbers_list):
    """Concatenate every run of digits found in ``numbers_list``.

    Args:
        numbers_list: A string, or an iterable of items (each item is
            converted with ``str`` before it is searched). ``None`` and NaN
            are treated as "no data".

    Returns:
        All digit runs joined into one string, in order of appearance
        (``['1. letnik', '2. letnik']`` gives ``'12'``), or ``None`` when the
        input is missing or contains no digits.
    """
    if numbers_list is None:
        return None
    # A missing CSV cell arrives as a float NaN, which cannot be iterated.
    if isinstance(numbers_list, float) and pd.isna(numbers_list):
        return None
    # Search a plain string as a whole; the joined result is the same as
    # searching it character by character.
    if isinstance(numbers_list, str):
        numbers_list = [numbers_list]

    digits = [digit for item in numbers_list for digit in re.findall(r'\d+', str(item))]
    return ''.join(digits) if digits else None


def highest_number(numbers_list):
    """Return the largest single digit contained in ``numbers_list``.

    The argument is converted with ``str`` and inspected character by
    character, so ``'123'`` yields ``'3'``. Characters that are not decimal
    digits are ignored.

    Args:
        numbers_list: Usually the digit string produced by
            ``extract_numbers_from_list``, or ``None``.

    Returns:
        The largest digit as a string, or the integer ``0`` when the input is
        ``None`` or contains no digit at all.

    NOTE(review): the return type is mixed (int ``0`` vs. str digit). It is
    kept as is because existing callers may rely on it; consider unifying.
    """
    if numbers_list is None:
        return 0

    digits = [int(char) for char in str(numbers_list) if char.isdecimal()]
    if not digits:
        # e.g. NaN ('nan') or an empty string: nothing to take the maximum of.
        return 0
    return str(max(digits))
    

def combine_semesters(row, year_prefix):
    """Join the two semester lists of one academic year into a single list.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding the columns
            ``f"{year_prefix}_1"`` and ``f"{year_prefix}_2"``. Each cell is
            either missing or the string form of a list.
        year_prefix: Academic-year prefix of the two columns, e.g. ``"19/20"``.

    Returns:
        The parsed first-semester entries followed by the second-semester
        entries; a missing cell contributes nothing.
    """
    merged = []
    for semester in (1, 2):
        cell = row[f"{year_prefix}_{semester}"]
        if pd.notna(cell):
            merged.extend(ast.literal_eval(cell))
    return merged



In [61]:
# Load the subject catalogue and keep only the columns at positions 0, 2, 3 and
# 6 (subject_id, subject_program, subject_semester, student_year).
subjects_years = pd.read_csv("./subjects_3.csv")
# .copy() turns the selection into an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
subjects_years = subjects_years.iloc[:, [0, 2, 3, 6]].copy()

# Reduce the student_year tags to the highest study year they mention.
subjects_years['student_year'] = subjects_years['student_year'].apply(extract_numbers_from_list)
subjects_years['student_year'] = subjects_years['student_year'].apply(highest_number)
# Store the semester as text.
subjects_years['subject_semester'] = subjects_years['subject_semester'].apply(str)
subjects_years.head(5)

Unnamed: 0,subject_id,subject_program,subject_semester,student_year
0,63202,UNI,1,1
1,63203,UNI,1,1
2,63204,UNI,1,1
3,63205,UNI,1,1
4,63207,UNI,2,1


In [None]:
# Merge the study-year and semester metadata into the subject-centric view.
subjects_centric_years = pd.read_csv("./checkpoint/subject_centric_view.csv")
subject_centric_years = pd.merge(subjects_centric_years, subjects_years, on='subject_id', how='left')
subject_centric_years = subject_centric_years.set_index('subject_id')
# Drop subjects that have no name.
subject_centric_years = subject_centric_years.dropna(subset=['subject_name'])

# Collapse the two semester columns of every academic year (e.g. "19/20_1" and
# "19/20_2") into one column named after the starting calendar year ("2019").
for year in range(2019, 2025):
    year_prefix = f"{str(year)[2:]}/{str(year+1)[2:]}"
    subject_centric_years[str(year)] = subject_centric_years.apply(
        combine_semesters, axis=1, args=(year_prefix,)
    )
    subject_centric_years = subject_centric_years.drop(
        columns=[f"{year_prefix}_1", f"{year_prefix}_2"]
    )

subject_centric_years.to_csv("./checkpoint/subject_centric_years.csv")
subject_centric_years.head(5)

Unnamed: 0_level_0,subject_name,subject_program,subject_semester,student_year,2019,2020,2021,2022,2023,2024
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
63202,Osnove matematične analize,UNI,1,1,"[63190045, 63190046, 63190050, 63190052, 63190...","[63190012, 63190026, 63190046, 63190059, 63190...","[63190009, 63190012, 63190091, 63190243, 63200...","[63190091, 63190106, 63190243, 63200034, 63200...","[63190243, 63210138, 63210156, 63210165, 63210...","[63190243, 63210167, 63210238, 63220003, 63220..."
63203,Diskretne strukture,UNI,1,1,"[63190045, 63190046, 63190050, 63190052, 63190...","[63190012, 63190026, 63190046, 63190091, 63190...","[63190009, 63190012, 63190243, 63200027, 63200...","[63200034, 63200345, 63200488, 63210021, 63210...","[63210085, 63210167, 63220007, 63220019, 63220...","[63210085, 63220003, 63230014, 63230032, 63230..."
63204,Osnove digitalnih vezij,UNI,1,1,"[63190001, 63190002, 63190003, 63190004, 63190...","[63190012, 63190026, 63190046, 63190050, 63190...","[63190009, 63190114, 63200011, 63200027, 63200...","[63210028, 63210085, 63210128, 63210186, 63210...","[63210085, 63220007, 63220019, 63220092, 63220...","[63220003, 63230070, 63230158, 63230174, 63230..."
63205,Fizika,UNI,1,1,"[63190045, 63190046, 63190050, 63190052, 63190...","[63190012, 63190026, 63190091, 63190103, 63190...","[63190009, 63190032, 63190067, 63190091, 63190...","[63200034, 63200128, 63200248, 63210114, 63210...","[63200034, 63210138, 63210243, 63210301, 63210...","[63210243, 63220003, 63220172, 63220246, 63220..."
63207,Linearna algebra,UNI,2,1,"[63190045, 63190046, 63190050, 63190052, 63190...","[63190012, 63190026, 63190046, 63190050, 63190...","[63190009, 63190091, 63190127, 63190243, 63190...","[63190106, 63190127, 63200034, 63200074, 63200...","[63190127, 63210085, 63210138, 63210156, 63210...","[63190127, 63210085, 63210200, 63210238, 63210..."


Zdaj napišemo funkcijo, ki bo vzela tabelo `subject_centric_years` in za vsak predmet po letih izračunala prepustnost: delež študentov, vpisanih na predmet v danem letu, ki se naslednje leto pri tem predmetu ne pojavijo več.

In [65]:
def _parse_student_ids(cell):
    """Return the distinct student ids stored in one year cell (empty if missing)."""
    if pd.notna(cell) and cell != "":
        return set(ast.literal_eval(cell))
    return set()


def pass_rate(subject_centric, *, first_year=2019, last_year=2023, min_students=6):
    """Compute the yearly pass rate of every subject.

    A student counts as having passed a subject in year ``Y`` when they are
    enrolled in it in ``Y`` but no longer in ``Y + 1``.

    Args:
        subject_centric: DataFrame with the columns ``subject_id``,
            ``subject_name``, ``subject_program``, ``subject_semester`` and
            ``student_year``, plus one column per year named ``str(year)``
            for the evaluated years and for ``last_year + 1``. Year cells are
            missing, empty strings, or the string form of a list of student
            ids.
        first_year: First year that gets a result column. Subjects of study
            year ``k`` are evaluated from ``first_year + k - 1`` on; a
            ``student_year`` of 0 is treated like study year 3.
        last_year: Last year that gets a result column.
        min_students: Minimum number of distinct enrolled students needed to
            report a percentage.

    Returns:
        DataFrame with the five descriptive columns and one string column per
        year from ``first_year`` to ``last_year``. A cell is ``"<rate>%"``
        (one decimal), ``"<n> students enrolled - not enough data"`` when
        fewer than ``min_students`` students were enrolled, or ``''`` for
        years before the subject's first evaluated year.
    """
    columns_to_keep = ['subject_id', 'subject_name', 'subject_program', 'subject_semester', 'student_year']
    subject_pass_rate = subject_centric[columns_to_keep].copy()

    for year in range(first_year, last_year + 1):
        subject_pass_rate[str(year)] = ''

    for index, row in subject_centric.iterrows():
        student_year = int(row['student_year'])
        offset = 3 if student_year == 0 else student_year
        year_start = first_year - 1 + offset

        for year in range(year_start, last_year + 1):
            enrolled = _parse_student_ids(row[str(year)])
            enrolled_next = _parse_student_ids(row[str(year + 1)])

            # Count distinct students: a year column is the concatenation of
            # two semester lists, so the same id may appear more than once.
            total = len(enrolled)

            if total < min_students:
                subject_pass_rate.at[index, str(year)] = f"{total} students enrolled - not enough data"
                continue

            # Students who show up again next year are repeating the subject.
            repeating = len(enrolled & enrolled_next)
            rate = round(((total - repeating) / total) * 100, 1)
            subject_pass_rate.at[index, str(year)] = f"{rate}%"

    return subject_pass_rate


In [None]:
subject_centric = pd.read_csv("./checkpoint/subject_centric_years.csv")

# Store the result under its own name: assigning it to `pass_rate` would
# replace the function with a DataFrame and make this cell fail when re-run.
pass_rate_df = pass_rate(subject_centric)
pass_rate_df.to_csv("./pass_rate.csv", index=False)

______________________________________________
TODO
- briši ene subjecte k nimajo dovolj ljudi
- ene vrednosti so 100% čudne
- kdo najdlje delal OVS oz poljuben predmet
- koliko ljudi dela faks 1 leto, procentaža od 2019 do 2023