In [1]:
data_instructions = {
    # Направление подготовки
    "Field of study": {0:'"090301"', 1:'"090304"', 2:'"100503"'},
    # Задолженность за прошлую сессию
    "Last session debt": {0:False, 1:True},
    # Задолженность за предыдущие сессии
    "Other session debt": {0:False, 1:True},
    # Посещение пар
    "Attended classes %": {0:"< 30", 1:"30-50", 2:"50-80", 3:"> 80"},
    # Зарегистрирован в беседах ВК кафедры
    "Registred in VK conversations": {0:False, 1:True},
    # Зарегистрирован в moodle по всем дисципплинам
    "Registred in moodle for all disciplins": {0:False, 1:True},
    # Спортсмен
    "Sportsman": {0:False, 1:True},
    # Активен в студенческих мероприятиях
    "Active in student activities": {0:False, 1:True},
    # Отзывы преподавателей
    "Teacher feedback": {0:"bad", 1:"ok", 2:"good"},
}

In [2]:
import numpy as np
np.random.seed(42)

In [3]:
amount_of_students = 500
data = {}
for colname in data_instructions:
    options = len(data_instructions[colname])
    current_col = []
    for _ in range(amount_of_students):
        new_value = data_instructions[colname][np.random.randint(0, options)]
        current_col.append(new_value)
    data[colname] = current_col

In [4]:
import pandas as pd
df = pd.DataFrame(data)
df

Unnamed: 0,Field of study,Last session debt,Other session debt,Attended classes %,Registred in VK conversations,Registred in moodle for all disciplins,Sportsman,Active in student activities,Teacher feedback
0,"""100503""",True,False,50-80,True,True,False,False,bad
1,"""090301""",True,False,< 30,True,True,False,True,good
2,"""100503""",False,False,> 80,False,True,True,False,good
3,"""100503""",True,False,< 30,False,False,False,False,ok
4,"""090301""",True,True,30-50,True,False,False,True,ok
...,...,...,...,...,...,...,...,...,...
495,"""090304""",False,True,30-50,True,False,False,True,ok
496,"""090304""",True,False,< 30,True,False,True,False,bad
497,"""090301""",True,False,< 30,True,True,True,False,bad
498,"""090304""",False,True,30-50,True,True,False,True,bad


In [5]:
def calculate_label(row) -> bool:
    odds = {
        "no_last_debt": 0.2,
        "good_attendance": 0.2,
        "no_other_debt": 0.1,
        "good_feedback": 0.1,
        "active_student": 0.05,
        "moodle_registered": 0.5,
        "VK_conversations_registered": 0.02,
    }
    prob = 0.9
    if row["Last session debt"]: prob-=odds["no_last_debt"]
    else: prob+=odds["no_last_debt"]

    if row["Attended classes %"] in ["< 30", "30-50"]: prob-=odds["good_attendance"]
    else: prob+=odds["good_attendance"]

    if row["Other session debt"]: prob-=odds["no_other_debt"]
    else: prob+=odds["no_other_debt"]

    if row["Teacher feedback"] == "bad": prob-=odds["good_attendance"]
    elif row["Teacher feedback"] == "good": prob+=odds["good_attendance"]

    if row["Sportsman"] or row["Active in student activities"]: prob+=odds["active_student"]
    else: prob-=odds["active_student"]

    if row["Registred in moodle for all disciplins"]: prob+=odds["moodle_registered"]
    else: prob-=odds["moodle_registered"]

    if row["Registred in VK conversations"]: prob+=odds["VK_conversations_registered"]
    else: prob-=odds["VK_conversations_registered"]

    # Ограничиваем вероятность
    prob = max(0.05, min(prob, 0.95))
    return np.random.random() < prob

In [6]:
# Если определённые критерии не пройдены -- отчислен, иначе продолжает
df['Continues studies'] = df.apply(calculate_label, axis = 1)
df

Unnamed: 0,Field of study,Last session debt,Other session debt,Attended classes %,Registred in VK conversations,Registred in moodle for all disciplins,Sportsman,Active in student activities,Teacher feedback,Continues studies
0,"""100503""",True,False,50-80,True,True,False,False,bad,True
1,"""090301""",True,False,< 30,True,True,False,True,good,True
2,"""100503""",False,False,> 80,False,True,True,False,good,True
3,"""100503""",True,False,< 30,False,False,False,False,ok,False
4,"""090301""",True,True,30-50,True,False,False,True,ok,False
...,...,...,...,...,...,...,...,...,...,...
495,"""090304""",False,True,30-50,True,False,False,True,ok,False
496,"""090304""",True,False,< 30,True,False,True,False,bad,False
497,"""090301""",True,False,< 30,True,True,True,False,bad,True
498,"""090304""",False,True,30-50,True,True,False,True,bad,True


In [7]:
len(df[ df["Continues studies"] == True ])

332

In [8]:
len(df[ df["Continues studies"] == False ])


168

In [9]:
# Сохраняем в файл
df.to_csv(r"students.csv", sep=',', index=False, encoding='utf-8')