In [71]:
import pandas as pd
import numpy as np 
import random 

In [72]:
# Load the survey data set from disk.
DATA_FILE = "final_data.csv"
data = pd.read_csv(DATA_FILE)

# Inspect the dtype pandas inferred for every column.
data.dtypes

programme                 object
experience_ml             object
experience_ir              int64
experience_st             object
experience_db             object
gender                    object
chatgpt_usage             object
birthday                   int64
count_students             int64
stress_level             float64
bedtime                   object
pleasant_thing_1          object
hours_sport              float64
min awake since 09:00      int64
dtype: object

We will cast some columns to more suitable types and remove variables we will probably not use (`bedtime`).

In [73]:
# Remove the unused bedtime column. Reassigning (rather than inplace=True)
# with errors="ignore" keeps this cell safe to re-run: a second run no longer
# raises KeyError because the column is already gone.
data = data.drop(columns=["bedtime"], errors="ignore")
data

Unnamed: 0,programme,experience_ml,experience_ir,experience_st,experience_db,gender,chatgpt_usage,birthday,count_students,stress_level,pleasant_thing_1,hours_sport,min awake since 09:00
0,AI,yes,0,mu,ja,male,yes,1990,400,78.0,food,0.0,900
1,AI,yes,1,sigma,ja,female,yes,2002,321,100.0,nature,2.0,930
2,Econometrics,yes,1,mu,ja,male,no willing to say,2002,200,100.0,other,0.0,930
3,Econometrics,yes,0,mu,nee,male,yes,2003,350,60.0,nature,2.0,840
4,Bioinformatics,yes,1,mu,ja,male,yes,2000,500,50.0,other,10.0,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,AI,no,1,mu,ja,male,yes,2003,550,40.0,activity,0.0,1140
239,CS,yes,1,mu,ja,male,no willing to say,1990,460,60.0,activity,1.0,990
240,CS,yes,0,sigma,ja,male,no willing to say,2003,300,35.0,nature,10.0,1020
241,CS,yes,1,mu,ja,female,yes,2002,450,75.0,activity,20.0,870


In [74]:
# List the distinct labels found in each binary experience_* column.
for suffix in ("db", "ml", "ir", "st"):
    column_name = "experience_" + suffix
    print(data[column_name].unique())

['ja' 'nee']
['yes' 'no']
[0 1]
['mu' 'sigma' '1']


In [75]:
# Convert the dichotomous experience_* columns to 0/1 indicators.
# Each entry maps a column to the raw label that should become 1.
positive_label = {
    "experience_db": "ja",
    "experience_ml": "yes",
    "experience_st": "mu",
}
for column, label in positive_label.items():
    # Only recode columns that still hold the raw text labels. Without this
    # guard, re-running the cell would compare the already recoded 0/1
    # integers against the text label and silently overwrite the whole
    # column with zeros.
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = (data[column] == label).astype(int)

# NOTE(review): experience_st also contains the stray label '1'; it is coded
# as 0 here, together with 'sigma' -- confirm this is intended.

data.dtypes

programme                 object
experience_ml              int32
experience_ir              int64
experience_st              int32
experience_db              int32
gender                    object
chatgpt_usage             object
birthday                   int64
count_students             int64
stress_level             float64
pleasant_thing_1          object
hours_sport              float64
min awake since 09:00      int64
dtype: object

In [76]:
# Frequency of each value in the recoded database-experience indicator.
db_indicator = data["experience_db"]
db_indicator.value_counts()

experience_db
1    174
0     69
Name: count, dtype: int64

In [77]:
# Frequency table of the ChatGPT-usage answers (printed) ...
chatgpt_counts = data["chatgpt_usage"].value_counts()
print(chatgpt_counts)

# ... and of the gender answers (shown as the cell result).
gender_counts = data["gender"].value_counts()
gender_counts

chatgpt_usage
yes                  189
no willing to say     40
no                    14
Name: count, dtype: int64


gender
male          141
female         93
non-binary      9
Name: count, dtype: int64

In [78]:
# One-hot encoding for gender and ChatGPT usage.
# Each row is (new indicator column, source column, label coded as 1).
# The remaining labels ("male", "yes") get no column of their own and act as
# the reference level.
indicator_spec = [
    ("female", "gender", "female"),
    ("other", "gender", "non-binary"),
    ("no_disclosure", "chatgpt_usage", "no willing to say"),
    ("no_chatgpt", "chatgpt_usage", "no"),
]
for indicator, source, label in indicator_spec:
    data[indicator] = data[source].eq(label).astype(int)

# The original categorical columns are no longer needed.
data.drop(columns=["gender", "chatgpt_usage"], inplace=True)

data.columns

Index(['programme', 'experience_ml', 'experience_ir', 'experience_st',
       'experience_db', 'birthday', 'count_students', 'stress_level',
       'pleasant_thing_1', 'hours_sport', 'min awake since 09:00', 'female',
       'other', 'no_disclosure', 'no_chatgpt'],
      dtype='object')

## Classification model: multi-class Logistic regression


In [80]:
# Separate the target (study programme) from the predictor columns.
target_column = "programme"
y = data[target_column]
X = data.drop(columns=[target_column])

In [81]:
# Predictor columns available before any feature selection is applied
X.columns

Index(['experience_ml', 'experience_ir', 'experience_st', 'experience_db',
       'birthday', 'count_students', 'stress_level', 'pleasant_thing_1',
       'hours_sport', 'min awake since 09:00', 'female', 'other',
       'no_disclosure', 'no_chatgpt'],
      dtype='object')

Our aim is to predict the programme a person is attending. Intuitively, I would say that "pleasant_thing_1" is irrelevant for this classification.
This might also be the case for "chatgpt_usage" and maybe "birthday".

In [82]:
# Drop pleasant_thing_1, which is not used as a predictor. Reassigning with
# errors="ignore" (instead of inplace=True) lets the cell be re-run without a
# KeyError once the column has already been removed.
X = X.drop(columns=["pleasant_thing_1"], errors="ignore")

In [83]:
# dependencies for model building
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_selection import f_classif

In [90]:
# We use ANOVA (F-test of each predictor against the programme label) for
# feature selection.
F, p = f_classif(X, y)

# Round the p-values to five decimals for display.
p = np.round(p, decimals=5)

feature_names = list(map(str, X.columns))
anova_table = pd.DataFrame({"F": F, "p-value": p}, index=feature_names)
anova_table

Unnamed: 0,F,p-value
experience_ml,2.829647,0.01116
experience_ir,4.926693,9e-05
experience_st,2.455483,0.02536
experience_db,7.077444,0.0
birthday,3.780425,0.00129
count_students,0.635634,0.70166
stress_level,0.337793,0.91644
hours_sport,1.422075,0.20679
min awake since 09:00,0.51324,0.79806
female,1.018022,0.41415


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
# NOTE(review): the split is not stratified by programme -- confirm that is
# acceptable given the class sizes.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=44,
)

In [None]:
# model building
# With the default lbfgs solver, LogisticRegression already fits a multinomial
# (softmax) model for a multi-class target, so the deprecated `multi_class`
# argument is omitted.
# The predictors are unscaled (e.g. birthday is around 2000, count_students in
# the hundreds), so allow more than the default 100 iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# compute the F1 score on the test set
# `average` must be given explicitly for a multi-class target. "macro" is the
# unweighted mean of the per-class F1 scores, so every programme counts
# equally regardless of how many students it has.
f1 = f1_score(y_test, y_pred, average="macro")
f1