# Logistic regression

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import os

# Load the data.
# NOTE: this workbook is written by the grades-merge cell further down in this
# notebook, so that cell has to be run at least once before this one.
student_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
df = pd.read_excel(student_file)

# Define features and target. `features` is derived from the two typed lists so
# the columns selected for X can never drift from the columns the preprocessor
# is told to transform.
numerical_features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade']
categorical_features = ['education_level']
features = numerical_features + categorical_features
target = 'dropped out'

# Map dependent variable 'dropped out' to binary.
# Series.map turns every value that is not a key of the dict (including missing
# values) into NaN; fail here with the offending values instead of letting the
# NaNs reach the estimator and surface as an opaque error.
target_encoding = {'no': 0, 'yes': 1}
encoded_target = df[target].map(target_encoding)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, target].unique().tolist()
    raise ValueError(
        f"Column {target!r} contains values other than 'no'/'yes' "
        f"(missing values included): {unexpected_values}"
    )
df[target] = encoded_target

X = df[features]
y = df[target]

# Split the data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    # Fill NA values with 1.
    # NOTE(review): presumably 1 is the lowest possible grade - confirm this is
    # the intended stand-in for a missing grade.
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),
    ('scaler', StandardScaler())
])
# Categories that only occur in the test split are encoded as all-zeros
# instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the pipeline: preprocessing is fitted on the training split only and
# re-applied unchanged to the test split by `predict`.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.69      0.69        32
           1       0.71      0.71      0.71        35

    accuracy                           0.70        67
   macro avg       0.70      0.70      0.70        67
weighted avg       0.70      0.70      0.70        67



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import os

# Load the data.
# NOTE: this workbook is written by the grades-merge cell further down in this
# notebook, so that cell has to be run at least once before this one.
student_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
df = pd.read_excel(student_file)

# Define features and target (grades only; no categorical features in this variant)
features = ['anl1 final grade', 'anl2 final grade', 'anl3 final grade', 'anl4 final grade']
target = 'dropped out'

# Map dependent variable 'dropped out' to binary.
# Series.map turns every value that is not a key of the dict (including missing
# values) into NaN; fail here with the offending values instead of letting the
# NaNs reach the estimator and surface as an opaque error.
target_encoding = {'no': 0, 'yes': 1}
encoded_target = df[target].map(target_encoding)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, target].unique().tolist()
    raise ValueError(
        f"Column {target!r} contains values other than 'no'/'yes' "
        f"(missing values included): {unexpected_values}"
    )
df[target] = encoded_target

X = df[features]
y = df[target]

# Split the data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for numerical features (no categorical features anymore)
numerical_transformer = Pipeline(steps=[
    # Fill NA values with 1.
    # NOTE(review): presumably 1 is the lowest possible grade - confirm this is
    # the intended stand-in for a missing grade.
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, features)]
)

# Create the pipeline: preprocessing is fitted on the training split only and
# re-applied unchanged to the test split by `predict`.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70        32
           1       0.72      0.74      0.73        35

    accuracy                           0.72        67
   macro avg       0.72      0.72      0.72        67
weighted avg       0.72      0.72      0.72        67



In [4]:
import os
import pandas as pd

# File paths
master_file = os.path.join("..", "data", "processed", "Merged_Student_Data_Schoollevel_final.xlsx")
vooropleiding_file = os.path.join("..", "data", "processed", "cleaned_students_vooropleiding.xlsx")

# Load the data
df_master = pd.read_excel(master_file)
df_vooropleiding = pd.read_excel(vooropleiding_file)

# Only 'id' and 'education_level' are taken from the lookup file. Exact
# duplicate (id, education_level) rows carry no extra information, so they are
# dropped up front; otherwise each one would duplicate the matching master row.
education_lookup = df_vooropleiding[['id', 'education_level']].drop_duplicates()

# Merge the files on 'id'.
# how='left' keeps all rows from the master file. validate='many_to_one' makes
# pandas raise a MergeError if an id still occurs more than once in the lookup
# (i.e. with conflicting education levels) instead of silently multiplying
# master rows.
df_merged = pd.merge(
    df_master,
    education_lookup,
    on='id',
    how='left',
    validate='many_to_one'
)

# Remove the 'school level' column if it is present (the column name contains a
# space, not an underscore).
school_level_removed = 'school level' in df_merged.columns
if school_level_removed:
    df_merged = df_merged.drop(columns=['school level'])

# Save the updated file
output_file = os.path.join("..", "data", "processed", "Merged_Final_File.xlsx")
df_merged.to_excel(output_file, index=False)

# Report what actually happened: only claim the column was removed when it was.
if school_level_removed:
    print("Files merged successfully and 'school_level' column removed! Merged file saved to:", output_file)
else:
    print("Files merged successfully ('school level' column not present, nothing removed). Merged file saved to:", output_file)


Files merged successfully and 'school_level' column removed! Merged file saved to: ..\data\processed\Merged_Final_File.xlsx


In [5]:
import os
import pandas as pd

# File paths
master_file = os.path.join("..", "data", "processed", "Merged_Final_File.xlsx")
grades_file = os.path.join("..", "data", "processed", "Grades_Fill1.xlsx")

# Load the files
master_df = pd.read_excel(master_file)
grades_df = pd.read_excel(grades_file)

# Select only the necessary columns from grades_file.
# NOTE(review): 'ANL3 Final Grade' is not merged here, and the modelling cells
# in this notebook read lower-case names ('anl1 final grade' ... 'anl4 final
# grade') from the file written below. Column labels are case-sensitive, so
# presumably those lower-case columns already exist in the master file -
# confirm, otherwise add/rename the columns here.
grades_columns = ['id', 'ANL1 Final Grade', 'ANL2 Final Grade', 'ANL4 Final Grade']
# Exact duplicate rows carry no extra information; dropping them keeps them
# from duplicating the matching master rows in the join.
grades_df = grades_df[grades_columns].drop_duplicates()

# Merge the files based on 'id'.
# how='left' keeps every master row. validate='many_to_one' makes pandas raise
# a MergeError if an id still occurs more than once in the grades (i.e. with
# conflicting grades) instead of silently multiplying master rows.
merged_df = pd.merge(master_df, grades_df, how='left', on='id', validate='many_to_one')

# Save the updated master file
output_file = os.path.join("..", "data", "processed", "Merged_Final_File_Updated.xlsx")
merged_df.to_excel(output_file, index=False)

print(f"Merged file saved to: {output_file}")


Merged file saved to: ..\data\processed\Merged_Final_File_Updated.xlsx
