In [137]:
# Student grades predicted based on absence - Eaaa, Autumn 2021
# Sila, 23rd August 2021.

In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [139]:
# Column names exactly as they appear in the CSV header (all lowercase), so
# the list can be used directly to select columns from the loaded DataFrame.
col_names = ["student_absence", "grade"]

In [None]:
# Load the semicolon-separated absence/grade export into a DataFrame.
df = pd.read_csv(
    "StudentAbsenceGrade (1).csv",
    sep=";",
)

In [141]:
# Sanity check of the loaded frame: list the column labels, then preview the
# first rows (the rest of the notebook reads `student_absence` and `grade`).
print("Columns in dataset:", df.columns.tolist())
print("First few rows:", df.head(), sep="\n")

Columns in dataset: ['student_absence', 'grade']
First few rows:
   student_absence  grade
0                0     10
1               27      7
2                0      7
3                0     10
4               65      0


In [142]:
# Summary statistics for every column (include="all" also covers non-numeric ones).
print(df.describe(include="all"))

       student_absence       grade
count       156.000000  156.000000
mean         26.615385    6.346154
std          18.953751    3.676699
min           0.000000    0.000000
25%           8.750000    4.000000
50%          24.500000    7.000000
75%          35.000000   10.000000
max          78.000000   12.000000


In [143]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [144]:
# Per-column count of remaining missing values (displayed as the cell result).
df.isna().sum()

student_absence    0
grade              0
dtype: int64

In [145]:
# Collapse the Danish grades into three classes, as the exercise asks:
#   low    -> 0   (grades 0 and 2)
#   middle -> 7   (grades 4 and 7)
#   high   -> 12  (grades 10 and 12)

# Coerce both columns to numbers (unparseable entries become NaN), then
# discard every row that contains a NaN.
df = df.assign(
    grade=pd.to_numeric(df["grade"], errors="coerce"),
    student_absence=pd.to_numeric(df["student_absence"], errors="coerce"),
).dropna()

# Only 2, 4 and 10 need rewriting; 0, 7 and 12 already equal their class label.
df["grade"] = df["grade"].replace({2: 0, 4: 7, 10: 12})

print("Grade distribution after cleaning:")
print(df["grade"].value_counts().sort_index())

Grade distribution after cleaning:
grade
0     34
7     79
12    43
Name: count, dtype: int64


In [146]:
# Print the frame after grade recoding to eyeball the result.
print(df)

     student_absence  grade
0                  0     12
1                 27      7
2                  0      7
3                  0     12
4                 65      0
..               ...    ...
151               23      7
152               62      0
153               38      7
154               54      0
155                8     12

[156 rows x 2 columns]


In [147]:
# Prints the same frame again; nothing modifies df between this cell and the previous one.
# NOTE(review): looks like an accidental duplicate of the preceding cell — consider removing.
print(df)

     student_absence  grade
0                  0     12
1                 27      7
2                  0      7
3                  0     12
4                 65      0
..               ...    ...
151               23      7
152               62      0
153               38      7
154               54      0
155                8     12

[156 rows x 2 columns]


In [148]:
# Feature: absence column; target: the recoded grade column.
X, y = df["student_absence"], df["grade"]

In [149]:
# Option 1 - turn the single feature into an (n_samples, 1) column array
# before it is handed to the estimators.
X = np.reshape(np.array(X), (-1, 1))

In [150]:
# Print the target Series (the `grade` column selected from df).
print(y)

0      12
1       7
2       7
3      12
4       0
       ..
151     7
152     0
153     7
154     0
155    12
Name: grade, Length: 156, dtype: int64


In [151]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

# Report the size of each partition.
print("Shape of X_train = ", X_train.shape)
print("Shape of y_train = ", y_train.shape)
print("Shape of X_test = ", X_test.shape)
print("Shape of y_test = ", y_test.shape)

Shape of X_train =  (124, 1)
Shape of y_train =  (124,)
Shape of X_test =  (32, 1)
Shape of y_test =  (32,)


In [152]:
from sklearn.neural_network import MLPClassifier

# Baseline network: one hidden layer of 8 units, L-BFGS solver, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(8,),
    solver="lbfgs",
    alpha=1e-5,
    random_state=1,
)

# fit() returns the estimator, so the cell displays its parameters.
mlp.fit(X_train, y_train)

0,1,2
,hidden_layer_sizes,"(8,)"
,activation,'relu'
,solver,'lbfgs'
,alpha,1e-05
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [153]:
from sklearn.metrics import classification_report,confusion_matrix

In [154]:
# Score the baseline MLP on the held-out split.
predictions = mlp.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print(matrix)
print(classification_report(y_true=y_test, y_pred=predictions))

[[ 0  6  0]
 [ 1 15  0]
 [ 1  4  5]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           7       0.60      0.94      0.73        16
          12       1.00      0.50      0.67        10

    accuracy                           0.62        32
   macro avg       0.53      0.48      0.47        32
weighted avg       0.61      0.62      0.57        32

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           7       0.60      0.94      0.73        16
          12       1.00      0.50      0.67        10

    accuracy                           0.62        32
   macro avg       0.53      0.48      0.47        32
weighted avg       0.61      0.62      0.57        32



In [155]:
# EXPERIMENT 1: counter the class imbalance with per-sample weights.
# "balanced" weights are computed for each training row and passed to fit()
# through sample_weight (not through a class_weight constructor argument).
# Note that this network also differs from the baseline in architecture
# (two hidden layers of 6 units) and in its iteration budget.
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

mlp_balanced = MLPClassifier(
    hidden_layer_sizes=(6, 6),
    solver="lbfgs",
    alpha=1e-5,
    max_iter=50000,
    random_state=1,
)
mlp_balanced.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate on the same held-out split as the baseline.
predictions_balanced = mlp_balanced.predict(X_test)
print("=== WITH CLASS BALANCING ===")
print(confusion_matrix(y_test, predictions_balanced))
print(classification_report(y_test, predictions_balanced))

=== WITH CLASS BALANCING ===
[[ 5  1  0]
 [ 3 13  0]
 [ 0  5  5]]
              precision    recall  f1-score   support

           0       0.62      0.83      0.71         6
           7       0.68      0.81      0.74        16
          12       1.00      0.50      0.67        10

    accuracy                           0.72        32
   macro avg       0.77      0.72      0.71        32
weighted avg       0.77      0.72      0.71        32



In [156]:
# EXPERIMENT 2: random forest with class_weight="balanced".
from sklearn.ensemble import RandomForestClassifier

# 1000 trees, each capped at depth 10; the seed is fixed for repeatable results.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    class_weight="balanced",
    random_state=1,
)
rf.fit(X_train, y_train)

# Evaluate on the held-out split.
predictions_rf = rf.predict(X_test)
print("=== RANDOM FOREST WITH BALANCED CLASSES ===")
print(confusion_matrix(y_test, predictions_rf))
print(classification_report(y_test, predictions_rf))

=== RANDOM FOREST WITH BALANCED CLASSES ===
[[6 0 0]
 [6 9 1]
 [1 3 6]]
              precision    recall  f1-score   support

           0       0.46      1.00      0.63         6
           7       0.75      0.56      0.64        16
          12       0.86      0.60      0.71        10

    accuracy                           0.66        32
   macro avg       0.69      0.72      0.66        32
weighted avg       0.73      0.66      0.66        32



In [157]:
# Sweep the forest size to see how test accuracy responds to the number of
# trees. Every other hyperparameter (balanced class weights, seed, depth cap)
# is held fixed so the runs differ only in n_estimators.
from sklearn.metrics import accuracy_score  # imported once, not on every iteration

print("=== TESTING DIFFERENT NUMBERS OF TREES ===\n")

for n_trees in [10, 50, 100, 500, 1000]:
    rf_test = RandomForestClassifier(n_estimators=n_trees,
                                     class_weight='balanced',
                                     random_state=1,
                                     max_depth=10)
    rf_test.fit(X_train, y_train)
    predictions_test = rf_test.predict(X_test)

    acc = accuracy_score(y_test, predictions_test)
    # \u2192 is the right-arrow character; written as an escape so the source
    # stays ASCII and cannot be mangled by a wrong-encoding round trip.
    print(f"Trees: {n_trees:4d} \u2192 Accuracy: {acc:.2%}")

# \U0001F4A1 is the light-bulb emoji (same reason for using an escape).
print("\n\U0001F4A1 Notice: After ~100 trees, accuracy stabilizes!")
print("More trees = longer training time, but not necessarily better results.")

=== TESTING DIFFERENT NUMBERS OF TREES ===

Trees:   10 → Accuracy: 75.00%
Trees:   50 → Accuracy: 65.62%
Trees:  100 → Accuracy: 65.62%
Trees:  500 → Accuracy: 65.62%
Trees: 1000 → Accuracy: 65.62%

💡 Notice: After ~100 trees, accuracy stabilizes!
More trees = longer training time, but not necessarily better results.
