# Exploratory Data Analysis
Using EDA I will analyse the data to identify trends and correlations between variables and the final grade.

> Import Student data.

In [None]:
import pandas as pd

# Read the student records (comma-separated) into a DataFrame
df = pd.read_csv(
    'data/student/x.csv',
    sep=',',
)

# Show the first rows as a quick sanity check of the load
df.head()

> Checking skewness of the final grade distribution

In [None]:
# plot histogram of final grade (G3)
# Drawn through the pandas plotting API: no visible cell imports
# matplotlib.pyplot, so the bare `plt` name fails with NameError on a
# fresh kernel. `Series.plot.hist` returns the Axes we then label.
ax = df['G3'].plot.hist()
ax.set_xlabel('G3')
ax.set_ylabel('Frequency')
# trailing semicolon suppresses the Text(...) repr in the cell output
ax.set_title('Final Grade distribution');

The data looks approximately normally distributed, with a modal score of around 11-12.

> Feature Selection
* Correlation coefficient to determine the most useful variables

In [None]:
# obtain variables with correlation to final grade and sort
# Restrict to numeric columns first: the frame also holds string (object)
# columns (they are one-hot encoded further down), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of dropping it.
df.select_dtypes('number').corr()['G3'].sort_values()

Anomaly: absences show a positive correlation with the final grade, whereas a negative correlation would be expected.

> One-hot encoding categorical variables

In [None]:
# select categorical values only
categorical_df = df.select_dtypes('object')

# ensure there are no null values
# A bare `categorical_df.isnull()` in the middle of a cell is computed and
# thrown away, so it verified nothing; assert the invariant explicitly.
assert not categorical_df.isnull().values.any(), 'categorical columns contain null values'

categorical_df.head()

In [None]:
# One-hot encode the categorical columns and attach the target (G3)
ohe_df = pd.get_dummies(categorical_df).assign(G3=df['G3'])

# rank the encoded columns by their correlation with the final grade
target_corr = ohe_df.corr()['G3']
target_corr.sort_values()

> return most correlated variables including both numerical and categorical

In [None]:
def getMostCorrelated(df, size, target='G3'):
    """Return the columns most strongly correlated with the target grade.

    Categorical columns are one-hot encoded, the absolute correlation of
    every column with ``target`` is computed, and the encoded frame is
    reduced to the top-ranked columns. The ranking is printed as a side
    effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``target`` column. The caller's frame is not
        modified.
    size : int
        Number of feature columns to retain in addition to the target.
    target : str, default 'G3'
        Name of the column to correlate against.

    Returns
    -------
    pandas.DataFrame
        The one-hot encoded frame restricted to the ``size + 1`` highest
        ranked columns, in descending order of absolute correlation.
    """
    # One-hot encode categorical variables (numeric columns pass through)
    encoded = pd.get_dummies(df)

    # Absolute correlation with the target, strongest first
    ranking = encoded.corr()[target].abs().sort_values(ascending=False)

    # The target correlates perfectly with itself, so take one extra slot;
    # .iloc makes the positional slice explicit on a label-indexed Series.
    ranking = ranking.iloc[:size + 1]
    print(ranking)

    return encoded.loc[:, ranking.index]

In [None]:
# Preview the six features ranked highest against G3 (G3 itself is kept too)
getMostCorrelated(df, size=6)

> split the data into training and testing data

In [None]:
# Split the data for training and testing
# split: 25%
from sklearn.model_selection import train_test_split

def split_data(df):
    """Split ``df`` into train and test partitions with a 75/25 ratio.

    The whole frame is passed as the feature matrix, so the returned
    feature partitions still contain the ``G3`` column; ``G3`` is also
    returned separately as the target. The split is seeded
    (``random_state=42``) so it is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        df,
        df['G3'],
        test_size=0.25,
        random_state=42,
    )
    # train_test_split hands back a list; callers receive a 4-tuple
    return tuple(partitions)

In [None]:
# Keep the six features most correlated with G3 (the same size used in the
# preview call earlier). The call previously omitted the required `size`
# argument, which raises TypeError.
X_train, X_test, y_train, y_test = split_data(getMostCorrelated(df, 6))
X_train.head()

> Baseline metrics

In [122]:
import numpy as np

# get the median of the final grade (training rows only)
median_pred = X_train['G3'].median()
print(median_pred)

# naive baseline: predict the training median for every test row
predictions = [median_pred] * len(X_test)
print(predictions)

# ground-truth final grades of the held-out rows
true_labels = X_test['G3']
# was `print(true)`, an undefined name that raised NameError
print(true_labels)

11.0
[11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0]
78     10
371    12
248     5
55     10
390     9
       ..
367     0
210     8
75     10
104    18
374    19
Name: G3, Length: 99, dtype: int64


final grade median: 11.0

In [125]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Display the naive baseline metrics

baseline_mae = mean_absolute_error(true_labels, predictions)
# RMSE is taken as the square root of the MSE rather than via
# `squared=False`: that keyword is deprecated and later removed in newer
# scikit-learn releases, while this form works on every version.
baseline_rmse = mean_squared_error(true_labels, predictions) ** 0.5

print('Median Baseline  MAE: {:.4f}'.format(baseline_mae))
print('Median Baseline RMSE: {:.4f}'.format(baseline_rmse))

Median Baseline  MAE: 3.7879
Median Baseline RMSE: 4.8252


> comparison of models against baseline