In [2]:
#import required Libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import missingno as msno
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

**DATA PREPROCESSING (preparing data for Analysis)**

DATA CLEANING

In [3]:
#Load datasets
courses = pd.read_csv('~/finalproject/LHL-finalproject/datasets/courses.csv')
assessments = pd.read_csv('~/finalproject/LHL-finalproject/datasets/assessments.csv')
studentinfo = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentInfo.csv')
studentRegistration = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentRegistration.csv')
studentAssessment = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentAssessment.csv')

#Large files, couldn't be loaded to git
studentVle = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentvle.csv', nrows=999999)
vle = pd.read_csv('~/finalproject/LHL-finalproject/datasets/vle.csv')

Cleaning Courses data file

Handling Missing values

In [4]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 3 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   code_module                 22 non-null     object
 1   code_presentation           22 non-null     object
 2   module_presentation_length  22 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 656.0+ bytes


In [13]:
# Check for missing values
missing_values = courses.isnull().sum()
print(missing_values)

code_module                   0
code_presentation             0
module_presentation_length    0
dtype: int64


There are no missing values in courses 

Cleaning Asessments data file

In [15]:
# Check for missing values
missing_values = assessments.isnull().sum()
print(missing_values)

code_module          0
code_presentation    0
id_assessment        0
assessment_type      0
date                 0
weight               0
dtype: int64


In [9]:
assessments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code_module        206 non-null    object 
 1   code_presentation  206 non-null    object 
 2   id_assessment      206 non-null    int64  
 3   assessment_type    206 non-null    object 
 4   date               206 non-null    object 
 5   weight             206 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 9.8+ KB


In [39]:
# Check for missing values in student assessments
missing_values = studentAssessment.isnull().sum()
print(missing_values)

id_assessment     0
id_student        0
date_submitted    0
is_banked         0
score             0
dtype: int64


In [40]:
studentAssessment['score'].isnull()/studentAssessment.shape[0]

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
173907    0.0
173908    0.0
173909    0.0
173910    0.0
173911    0.0
Name: score, Length: 173912, dtype: float64

There are no missing values in assessments
however there are some in scores

According to the data's documentation, score ranges from 0 to 100. Score lower than 40 is interpreted as a Fail. If the student does not submit the assessment, no result is recorded. Most of the missing scores are from final exams

Cleaning studentinfo data file

In [16]:
studentinfo.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [17]:
# Check for missing values
missing_values = studentinfo.isnull().sum()
print(missing_values)

code_module             0
code_presentation       0
id_student              0
gender                  0
region                  0
highest_education       0
imd_band                0
age_band                0
num_of_prev_attempts    0
studied_credits         0
disability              0
final_result            0
dtype: int64


In [21]:
studentinfo['imd_band'].isnull().shape[0]

32593

Cleaning vle data file

In [22]:
vle.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,?,?
1,546712,AAA,2013J,oucontent,?,?
2,546998,AAA,2013J,resource,?,?
3,546888,AAA,2013J,url,?,?
4,547035,AAA,2013J,resource,?,?


this shows missing data in the week_from and week_to columns


In [26]:
assessments.replace(float('nan'), np.nan, inplace=True)

In [31]:
vle['week_from'].isnull()/vle.shape[0]

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
6359    0.0
6360    0.0
6361    0.0
6362    0.0
6363    0.0
Name: week_from, Length: 6364, dtype: float64

week_from and week_to tell us the timeframe certain contents are supposed to be used. Unfortunately, over 82% of this content is missing from the dataset.

Cleaning studentRegistration data file

In [36]:
studentRegistration.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159,?
1,AAA,2013J,28400,-53,?
2,AAA,2013J,30268,-92,12
3,AAA,2013J,31604,-52,?
4,AAA,2013J,32885,-176,?


In [38]:
studentRegistration['date_unregistration']

0          ?
1          ?
2         12
3          ?
4          ?
        ... 
32588      ?
32589      ?
32590      ?
32591    101
32592      ?
Name: date_unregistration, Length: 32593, dtype: object

There are a decent amount of students who withdrew from a course before it started. We will remove these students from our dataset later, since we don't have any academic performance data for them.