In [2]:
#import required Libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import missingno as msno
from plotnine import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

**DATA PREPROCESSING (preparing data for Analysis)**

DATA CLEANING

In [3]:
#Load datasets
courses = pd.read_csv('~/finalproject/LHL-finalproject/datasets/courses.csv')
assessments = pd.read_csv('~/finalproject/LHL-finalproject/datasets/assessments.csv')
studentinfo = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentInfo.csv')
studentRegistration = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentRegistration.csv')
studentAssessment = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentAssessment.csv')

#Large files, couldn't be loaded to git
studentVle = pd.read_csv('~/finalproject/LHL-finalproject/datasets/studentvle.csv', nrows=999999)
vle = pd.read_csv('~/finalproject/LHL-finalproject/datasets/vle.csv')

Cleaning Courses data file

Handling Missing values

In [4]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 3 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   code_module                 22 non-null     object
 1   code_presentation           22 non-null     object
 2   module_presentation_length  22 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 656.0+ bytes


In [13]:
# Check for missing values
missing_values = courses.isnull().sum()
print(missing_values)

code_module                   0
code_presentation             0
module_presentation_length    0
dtype: int64


There are no missing values in courses 

Cleaning Asessments data file

In [42]:
assessments.replace(0.0, pd.NA, inplace=True)

In [48]:
assessments.replace('?', pd.NA, inplace=True)

In [52]:
#get a random sample of assessments to confirm presence of strange values
random_sample = assessments.sample(n=20, random_state=42)  
print(random_sample)


    code_module code_presentation  id_assessment assessment_type  date weight
15          BBB             2013B          14994             CMA   159    1.0
9           AAA             2014J           1761             TMA   166   20.0
201         GGG             2014J          37443             CMA   229   <NA>
82          DDD             2013B          25335             TMA    53   10.0
68          CCC             2014J          24291             TMA    32    9.0
97          DDD             2014B          25357             TMA    74   17.5
180         GGG             2013J          37422             CMA   229   <NA>
163         FFF             2014J          34904             CMA   241   <NA>
148         FFF             2013J          34877             TMA   173   25.0
183         GGG             2013J          37416             TMA   124   <NA>
101         DDD             2014B          25361            Exam   241  100.0
165         FFF             2014J          34906             CMA

Seems strange values like '?' have been taken care of

In [50]:
# Check for missing values
missing_values = assessments.isnull().sum()
print(missing_values)

code_module           0
code_presentation     0
id_assessment         0
assessment_type       0
date                 11
weight               56
dtype: int64


In [53]:
assessments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   code_module        206 non-null    object
 1   code_presentation  206 non-null    object
 2   id_assessment      206 non-null    int64 
 3   assessment_type    206 non-null    object
 4   date               195 non-null    object
 5   weight             150 non-null    object
dtypes: int64(1), object(5)
memory usage: 9.8+ KB


11 Final exams dates are missing
some courses have weights missing for assignments

**Student Assessment**

In [56]:
#get a random sample of assessments to confirm presence of strange values
random_sample = studentAssessment.sample(n=5, random_state=42)  
print(random_sample)

        id_assessment  id_student  date_submitted  is_banked score
58822           24295      335914              21          0    78
158625          34910      648903             235          0    65
81731           25353      103800             207          0    41
68200           25337      537926             116          0    15
160473          37417      575559             171          0    80


the isbanked column is mostly empty, drop column.

In [59]:
studentAssessment.replace(0.0, pd.NA, inplace=True)
studentAssessment.shape

(173912, 5)

In [58]:
# Check for missing values in student assessments
missing_values = studentAssessment.isnull().sum()
print(missing_values)

id_assessment          0
id_student             0
date_submitted        61
is_banked         172003
score                  0
dtype: int64


In [68]:
#get a random sample to confirm presence of strange values
random_sample = studentAssessment.sample(n=5, random_state=42)  
print(random_sample)


        id_assessment  id_student date_submitted is_banked score
58822           24295      335914             21      <NA>    78
158625          34910      648903            235      <NA>    65
81731           25353      103800            207      <NA>    41
68200           25337      537926            116      <NA>    15
160473          37417      575559            171      <NA>    80


In [66]:
# Check if there are any occurrences of 0.0 in the 'score' column
zero_occurrences = (studentAssessment['score'] == 0.0).any()

if zero_occurrences:
    print("There are occurrences of 0.0 in the 'score' column.")
else:
    print("There are no occurrences of 0.0 in the 'score' column.")


There are no occurrences of 0.0 in the 'score' column.


Cleaning studentinfo data file

In [69]:
studentinfo.replace(0.0, pd.NA, inplace=True)
studentinfo.shape

(32593, 12)

In [70]:
studentinfo.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,,60,N,Pass


In [71]:
# Check for missing values
missing_values = studentinfo.isnull().sum()
print(missing_values)

code_module                 0
code_presentation           0
id_student                  0
gender                      0
region                      0
highest_education           0
imd_band                    0
age_band                    0
num_of_prev_attempts    28421
studied_credits             0
disability                  0
final_result                0
dtype: int64


In [74]:
#get a random sample to confirm presence of strange values
random_sample = studentinfo.sample(n=5, random_state=42)  
print(random_sample)


      code_module code_presentation  id_student gender                region  \
21105         EEE             2014B     2632165      M                 Wales   
30047         FFF             2014J     2678580      M  North Western Region   
7302          BBB             2014J      642122      F              Scotland   
12840         CCC             2014J     1101190      M  North Western Region   
29147         FFF             2014J      656579      F     South West Region   

           highest_education imd_band age_band num_of_prev_attempts  \
21105  A Level or Equivalent   70-80%     0-35                 <NA>   
30047  A Level or Equivalent   60-70%     0-35                 <NA>   
7302        HE Qualification   20-30%    35-55                 <NA>   
12840       HE Qualification   30-40%    35-55                 <NA>   
29147  A Level or Equivalent   50-60%     0-35                 <NA>   

       studied_credits disability final_result  
21105               60          N         P

In [21]:
studentinfo['imd_band'].isnull().shape[0]

32593

Cleaning vle data file

In [22]:
vle.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,?,?
1,546712,AAA,2013J,oucontent,?,?
2,546998,AAA,2013J,resource,?,?
3,546888,AAA,2013J,url,?,?
4,547035,AAA,2013J,resource,?,?


this shows missing data in the week_from and week_to columns


In [26]:
assessments.replace(float('nan'), np.nan, inplace=True)

In [31]:
vle['week_from'].isnull()/vle.shape[0]

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
6359    0.0
6360    0.0
6361    0.0
6362    0.0
6363    0.0
Name: week_from, Length: 6364, dtype: float64

week_from and week_to tell us the timeframe certain contents are supposed to be used. Unfortunately, over 82% of this content is missing from the dataset.

Cleaning studentRegistration data file

In [36]:
studentRegistration.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159,?
1,AAA,2013J,28400,-53,?
2,AAA,2013J,30268,-92,12
3,AAA,2013J,31604,-52,?
4,AAA,2013J,32885,-176,?


In [38]:
studentRegistration['date_unregistration']

0          ?
1          ?
2         12
3          ?
4          ?
        ... 
32588      ?
32589      ?
32590      ?
32591    101
32592      ?
Name: date_unregistration, Length: 32593, dtype: object

There are a decent amount of students who withdrew from a course before it started. We will remove these students from our dataset later, since we don't have any academic performance data for them.