# Libraries

In [None]:
import csv
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import random

# For reproducability
seed = 2003
random.seed(seed)
np.random.seed(seed)

# Import Datasets

In [47]:
assessments = pd.read_csv(r'..\..\anonymisedData\assessments.csv')
courses = pd.read_csv(r'..\..\anonymisedData\courses.csv')
studentAssessment = pd.read_csv(r'..\..\anonymisedData\studentAssessment.csv')
studentInfo = pd.read_csv(r'..\..\anonymisedData\studentInfo.csv')
studentRegistration = pd.read_csv(r'..\..\anonymisedData\studentRegistration.csv')
studentVle = pd.read_csv(r'..\..\anonymisedData\studentVle.csv')
vle = pd.read_csv(r'..\..\anonymisedData\vle.csv')

# assessments
# courses
# studentAssessment
# studentInfo
# studentRegistration
# studentVle
# vle

In [None]:
# assessments
# courses
# studentAssessment
# studentInfo
# studentRegistration
# studentVle
# vle

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail
32589,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction
32590,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass
32591,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn


# Defined Functions

In [None]:
def compare_df_cols(series1, series2):
    """
    Compare two pandas Series and identify values present in series1 but missing in series2.
    Counts how many times each missing value appears in series1.
    
    Args:
        series1 (pd.Series): The first series to compare (e.g., list of student IDs from registration).
        series2 (pd.Series): The second series to compare against (e.g., list of student IDs from assessment).
    
    Prints:
        Total number of missing rows (values in series1 not found in series2).
        Number of unique missing values.
        The missing values along with their occurrence counts in series1.
    """
    missing_values = series1[~series1.isin(series2)] # Find values in series1 not in series2.
    missing_counts = missing_values.value_counts() # Count how many times each missing ID appears.

    print(f'Total missing rows: {missing_counts.sum()}') # Total rows missing.
    print(f'Number of unique missing IDs: {len(missing_counts)}')  # Unique missing IDs.
    print('Missing IDs and their counts:')
    print(missing_counts)

# Analyse and Preprocess Dataset

## Courses Table

This table provides a list of all modules and their corresponding presentations. It includes the following columns:

* **code\_module**: the identifier for each module.
* **code\_presentation**: the identifier for each presentation, combining the year with either "B" (for February starts) or "J" (for October starts).
* **length**: the duration of the module presentation in days.

Since the structure of B and J presentations may vary, it is advisable to analyse them separately. However, in some cases, one type of presentation (B or J) may not have a counterpart from the previous cycle. In such instances, specifically for the CCC, EEE, and GGG modules, the available J presentation may need to be used to inform the B presentation, or vice versa.

In [31]:
courses.head()

Unnamed: 0,code_module,code_presentation,module_presentation_length
0,AAA,2013J,268
1,AAA,2014J,269
2,BBB,2013J,268
3,BBB,2014J,262
4,BBB,2013B,240


### Missing, Duplicate, and Distinct Values Overview

In [32]:
print('Total null values: \n', courses.isnull().sum(), '\n')
print('Total duplicated values:', courses[courses.duplicated()], '\n')
print('Total unique values: \n', courses.nunique())

Total null values: 
 code_module                   0
code_presentation             0
module_presentation_length    0
dtype: int64 

Total duplicated values: Empty DataFrame
Columns: [code_module, code_presentation, module_presentation_length]
Index: [] 

Total unique values: 
 code_module                   7
code_presentation             4
module_presentation_length    7
dtype: int64


### Data Types and Schema Overview

In [33]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 3 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   code_module                 22 non-null     object
 1   code_presentation           22 non-null     object
 2   module_presentation_length  22 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 660.0+ bytes


## Assessments Table

This table contains details about assessments associated with different module presentations. Each presentation generally includes several assessments and concludes with a final exam. The CSV table includes the following columns:

* **code\_module**: The identifier for the module to which the assessment belongs.
* **code\_presentation**: The identifier for the specific presentation of the module.
* **id\_assessment**: A unique identifier for each assessment.
* **assessment\_type**: The category of the assessment, which can be one of the following: Tutor Marked Assessment (TMA), Computer Marked Assessment (CMA), or Final Exam (Exam).
* **date**: The submission deadline for the assessment, expressed as the number of days from the start of the presentation (which begins at day 0).
* **weight**: The contribution of the assessment to the overall module grade, given as a percentage. Exams typically carry a separate weight of 100%, while the combined weight of all other assessments is also 100%. If the exam date is not provided, it is assumed to occur at the end of the final presentation week.

In [3]:
assessments.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0


### Missing, Duplicate, and Distinct Values Overview

In [12]:
print('Total null values: \n', assessments.isnull().sum(), '\n')
print('Total duplicated values:', assessments[assessments.duplicated()], '\n')
print('Total unique values: \n', assessments.nunique())

Total null values: 
 code_module           0
code_presentation     0
id_assessment         0
assessment_type       0
date                 11
weight                0
dtype: int64 

Total duplicated values: Empty DataFrame
Columns: [code_module, code_presentation, id_assessment, assessment_type, date, weight]
Index: [] 

Total unique values: 
 code_module            7
code_presentation      4
id_assessment        206
assessment_type        3
date                  74
weight                24
dtype: int64


There are 11 assessments with missing submission dates. The dataset includes:

* **7 distinct modules**
* **4 different presentations**: 2013J, 2013B, 2014J, and 2014B. The year indicates when the presentation took place, while "B" represents a February start and "J" an October start.
* **A total of 206 assessments** distributed across all modules
* **3 types of assessments**: Tutor Marked Assessments (TMA), Computer Marked Assessments (CMA), and Final Exams (Exam)

### Data Types and Schema Overview

In [13]:
assessments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code_module        206 non-null    object 
 1   code_presentation  206 non-null    object 
 2   id_assessment      206 non-null    int64  
 3   assessment_type    206 non-null    object 
 4   date               195 non-null    float64
 5   weight             206 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 9.8+ KB


The `id_assessment` values are currently displayed as integers, which is inaccurate. We should convert them to the `object` data type instead.

In [14]:
assessments['id_assessment'] = assessments['id_assessment'].astype(object)

### other

Project brief states that typically, exams have a weight of 100 and the sum of all other assessments is 100. This would man that a module with one exam only would have a weight of 100 and a module with one exam and some assessments would have a weight of 200. Let’s check if this so in the table provided.


In [16]:
# Group by module presentation and sum the weights of assessments
assessments.groupby(['code_module','code_presentation']).agg(total_weight = ('weight',sum))

  assessments.groupby(['code_module','code_presentation']).agg(total_weight = ('weight',sum))


Unnamed: 0_level_0,Unnamed: 1_level_0,total_weight
code_module,code_presentation,Unnamed: 2_level_1
AAA,2013J,200.0
AAA,2014J,200.0
BBB,2013B,200.0
BBB,2013J,200.0
BBB,2014B,200.0
BBB,2014J,200.0
CCC,2014B,300.0
CCC,2014J,300.0
DDD,2013B,200.0
DDD,2013J,200.0


Here we can see most that module presentations have total weight of 200, apart from module CCC which is 300 and module GGG which is 100. Let's have a closer look.

In [17]:
# See what are the weights of exams in module presentations
assessments[assessments['assessment_type'] == 'Exam'].groupby(['code_module','code_presentation', 'assessment_type']).agg(total_weight = ('weight',sum))

  assessments[assessments['assessment_type'] == 'Exam'].groupby(['code_module','code_presentation', 'assessment_type']).agg(total_weight = ('weight',sum))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_weight
code_module,code_presentation,assessment_type,Unnamed: 3_level_1
AAA,2013J,Exam,100.0
AAA,2014J,Exam,100.0
BBB,2013B,Exam,100.0
BBB,2013J,Exam,100.0
BBB,2014B,Exam,100.0
BBB,2014J,Exam,100.0
CCC,2014B,Exam,200.0
CCC,2014J,Exam,200.0
DDD,2013B,Exam,100.0
DDD,2013J,Exam,100.0


All modules show weight of 100 for exams apart from module CCC (for both presentations). Let's count the exams in each module presentation.

In [18]:
# Count how many exams there are in every module presentation
assessments[assessments['assessment_type'] == 'Exam'][['code_module', 'code_presentation', 'id_assessment']]\
.groupby(['code_module', 'code_presentation'])\
.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id_assessment
code_module,code_presentation,Unnamed: 2_level_1
AAA,2013J,1
AAA,2014J,1
BBB,2013B,1
BBB,2013J,1
BBB,2014B,1
BBB,2014J,1
CCC,2014B,2
CCC,2014J,2
DDD,2013B,1
DDD,2013J,1


Module CCC has two exams, this can explain the hight assessments weight for this module. Now let's have a look at all the assignments that are not exams and see if everything is as it should be.

In [19]:
# Sum the weights of all course work assignments per module presentation
assessments[assessments['assessment_type'] != 'Exam'].groupby(['code_module', 'code_presentation']).agg(total_weight = ('weight',sum))

  assessments[assessments['assessment_type'] != 'Exam'].groupby(['code_module', 'code_presentation']).agg(total_weight = ('weight',sum))


Unnamed: 0_level_0,Unnamed: 1_level_0,total_weight
code_module,code_presentation,Unnamed: 2_level_1
AAA,2013J,100.0
AAA,2014J,100.0
BBB,2013B,100.0
BBB,2013J,100.0
BBB,2014B,100.0
BBB,2014J,100.0
CCC,2014B,100.0
CCC,2014J,100.0
DDD,2013B,100.0
DDD,2013J,100.0


Here we see that module GGG doesn't have any weight to its assignments. Is it because there's no assingments for this module?

In [20]:
assessments[assessments['code_module'] == 'GGG'].groupby(['code_module','code_presentation', 'assessment_type']).agg(weight_by_type = ('weight', sum))

  assessments[assessments['code_module'] == 'GGG'].groupby(['code_module','code_presentation', 'assessment_type']).agg(weight_by_type = ('weight', sum))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,weight_by_type
code_module,code_presentation,assessment_type,Unnamed: 3_level_1
GGG,2013J,CMA,0.0
GGG,2013J,Exam,100.0
GGG,2013J,TMA,0.0
GGG,2014B,CMA,0.0
GGG,2014B,Exam,100.0
GGG,2014B,TMA,0.0
GGG,2014J,CMA,0.0
GGG,2014J,Exam,100.0
GGG,2014J,TMA,0.0


Are there any other CMA and TMA assignments with a weight of 0?

In [21]:
assessments[(assessments['assessment_type'] == 'TMA') & (assessments['weight'] == 0)]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
48,BBB,2014J,15020,TMA,19.0,0.0
182,GGG,2013J,37415,TMA,61.0,0.0
183,GGG,2013J,37416,TMA,124.0,0.0
184,GGG,2013J,37417,TMA,173.0,0.0
192,GGG,2014B,37425,TMA,61.0,0.0
193,GGG,2014B,37426,TMA,117.0,0.0
194,GGG,2014B,37427,TMA,166.0,0.0
202,GGG,2014J,37435,TMA,61.0,0.0
203,GGG,2014J,37436,TMA,124.0,0.0
204,GGG,2014J,37437,TMA,173.0,0.0


## Student Assessment Table

This table records the results of students' assessments. If a student does not submit an assessment, no result is captured. Similarly, final exam data is missing if the assessment result was not recorded in the system. The table includes the following columns:

* **id\_assessment**: unique identifier for the assessment.
* **id\_student**: unique identifier for each student.
* **date\_submitted**: the number of days since the start of the module presentation when the student submitted the assessment.
* **is\_banked**: a flag indicating whether the result was carried over from a previous presentation.
* **score**: the student’s score for the assessment, ranging from 0 to 100. Scores below 40 are considered a fail.

In [22]:
studentAssessment.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0


In [26]:
studentAssessment.describe()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
count,173912.0,173912.0,173912.0,173912.0,173739.0
mean,26553.803556,705150.7,116.032942,0.010977,75.799573
std,8829.784254,552395.2,71.484148,0.104194,18.798107
min,1752.0,6516.0,-11.0,0.0,0.0
25%,15022.0,504429.0,51.0,0.0,65.0
50%,25359.0,585208.0,116.0,0.0,80.0
75%,34883.0,634498.0,173.0,0.0,90.0
max,37443.0,2698588.0,608.0,1.0,100.0


### Missing, Duplicate, and Distinct Values Overview

In [25]:
print('Total null values: \n', studentAssessment.isnull().sum(), '\n')
print('Total duplicated values:', studentAssessment[studentAssessment.duplicated()], '\n')
print('Total unique values: \n', studentAssessment.nunique())

Total null values: 
 id_assessment       0
id_student          0
date_submitted      0
is_banked           0
score             173
dtype: int64 

Total duplicated values: Empty DataFrame
Columns: [id_assessment, id_student, date_submitted, is_banked, score]
Index: [] 

Total unique values: 
 id_assessment       188
id_student        23369
date_submitted      312
is_banked             2
score               101
dtype: int64


There are 173 null scores and no duplicated data

### Data Types and Schema Overview

In [28]:
studentAssessment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id_assessment   173912 non-null  int64  
 1   id_student      173912 non-null  int64  
 2   date_submitted  173912 non-null  int64  
 3   is_banked       173912 non-null  int64  
 4   score           173739 non-null  float64
dtypes: float64(1), int64(4)
memory usage: 6.6 MB


In [29]:
studentAssessment['id_assessment'] = studentAssessment['id_assessment'].astype(object)
studentAssessment['id_student'] = studentAssessment['id_student'].astype(object)

Convert the ID fields as object

### Submission Issue

In [30]:
studentAssessment[studentAssessment['score'].isnull()]

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
215,1752,721259,22,0,
937,1754,260355,127,0,
2364,1760,2606802,180,0,
3358,14984,186780,77,0,
3914,14984,531205,26,0,
...,...,...,...,...,...
148929,34903,582670,241,0,
159251,37415,610738,87,0,
166390,37427,631786,221,0,
169725,37435,648110,62,0,


We have 173 rows with null scores. According to the data description, if a student does not submit an assessment, no result is recorded. Similarly, final exam entries are absent when the assessment results are not stored in the system. As such, any missing scores can be interpreted as non-submissions.

**However, it's somewhat strange that some entries with missing scores still have submission dates recorded. Typically, if an assessment hasn't been submitted, the submission date should also be missing. This inconsistency should ideally be clarified with the data providers.**

## Student Registration Table

This table captures the timing of student registrations for module presentations. For students who withdrew, the date of unregistration is also included. It contains five columns:

* **code\_module**: the module's identification code.
* **code\_presentation**: the presentation's identification code.
* **id\_student**: a unique identifier for each student.
* **date\_registration**: the number of days (relative to the start of the module presentation) when the student registered. A negative value (e.g., -30) indicates registration occurred before the presentation began.
* **date\_unregistration**: the number of days (relative to the start of the module presentation) when the student withdrew. This field is empty for students who completed the course. Those who withdrew are marked with "Withdrawal" in the `final_result` column of the `studentInfo.csv` table.


In [34]:
studentRegistration.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,


In [35]:
studentRegistration.describe()

Unnamed: 0,id_student,date_registration,date_unregistration
count,32593.0,32548.0,10072.0
mean,706687.7,-69.4113,49.757645
std,549167.3,49.260522,82.46089
min,3733.0,-322.0,-365.0
25%,508573.0,-100.0,-2.0
50%,590310.0,-57.0,27.0
75%,644453.0,-29.0,109.0
max,2716795.0,167.0,444.0


### Missing, Duplicate, and Distinct Values Overview

In [36]:
print('Total null values: \n', studentRegistration.isnull().sum(), '\n')
print('Total duplicated values:', studentRegistration[studentRegistration.duplicated()], '\n')
print('Total unique values: \n', studentRegistration.nunique())

Total null values: 
 code_module                0
code_presentation          0
id_student                 0
date_registration         45
date_unregistration    22521
dtype: int64 

Total duplicated values: Empty DataFrame
Columns: [code_module, code_presentation, id_student, date_registration, date_unregistration]
Index: [] 

Total unique values: 
 code_module                7
code_presentation          4
id_student             28785
date_registration        332
date_unregistration      416
dtype: int64


There are 28,785 unique students in total, but the dataset contains 32,548 records. This suggests that some students may have registered for a particular module or presentation, later deregistered, and either switched to a different module/presentation or retaken the module in a subsequent session.

### Data Types and Schema Overview

In [37]:
studentRegistration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_module          32593 non-null  object 
 1   code_presentation    32593 non-null  object 
 2   id_student           32593 non-null  int64  
 3   date_registration    32548 non-null  float64
 4   date_unregistration  10072 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [38]:
studentRegistration['id_student'] = studentRegistration['id_student'].astype(object)

### Missing Student ID Check

Let us see if all student IDs in the `StudentRegistration` tables exist in the `StudentAssessment` table.

In [50]:
# Call the function 'compare_df_cols' with the student IDs series.
compare_df_cols(studentRegistration['id_student'], studentAssessment['id_student'])

Total missing rows: 5847
Number of unique missing IDs: 5416
Missing IDs and their counts:
id_student
546400     4
399863     4
387276     4
619887     3
835603     3
          ..
2108914    1
2093092    1
2051809    1
2003628    1
2640965    1
Name: count, Length: 5416, dtype: int64


We observe that approximately 5,847 student records are missing from the `StudentAssessment` table, corresponding to 5,416 unique student IDs. Next, let us perform a similar check for the `StudentInfo` table.

In [54]:
compare_df_cols(studentInfo['id_student'], studentAssessment['id_student'])

Total missing rows: 5847
Number of unique missing IDs: 5416
Missing IDs and their counts:
id_student
546400     4
399863     4
387276     4
619887     3
835603     3
          ..
2108914    1
2093092    1
2051809    1
2003628    1
2640965    1
Name: count, Length: 5416, dtype: int64


Similarly, there are 5,847 student records (5,416 unique students) present in the `StudentInfo` table but missing from the `StudentAssessment` table. Let us continue to investigate this further.

In [57]:
# Get unique id_student.
SR_ID = studentRegistration['id_student'].unique()
SI_ID = studentInfo['id_student'].unique()

# Compare the two lists.
difference = set(SR_ID).difference(set(SI_ID))
difference = len(difference)
difference

0

The missing student IDs across `studentRegistration` and `studentInfo` are the same students