# Coursework 1
## Part 3. Detecting problematic modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Directory that holds the input CSV files; the loading cell lists this
# directory and reads its files with pd.read_csv. The path is relative,
# so it is resolved against the current working directory.
DATA_PATH = "./Data"

In [3]:
# --- Load -------------------------------------------------------------------
# Read every CSV file found in DATA_PATH into a dict keyed by file name.
# Entries that are not CSV files (sub-directories, other file types) are
# skipped, because pd.read_csv would fail on them or load meaningless frames.
df_dict = {}

for file_ in sorted(os.listdir(DATA_PATH)):
    if not file_.lower().endswith(".csv"):
        continue
    df_dict[file_] = pd.read_csv(os.path.join(DATA_PATH, file_))

# --- Normalise column names -------------------------------------------------
# Columns are renamed positionally so that the join keys ("progress_id",
# "student_id", "course_id") carry the same name in every table.
df_dict["progresses.csv"].columns = ["progress_id", "student_id", "course_id"]

df_dict["students.csv"].columns = ['id_', 'student_id', 'city', 'birthday']
# Unparseable birthdays become NaT instead of raising.
df_dict["students.csv"]["birthday"] = pd.to_datetime(df_dict["students.csv"]["birthday"], errors="coerce")

df_dict["courses.csv"].columns = ["index", "course_id", "course_title", "course_field"]
df_dict["courses.csv"] = df_dict["courses.csv"].set_index("index")

# --- Merge ------------------------------------------------------------------
# progresses -> progress phases -> courses -> course contents.
# Outer joins keep rows that have no match on the other side, so the merged
# frame contains missing values wherever one of the tables has no counterpart.
dataset = (
    df_dict["progresses.csv"]
    .merge(df_dict["progress_phases.csv"], on="progress_id", how="outer")
    .merge(df_dict["courses.csv"], on="course_id", how="outer")
    .merge(
        df_dict["course_contents.csv"],
        on=["course_id", "lesson_number", "module_number"],
        how="outer",
    )
)

# --- Parse timestamps -------------------------------------------------------
# Both date columns get the same treatment: parse (bad values -> NaT), then
# drop the time-zone information so the columns are plain datetime64 values.
for date_column in ("start_date", "finish_date"):
    dataset[date_column] = (
        pd.to_datetime(dataset[date_column], errors="coerce", utc=False)
        .dt.tz_localize(None)
    )

# --- Student ages -----------------------------------------------------------
# Age is measured in whole years at the start of the reference year.
REFERENCE_DATE = pd.Timestamp(year=2022, month=1, day=1)
# Mean Gregorian year, the same year length numpy uses for the "Y" unit.
SECONDS_PER_YEAR = 365.2425 * 24 * 60 * 60

dset_students = df_dict["students.csv"].copy(deep=True)
# .astype("timedelta64[Y]") is not supported by pandas >= 2.0, so the number of
# whole years is computed explicitly. np.trunc keeps the result a float column
# in which missing birthdays stay NaN.
dset_students["age_years"] = np.trunc(
    (REFERENCE_DATE - dset_students["birthday"]).dt.total_seconds() / SECONDS_PER_YEAR
)

print("Main dataset:", dataset.dtypes, dataset.columns, sep="\n", end="\n\n")
print("Student dataset:", dset_students.dtypes, dset_students.columns, sep="\n")

Main dataset:
progress_id              object
student_id               object
course_id                object
module_number           float64
lesson_number           float64
status                   object
start_date       datetime64[ns]
finish_date      datetime64[ns]
course_title             object
course_field             object
module_title             object
lesson_title             object
lesson_token             object
is_video                 object
is_homework              object
dtype: object
Index(['progress_id', 'student_id', 'course_id', 'module_number',
       'lesson_number', 'status', 'start_date', 'finish_date', 'course_title',
       'course_field', 'module_title', 'lesson_title', 'lesson_token',
       'is_video', 'is_homework'],
      dtype='object')

Student dataset:
id_                    int64
student_id            object
city                  object
birthday      datetime64[ns]
age_years            float64
dtype: object
Index(['id_', 'student_id', 'city', 'birth

### Questions

* How many modules are left unfinished?
* How long does each lesson take?
* How long are the gaps between lessons?
* Total module duration divided by the number of homeworks
* Difference between the mean and the median completion time

### Making a course-module id to avoid nested groupby
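A module is identified only by its number, so `module_number` on its own does not tell modules of different courses apart.

Joining `course_id` and `module_number` gives a single key per module, so one `groupby` is enough.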

In [28]:
# Build one key per module: "<course_id>_<module_number>".
# module_number is a float column, so its text form keeps the ".0" suffix
# (e.g. "..._2.0").
module_number_text = dataset["module_number"].astype("str")

# astype("str") turns a missing module number into the literal text "nan",
# which would create a bogus "<course_id>_nan" module. Masking with notna()
# keeps the id missing for those rows, so groupby / nunique ignore them.
dataset["course-module_id"] = (
    dataset["course_id"] + "_" + module_number_text
).where(dataset["module_number"].notna())

dataset["course-module_id"].sample(n=10)

138482    abce125a877c2196a3bc7bfbc11b5fc5_2.0
324812    ccbab00e99b27f83369d6cc89d914ad6_1.0
144223    17013cd19d25cb3f28dc1b2683721bb9_2.0
197165    e9bb9205eeed307ee7cbaa08bfd166c3_6.0
323699    ccbab00e99b27f83369d6cc89d914ad6_1.0
39515     bf27a4bf4ada4c756451703ea62a914f_3.0
180347    dad6f6ffc086caa89e2f40c28a9c7490_1.0
196655    e9bb9205eeed307ee7cbaa08bfd166c3_2.0
46531     bf27a4bf4ada4c756451703ea62a914f_4.0
181184    dad6f6ffc086caa89e2f40c28a9c7490_8.0
Name: course-module_id, dtype: object

How many unique modules?

In [29]:
# Number of distinct module ids; missing ids are not counted.
len(dataset["course-module_id"].dropna().unique())

230

### Time to complete module - minimum

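Before measuring durations, check how progresses relate to students within a module:

* modules with 1 homework: does 1 student correspond to 1 progress?
* modules with 2 or more homeworks: does 1 student correspond to many progresses?

The cell below computes the progress / student ratio for every module.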

In [97]:
# For every module, compare the number of distinct progresses with the number
# of distinct students among its homework rows. A ratio of 1 means that each
# student has exactly one progress in that module.
homework_rows = dataset[dataset["is_homework"] == True]

temp_list = []
for module_id_, module_gr_ in homework_rows.groupby("course-module_id"):
    n_progresses = module_gr_["progress_id"].nunique()
    n_students = module_gr_["student_id"].nunique()
    temp_list.append(n_progresses / n_students)

# Distinct ratios observed across all modules.
pd.Series(temp_list).unique()

array([1.])

In [85]:
# Mean and median homework completion time per module, for modules that have
# exactly one homework lesson.
#
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
homework_rows = dataset[dataset["is_homework"] == True]

agg_columns = [
    "course-module_id",
    "course_title",
    "module_number",
    "average_time",
    "median_time",
]
agg_records = []

for module_id_, module_gr_ in homework_rows.groupby("course-module_id"):

    if module_gr_["lesson_number"].nunique() != 1:
        # TODO: modules with several homework lessons are not aggregated yet.
        # Their duration presumably has to span from the first homework
        # (min lesson_number) to the last one (max lesson_number).
        continue

    # Time between starting and finishing the homework, one value per row;
    # rows with a missing start or finish date give NaT.
    homework_duration = module_gr_["finish_date"] - module_gr_["start_date"]

    agg_records.append({
        "course-module_id": module_id_,
        "course_title": module_gr_["course_title"].iloc[0],
        "module_number": module_gr_["module_number"].iloc[0],
        "average_time": homework_duration.mean(),
        "median_time": homework_duration.median(),
    })

# Passing the column list keeps the frame well-formed (and set_index working)
# even when no module qualified.
data_agg = pd.DataFrame(agg_records, columns=agg_columns).set_index("course-module_id")
data_agg

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0_level_0,course_title,module_number,average_time,median_time
course-module_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
04ba6d0b40670c43a209141fa01fa784_1.0,Анимация интерфейсов,1.0,11 days 19:41:17.659012238,1 days 22:31:56.639415
04ba6d0b40670c43a209141fa01fa784_10.0,Анимация интерфейсов,10.0,13 days 09:09:49.877145857,6 days 01:01:22.889799500
04ba6d0b40670c43a209141fa01fa784_11.0,Анимация интерфейсов,11.0,26 days 07:41:06.099055628,12 days 23:33:59.876374
04ba6d0b40670c43a209141fa01fa784_12.0,Анимация интерфейсов,12.0,NaT,NaT
04ba6d0b40670c43a209141fa01fa784_13.0,Анимация интерфейсов,13.0,24 days 15:00:26.129520963,9 days 17:02:13.213271
...,...,...,...,...
e9bb9205eeed307ee7cbaa08bfd166c3_5.0,JavaScript с нуля,5.0,17 days 10:51:14.284782232,10 days 00:24:29.275534
e9bb9205eeed307ee7cbaa08bfd166c3_6.0,JavaScript с нуля,6.0,16 days 07:50:56.554498042,8 days 13:22:11.486400
e9bb9205eeed307ee7cbaa08bfd166c3_7.0,JavaScript с нуля,7.0,20 days 16:23:08.435682182,11 days 05:01:06.596879
e9bb9205eeed307ee7cbaa08bfd166c3_8.0,JavaScript с нуля,8.0,14 days 09:55:40.627778645,5 days 20:00:10.100321


### Time to complete module - maximum

### Time to complete module - average

### Time to complete module - median