# Coursework 1
## Part 3. Detecting problematic modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Directory whose entries are read with pd.read_csv by the data-loading cell.
DATA_PATH = "./Data"

In [3]:
# Load every CSV file found in DATA_PATH into a dict keyed by file name
# (e.g. "students.csv"), so later code can look tables up by name.
df_dict = {}

# os.listdir returns every directory entry, so filter on the extension:
# stray entries such as ".ipynb_checkpoints" or ".DS_Store" would otherwise be
# handed to pd.read_csv and either crash or be misparsed.
# sorted() makes the load order independent of the file system.
for file_ in sorted(os.listdir(DATA_PATH)):
    if not file_.lower().endswith(".csv"):
        continue
    df_dict[file_] = pd.read_csv(os.path.join(DATA_PATH, file_))
    
# Give each raw table explicit column names; names are assigned by position.
df_dict["progresses.csv"] = df_dict["progresses.csv"].set_axis(
    ["progress_id", "student_id", "course_id"], axis="columns"
)

# Students: name the columns, then parse birthdays (unparseable values -> NaT).
df_dict["students.csv"] = (
    df_dict["students.csv"]
    .set_axis(["id_", "student_id", "city", "birthday"], axis="columns")
    .assign(birthday=lambda students_: pd.to_datetime(students_["birthday"], errors="coerce"))
)

# Courses: name the columns and use the "index" column as the row index.
df_dict["courses.csv"] = (
    df_dict["courses.csv"]
    .set_axis(["index", "course_id", "course_title", "course_field"], axis="columns")
    .set_index("index")
)

# Assemble the main table: progresses -> progress phases -> courses -> course contents.
# NOTE(review): every join is an outer join, so rows without a match on either
# side are kept and get NaN in the columns of the other table — confirm this is intended.
dataset = (
    df_dict["progresses.csv"]
    .merge(df_dict["progress_phases.csv"], on="progress_id", how="outer")
    .merge(df_dict["courses.csv"], on="course_id", how="outer")
    .merge(
        df_dict["course_contents.csv"],
        on=["course_id", "lesson_number", "module_number"],
        how="outer",
    )
)

# Parse both timestamp columns the same way: invalid values become NaT and
# any time-zone information is dropped so the columns are tz-naive.
for date_col_ in ("start_date", "finish_date"):
    parsed_ = pd.to_datetime(dataset[date_col_], errors="coerce", utc=False)
    dataset[date_col_] = parsed_.dt.tz_localize(None)

# Work on a deep copy so the students table stored in df_dict is not modified.
dset_students = df_dict["students.csv"].copy(deep=True)

# Age in whole years as of 1 January 2022, kept as float so that missing
# birthdays (NaT) become NaN.
# The former `.astype("timedelta64[Y]")` cast is rejected by pandas >= 2.0
# (only s/ms/us/ns resolutions are supported), so the year count is derived
# from whole days using the mean year length NumPy uses for the "Y" unit
# (365.2425 days).
days_since_birth_ = (pd.Timestamp("2022-01-01") - dset_students["birthday"]).dt.days
dset_students["age_years"] = np.floor(days_since_birth_ / 365.2425)

# Show dtypes and column names of both tables, separated by one blank line.
print("Main dataset:", dataset.dtypes, dataset.columns, sep="\n")
print()
print("Student dataset:", dset_students.dtypes, dset_students.columns, sep="\n")

Main dataset:
progress_id              object
student_id               object
course_id                object
module_number           float64
lesson_number           float64
status                   object
start_date       datetime64[ns]
finish_date      datetime64[ns]
course_title             object
course_field             object
module_title             object
lesson_title             object
lesson_token             object
is_video                 object
is_homework              object
dtype: object
Index(['progress_id', 'student_id', 'course_id', 'module_number',
       'lesson_number', 'status', 'start_date', 'finish_date', 'course_title',
       'course_field', 'module_title', 'lesson_title', 'lesson_token',
       'is_video', 'is_homework'],
      dtype='object')

Student dataset:
id_                    int64
student_id            object
city                  object
birthday      datetime64[ns]
age_years            float64
dtype: object
Index(['id_', 'student_id', 'city', 'birth

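### Making a course-module id to avoid nested groupby

A module is identified only by its number within a course, so the same module number appears in different courses.
Joining `course_id` and `module_number` gives every module a single unique key to group by.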

In [28]:
# Build one key per (course, module): "<course_id>_<module_number>".
# astype("str") turns a missing module_number into the text "nan", which would
# create fake ids such as "<course_id>_nan"; .where() leaves the id missing for
# those rows so they are ignored by nunique() and groupby().
dataset["course-module_id"] = (
    dataset["course_id"] +
    "_" +
    dataset["module_number"].astype("str")
).where(dataset["module_number"].notna())

# Fixed random_state so the displayed sample is the same on every run.
dataset["course-module_id"].sample(n=10, random_state=0)

138482    abce125a877c2196a3bc7bfbc11b5fc5_2.0
324812    ccbab00e99b27f83369d6cc89d914ad6_1.0
144223    17013cd19d25cb3f28dc1b2683721bb9_2.0
197165    e9bb9205eeed307ee7cbaa08bfd166c3_6.0
323699    ccbab00e99b27f83369d6cc89d914ad6_1.0
39515     bf27a4bf4ada4c756451703ea62a914f_3.0
180347    dad6f6ffc086caa89e2f40c28a9c7490_1.0
196655    e9bb9205eeed307ee7cbaa08bfd166c3_2.0
46531     bf27a4bf4ada4c756451703ea62a914f_4.0
181184    dad6f6ffc086caa89e2f40c28a9c7490_8.0
Name: course-module_id, dtype: object

How many unique modules are there in total?

In [29]:
# Number of distinct course-module ids; missing ids are not counted.
dataset["course-module_id"].nunique(dropna=True)

230

### Time to complete module - minimum

In [39]:
# For every module that has homework in more than one lesson, print the module
# id with the lowest and the highest lesson number that carries homework.
homework_rows_ = dataset.loc[dataset["is_homework"].eq(True)]

for module_id_, module_gr_ in homework_rows_.groupby("course-module_id"):
    lesson_numbers_ = module_gr_["lesson_number"]
    # Modules with a single homework lesson have no first/last span to report.
    if lesson_numbers_.nunique() <= 1:
        continue
    first_lesson_num = lesson_numbers_.min()
    last_lesson_num = lesson_numbers_.max()
    print(module_id_, first_lesson_num, last_lesson_num)

943306102e5b067d08a29094f37b8193_4.0 6.0 27.0
943306102e5b067d08a29094f37b8193_5.0 5.0 13.0
943306102e5b067d08a29094f37b8193_6.0 2.0 13.0
943306102e5b067d08a29094f37b8193_7.0 2.0 8.0
943306102e5b067d08a29094f37b8193_8.0 4.0 11.0
943306102e5b067d08a29094f37b8193_9.0 5.0 14.0
c9fcb746d51e41bc5a217002d081454b_4.0 3.0 5.0
dad6f6ffc086caa89e2f40c28a9c7490_9.0 7.0 8.0


### Time to complete module - maximum

### Time to complete module - average

### Time to complete module - median