# Evaluating OER materials available in OPAL

March 2022

In [1]:
import pandas as pd
import json
from pathlib import Path

## Data preparation

In [3]:
data_folder = "../data"
json_file_name = "content.json"
input_file = Path.cwd() / data_folder / json_file_name

with open(input_file) as json_data:
    raw_data = json.load(json_data)

In [4]:
ressource_types = {"files": "file", 
                   "learning_resources": "lr"}

oer_data_set = list()
for ressource_type in ressource_types:
    data = raw_data[ressource_type]
    print(f"{ressource_type} - {len(data)} entries")
    for entry in data:
        entry['type'] = ressource_types[ressource_type]
        oer_data_set.append(entry)

files - 11322 entries
learning_resources - 3828 entries


In [5]:
df = pd.DataFrame(oer_data_set)

## Data analysis

### Which parameters are available?

In [6]:
df[df.type=="file"].columns

Index(['filename', 'license', 'oer_permalink', 'title', 'comment', 'creator',
       'publisher', 'source', 'city', 'publicationMonth', 'publicationYear',
       'pages', 'language', 'url', 'act', 'appId', 'category', 'chapter',
       'duration', 'mediaType', 'nav1', 'nav2', 'nav3', 'series', 'type',
       'displayname', 'resourcetype', 'modulenumber', 'modulename',
       'moduleresponsible', 'authorinstitution', 'modulelink', 'moduleversion',
       'moduleects', 'moduleworkloadtotal', 'moduleworkloadpresence',
       'moduleworkloadprivate', 'modulemethods', 'moduleexam', 'modulelevel',
       'moduleduration', 'olatlinkinteresting'],
      dtype='object')

In [7]:
df[df.type=="lr"].columns

Index(['filename', 'license', 'oer_permalink', 'title', 'comment', 'creator',
       'publisher', 'source', 'city', 'publicationMonth', 'publicationYear',
       'pages', 'language', 'url', 'act', 'appId', 'category', 'chapter',
       'duration', 'mediaType', 'nav1', 'nav2', 'nav3', 'series', 'type',
       'displayname', 'resourcetype', 'modulenumber', 'modulename',
       'moduleresponsible', 'authorinstitution', 'modulelink', 'moduleversion',
       'moduleects', 'moduleworkloadtotal', 'moduleworkloadpresence',
       'moduleworkloadprivate', 'modulemethods', 'moduleexam', 'modulelevel',
       'moduleduration', 'olatlinkinteresting'],
      dtype='object')

### Who generates OER in Saxony :-)

In [8]:
df.groupby(['publisher']).oer_permalink.count().reset_index()

Unnamed: 0,publisher,oer_permalink
0,,11214
1,Bildungstechnologie,1
2,Denis Keiling,2
3,ESRI,1
4,"IMB, MF, TU Dresden",1
5,Institut Berufspädagogik,1
6,Institute of Metallurgy and Materials Science ...,1
7,Landesbetrieb Geobasisinformation und Vermessu...,1
8,Patricia Kaden,2
9,Prof. Dr. Nadine Bergner,1


### Which licenses are used for materials?

In [9]:
df.groupby(['license']).oer_permalink.count().reset_index()

Unnamed: 0,license,oer_permalink
0,,3764
1,CC BY 4.0 Int.,1499
2,CC BY-NC 4.0 Int.,1675
3,CC BY-NC-ND 4.0 Int.,3950
4,CC BY-NC-SA 4.0 Int.,977
5,CC BY-ND 4.0 Int.,143
6,CC BY-SA 4.0 Int.,2308
7,CC0 1.0 Universell,834


### Which types of files are contained in OER collection?

In [10]:
def getLastValue(aList):
    if  isinstance(aList, list):
        return aList[-1]
    else:
        return "unknown"

# Extract file extension
df['file_type'] = df.filename.str.split('.').apply(getLastValue)

# Normalize different writing formates for file extensions
df['file_type'] = df['file_type'].str.lower()

In [11]:
type_statistics = df[df.type == "file"].groupby(['file_type'])["file_type"]\
                                       .count() \
                                       .reset_index(name='count') \
                                       .sort_values(['count'], ascending=False) \
                                       .head(20)

type_statistics['ratio'] = type_statistics["count"] / type_statistics["count"].sum()

In [12]:
type_statistics

Unnamed: 0,file_type,count,ratio
82,pdf,5242,0.494995
55,jpg,1040,0.098206
65,mkv,873,0.082436
69,mp4,586,0.055335
84,png,494,0.046648
127,zip,443,0.041832
47,html,387,0.036544
23,docx,376,0.035505
88,pptx,245,0.023135
123,xlsx,191,0.018036


In [16]:
year_statistics = df[df.type == "file"].groupby(['publicationYear'])["file_type"]\
                                       .count() \
                                       .reset_index(name='count') \
                                       .sort_values(['count'], ascending=False) \
                                       .head(20)
year_statistics

Unnamed: 0,publicationMonth,count
0,,11216
9,6.0,20
7,4.0,17
3,11.0,14
8,5.0,12
10,7.0,10
2,10.0,8
1,1.0,6
5,2.0,5
6,3.0,5
