# Application of doc2vec for edX MOOCs dataset


## Introduction

This is the first part of the analysis applied to the corpus of edX MOOCs.

We trained a doc2vec model on the corpus collected using https://github.com/TokyoTechX/web-crawler.

## Loading the corpus

We specify the directory where the textual data of 285 courses is stored. 

Each MOOC contains textual data from its html, video transcript and assessment components.

In [1]:
# Root directory of the crawled corpus; every entry inside it is treated as one course folder.
# NOTE: machine-specific absolute path - point it at your local copy of the crawler output.
foldername = "/home/zarina/Documents/OEDO/web-crawler/HTMLs"

## Preprocessing

The data was preprocessed so that all the punctuation, numerical symbols and stop words were removed.

In [2]:
import os,re,json
import codecs,string
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize,wordpunct_tokenize
from collections import Counter 
from nltk.corpus import stopwords

def course_list(foldername,comp_type_code):
    """Return the selected component file names and the path of every course.

    Args:
        foldername: Directory whose entries are the individual course folders.
        comp_type_code: Three-character flag string passed on to
            ``comp_type_selection``.

    Returns:
        A tuple ``(selected_comp_list, course_path)`` where ``course_path``
        holds ``foldername`` joined with each directory entry, in the order
        ``os.listdir`` reports them.
    """
    entries = os.listdir(foldername)
    selected_comp_list = comp_type_selection(comp_type_code)
    # Echo the selection so the notebook shows which components are in use.
    print(selected_comp_list)
    course_path = [os.path.join(foldername, entry) for entry in entries]

    return selected_comp_list, course_path

def comp_type_selection(code):
    """Translate a three-character flag string into component file names.

    Position 0 selects the text components, position 1 the video components
    and position 2 the problem components; a position is active when it
    holds the character ``'1'``.

    Args:
        code: Indexable sequence with at least three entries, e.g. ``'110'``.

    Returns:
        The JSON file names of the active components, in text/video/problem
        order.
    """
    flag_to_file = (
        (0, 'all_textcomp.json'),
        (1, 'all_videocomp.json'),
        (2, 'all_probcomp.json'),
    )
    return [filename for position, filename in flag_to_file if code[position] == '1']

def text_preprocessing(raw,code):
    """Tokenize ``raw`` and filter the tokens according to ``code``.

    URLs are removed first, the remainder is split with
    ``wordpunct_tokenize`` and every token is lower-cased. Single-character
    tokens are always dropped.

    Args:
        raw: Text to clean.
        code: Three-character flag string:
            ``code[0] == '1'`` keeps alphabetic tokens only, any other value
            keeps alphanumeric tokens;
            ``code[1] == '1'`` drops English stop words;
            ``code[2] == '1'`` applies Porter stemming.

    Returns:
        List of the surviving (optionally stemmed) lower-case tokens, in
        their original order.
    """
    # Strip URLs up front so their fragments do not end up as tokens.
    raw = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",'', raw)
    tokens = wordpunct_tokenize(raw)
    # NLTK hands back a list; a frozenset makes the per-token membership
    # test O(1) instead of a linear scan.
    stopword_set = frozenset(stopwords.words('english'))
    stemmer = PorterStemmer()
    list_of_words = []
    for token in tokens:
        word = token.lower()

        if len(word) == 1:
            continue

        # First flag: alphabetic-only vs. alphanumeric tokens.
        is_valid = word.isalpha() if code[0] == '1' else word.isalnum()
        if not is_valid:
            continue

        # Stop words are matched on the lower-cased, unstemmed form.
        if code[1] == '1' and word in stopword_set:
            continue

        if code[2] == '1':
            word = stemmer.stem(word)

        list_of_words.append(word)

    return list_of_words
    
def extract_text_from_component(foldername,comp_type_code,processing_code):
    """Read the selected component files of every course and clean their text.

    Args:
        foldername: Directory containing one sub-folder per course.
        comp_type_code: Flag string choosing the component files
            (forwarded to ``course_list``).
        processing_code: Flag string controlling token filtering
            (forwarded to ``text_preprocessing``).

    Returns:
        A tuple ``(course_name_set, clean_text_set, raw_text_set)``:
        the list of course folder names, a dict mapping each course name to
        the concatenated list of cleaned tokens, and a dict mapping each
        course name to the list of raw ``'content'`` strings.

    Raises:
        FileNotFoundError: If a course folder lacks one of the selected
            component files.
        KeyError: If a component entry lacks the expected
            ``'transcript_en'`` / ``'content'`` key.
    """
    selected_comp_types, all_course_paths = course_list(foldername,comp_type_code)
    clean_text_set = {}
    raw_text_set = {}
    course_name_set = []
    for course_path in all_course_paths:
        course_name = os.path.basename(course_path)
        all_text = []
        all_raw_text = []
        for comp_type in selected_comp_types:
            is_video = comp_type == 'all_videocomp.json'
            with open(os.path.join(course_path,comp_type),'r',encoding='utf-8') as file:
                components = json.load(file)

            for component in components.values():
                if is_video:
                    raw_txt = component['transcript_en']
                    # A dict here is skipped instead of being treated as text
                    # (presumably a placeholder for a missing transcript - TODO confirm).
                    if isinstance(raw_txt, dict):
                        continue
                    # NOTE(review): video transcripts are not added to
                    # all_raw_text, so raw_text_set only covers 'content'
                    # components. Kept as in the original; confirm intent.
                else:
                    raw_txt = component['content']
                    all_raw_text.append(raw_txt)
                all_text += text_preprocessing(raw_txt,processing_code)

        clean_text_set[course_name] = all_text
        raw_text_set[course_name] = all_raw_text
        course_name_set.append(course_name)

    return course_name_set,clean_text_set,raw_text_set

In [3]:
# Component selection, one flag character per component type ('1' = include):
#   1st = TEXTCOMP, 2nd = VIDEOCOMP, 3rd = PROBCOMP  (e.g. '100' = text components only)
comp_type_code = '110'
# Preprocessing options:
#   1st = alphabetic tokens only ('1') or alphanumeric tokens ('0'),
#   2nd = filter stop words ('1'), 3rd = apply stemming ('1')
processing_code = '110'

course_name_set, clean_text_set, raw_text_set = extract_text_from_component(
    foldername, comp_type_code, processing_code
)

['all_textcomp.json', 'all_videocomp.json']


Loading the course labels (categories) from csv file. The labels are used for the classification task later.

In [4]:
import pandas as pd

# Column names expected in the label file.
colnames = ['course', 'subject']
df = pd.read_csv('test.csv')
# Map each course title to its subject label; on duplicate titles the last row wins.
csv_map = dict(zip(df['course'], df['subject']))

Load the text data into memory.

In [57]:
from gensim.models.doc2vec import TaggedDocument

all_courses = []
course_to_cat = {}
course_cnt = 0
categories = set()

for cur_name in course_name_set:
    # Folder names use '_' for spaces and '-_' after a dash; undo that so the
    # name matches the course titles in the label file.
    cur_name_csv = cur_name.replace('-_', '-').replace('_', ' ')
    if cur_name_csv not in csv_map:
        # Courses without a label are left out of the training set.
        continue

    if course_cnt == 0:
        print (cur_name)

    cat = csv_map[cur_name_csv]
    categories.add(cat)
    course_to_cat[cur_name] = cat

    # Each course becomes one document, tagged with its folder name.
    doc_i = TaggedDocument(clean_text_set[cur_name], [cur_name])
    all_courses.append(doc_i)
    course_cnt += 1

# input to doc2vec (a shallow copy of the document list)
doc_list = list(all_courses)

Global_Social_Change


## Setting up the doc2vec model

We use two models:
* doc2vec based on DBOW (dm=0)
* doc2vec based on DM (dm=1)
    
with a document vector length of 200 and a context window size of 8.

In [17]:
import gensim
from gensim.models import Doc2Vec
import multiprocessing

cores = multiprocessing.cpu_count()

assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

models = [
    # PV-DBOW 
    Doc2Vec(dm=0, dbow_words=1, vector_size=200, window=8, min_count=19, epochs=20, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=19, epochs =20, workers=cores),
]

models[0].build_vocab(doc_list)
print(str(models[0]))
models[1].reset_from(models[0])
print(str(models[1]))
for model in models:
    %%time model.train(doc_list, total_examples=model.corpus_count, epochs=model.epochs)

Doc2Vec(dbow+w,d200,n5,w8,mc19,s0.001,t8)
Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t8)
CPU times: user 25min 34s, sys: 1.04 s, total: 25min 35s
Wall time: 3min 20s
CPU times: user 4min 58s, sys: 488 ms, total: 4min 59s
Wall time: 41.7 s


## Calculating similarity between MOOCs

In [45]:
# checks: list the nearest courses to one sample course under each model
# NOTE(review): the sample is taken from course_name_set, which may include
# courses that were not tagged for training - confirm it has a label.
query_course = course_name_set[1]
for model in models:
    print (model, '\n')
    sims = model.docvecs.most_similar(query_course)
    print ("Most similar courses to ", query_course, " are: \n",  sims, '\n')


Doc2Vec(dbow+w,d200,n5,w8,mc19,s0.001,t8) 

Most similar courses to  Compliance_in_Office_365-_Data_Governance  are: 
 [('Compliance_in_Office_365-_eDiscovery', 0.6079649925231934), ('Microsoft_SharePoint_2016-_Search_and_Content_Management', 0.4997135102748871), ('Office_365-_SharePoint_Online_Administrator', 0.496964693069458), ('Provisioning_Office_365_Services', 0.469723641872406), ('Microsoft_SharePoint_2016-_Authentication_and_Security', 0.46946612000465393), ('Microsoft_SharePoint_2016-_Workload_Optimization', 0.46710318326950073), ('Microsoft_SharePoint_2016-_Infrastructure', 0.43522870540618896), ('Microsoft_SharePoint_Online_for_Site_Administrators', 0.4127456545829773), ('Configuring_SharePoint_Hybrid', 0.393161416053772), ('Microsoft_SharePoint_2016-_Productivity_Solutions', 0.39244675636291504)] 

Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t8) 

Most similar courses to  Compliance_in_Office_365-_Data_Governance  are: 
 [('Compliance_in_Office_365-_eDiscovery', 0.6992765665054321)

We can also compare similarities between words in the corpus.

In [53]:
# Use the in-memory PV-DBOW model (trained with dbow_words=1, so it has word
# vectors). `model0` is only assigned by the loading cell at the end of the
# notebook, so referring to it here fails on a top-to-bottom run.
print (models[0].wv['java'][0:4])
print (models[0].wv['python'][0:4])
models[0].wv.similarity('java', 'python')

[ 0.20589416  0.49491093 -0.11974067 -0.30048046]
[ 0.33363882  0.55245304  0.13124876 -0.10792442]


0.54820123721292613

## Saving and loading the trained models for fast inference

In [47]:
# Persist every model under the name "model<position>" in the working directory.
for idx, trained in enumerate(models):
    trained.save("model" + str(idx))

A loaded model can be retrained later.

In [56]:
# Reload the first model from the file written by models[0].save("model0").
model0 = Doc2Vec.load("model0")

For more information on gensim doc2vec, see https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md