## Imports

In [1]:
import json
import os
from datetime import timedelta, datetime
from time import sleep
from sys import argv
import gensim
import gensim.corpora as corpora
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
import torch_geometric
from functools import reduce
from tqdm import tqdm
from Preprocessing import Preprocessor
preprocessor = Preprocessor(0)
tqdm.pandas()

  from imp import reload


## Check if data is already joined locally

In [2]:
joined_exists = os.path.exists('khan_joined.csv')
khan = pd.DataFrame([])
if joined_exists:
    khan = pd.read_csv('khan_joined.csv')
khan.head(5)

Unnamed: 0,course,unit,lesson,video_title,about,transcript,topic,transcript_cleaned,tokens,transcript_n_entries
0,computer,Intro to JS: Drawing & Animation,Intro to programming,What is Programming?,Programming is the process of creating a set o...,"Hi, welcome to programming! If you've never le...",Computing,"hi, welcome programming! never learned program...","['hi,', 'welcome', 'programming!', 'never', 'l...",183
1,computer,Intro to JS: Drawing & Animation,Coloring,The Power of the Docs,Created by Pamela Fox.,Voiceover: Ok so you've\r\nmade a few programs...,Computing,"voiceover: ok made programs, might wondering e...","['voiceover:', 'ok', 'made', 'programs,', 'mig...",542
2,computer,Intro to HTML/CSS: Making webpages,Further learning,HTML validation,Learn how to validate your webpages with the W...,"- [Voiceover] On Khan Academy, we pop up the o...",Computing,"- khan academy, pop oh noes guide tell somethi...","['-', 'khan', 'academy,', 'pop', 'oh', 'noes',...",172
3,computer,Intro to SQL: Querying and managing data,SQL basics,Welcome to SQL,SQL is useful for creating and querying relati...,- [Instructor] The world is full of data. Ever...,Computing,- world full data. every app use full data. kh...,"['-', 'world', 'full', 'data.', 'every', 'app'...",203
4,computer,Intro to SQL: Querying and managing data,SQL basics,S-Q-L or SEQUEL?,How is it pronounced? Why? Let's discuss...,"At this point, you've probably heard me\r\npro...",Computing,"point, probably heard pronounce sql two ways--...","['point,', 'probably', 'heard', 'pronounce', '...",129


## Load each domain separately if not already joined

In [3]:
if not joined_exists:
    computing = pd.read_csv("Datasets\\KhanAcademy\\Computing.csv")
    computing = computing.dropna()
    computing['topic'] = 'Computing'
    computing.info()

In [4]:
if not joined_exists:
    economics = pd.read_csv("Datasets\\KhanAcademy\\Economics.csv")
    economics = economics.dropna()
    economics['topic'] = 'Economics'
    economics.info()

In [5]:
if not joined_exists:
    humanities = pd.read_csv("Datasets\\KhanAcademy\\Humanities.csv")
    humanities = humanities.dropna()
    humanities['topic'] = 'Humanities'
    humanities.info()

In [6]:
if not joined_exists:
    math = pd.read_csv("Datasets\\KhanAcademy\\Math.csv")
    math = math.dropna()
    math['topic'] = 'Math'
    math.info()

In [7]:
if not joined_exists:
    science = pd.read_csv("Datasets\\KhanAcademy\\Science.csv")
    science = science.dropna()
    science['topic'] = 'Science'
    science.info()

## Display the unique courses for each domain

In [8]:
if not joined_exists:
    computing['course'].value_counts()

In [9]:
if not joined_exists:
    economics['course'].value_counts()

In [10]:
if not joined_exists:
    humanities['course'].value_counts()

In [11]:
if not joined_exists:
    math['course'].value_counts()

In [12]:
if not joined_exists:
    science['course'].value_counts()

## Join the domains into one file

In [13]:
if not joined_exists:
    khan_dfs = [computing, economics, humanities, math, science]
    khan = pd.concat(khan_dfs, axis=0)
    khan.info()

## Clean up some of the course so they can be used as ground truth labels

In [14]:
# if not joined_exists:
# remap the courses to more broad categories: https://stackoverflow.com/a/16476974
labels = ['physics', 'chemistry', 'biology', 'algebra', 'geometry', 'statistics', 'calculus', 'history', 'economics', 'computer']
for lbl in labels:
    # print(lbl)
    for index, row in khan.iterrows():
        if lbl in row['course'].lower():
            row['course'] = lbl
khan['course'].value_counts()

algebra                                         1287
calculus                                         947
chemistry                                        909
history                                          769
biology                                          757
physics                                          743
statistics                                       626
economics                                        502
geometry                                         345
Finance and capital markets                      317
Electrical engineering                           198
Pixar in a Box                                   175
US government and civics                         139
Cosmology and astronomy                           89
AP®︎/College US Government and Politics           87
Trigonometry                                      76
computer                                          75
Differential equations                            70
Storytelling                                  

In [15]:
khan.tail(2)

Unnamed: 0,course,unit,lesson,video_title,about,transcript,topic,transcript_cleaned,tokens,transcript_n_entries
8259,physics,Review for AP Physics 1 exam,AP Physics 1 free response questions 2015,Question 4: 2015 AP Physics 1 free response,Identical spheres falling from the same height...,- [Voiceover] Two identical\r\nspheres are rel...,Science,- two identical spheres released device time e...,"['-', 'two', 'identical', 'spheres', 'released...",673
8260,physics,Review for AP Physics 1 exam,AP Physics 1 free response questions 2015,Question 5: 2015 AP Physics 1 free response,Fundamental frequencies (first harmonics) of s...,- [Voiceover] The figure\r\nabove shows a stri...,Science,- figure shows string one end attached oscilla...,"['-', 'figure', 'shows', 'string', 'one', 'end...",901


## Clean up the transcripts using the preprocessor

In [16]:
if not joined_exists:
    # It takes 21 min to run SpaCy preprocessing over each record in the data
    khan['transcript_cleaned'] = khan['transcript'].progress_apply(lambda x: preprocessor.clean(x, fast=True))
    print(khan['transcript_cleaned'][0])

## Tokenize the transcript using the preprocessor

In [17]:
if not joined_exists:
    # clear up the lists to be unique only.
    khan['tokens'] = khan['transcript'].progress_apply(lambda x: preprocessor.clean(x, tokenize=True, fast=True))
    print(len(khan['tokens'][0]))
    print(khan['tokens'][0])

In [18]:
if not joined_exists:
# if not False:
    khan['transcript_n_entries'] = khan['tokens'].progress_apply(lambda x: len(x))
    khan.head(5)

In [19]:
khan.head(1)

Unnamed: 0,course,unit,lesson,video_title,about,transcript,topic,transcript_cleaned,tokens,transcript_n_entries
0,computer,Intro to JS: Drawing & Animation,Intro to programming,What is Programming?,Programming is the process of creating a set o...,"Hi, welcome to programming! If you've never le...",Computing,"hi, welcome programming! never learned program...","['hi,', 'welcome', 'programming!', 'never', 'l...",183


## Save the results to a local file to speed up rerunning further documents

In [20]:
if not joined_exists:
# if not False:
    khan.to_csv("khan_joined.csv", index=False)

## Train a simple unsupervised LDA model on the transcripts

In [21]:
num_topics = len(khan['course'].value_counts())
print(num_topics)

22


In [22]:
print(type(khan['tokens'][0]))

<class 'str'>


In [23]:
%%time
# LDA Normal
# temp = [d.split() for d in corpus]
temp = [doc.split() for doc in khan['tokens']]
print(type(temp))
words = corpora.Dictionary(temp)
words.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
corpus = [words.doc2bow(doc) for doc in temp]

<class 'list'>
CPU times: total: 4.34 s
Wall time: 4.34 s


In [24]:
%%time
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=num_topics,
                                           random_state=2,
                                           update_every=1,
                                           passes=15,
                                           alpha='auto')

CPU times: total: 1min 50s
Wall time: 1min 44s


## Prepare a visualization of the LDA model for investigation

In [25]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, words, mds='mmds', R=10)

  default_term_info = default_term_info.sort_values(


In [26]:
vis

## Run a simple prediction as a test

In [28]:
new_text_corpus = words.doc2bow(khan['transcript'][0].split())
print(len(new_text_corpus))
prediction = lda_model.get_document_topics(new_text_corpus)
prediction.sort(key = lambda x: x[1], reverse = True)
print(prediction)
prediction = prediction[0][0]
print("Predicted Topic: %d" % prediction)
lda_model.show_topic(prediction)

0
[(2, 0.098402314), (21, 0.0809694), (18, 0.075872764), (17, 0.073218636), (15, 0.06998841), (3, 0.06832001), (5, 0.056444004), (4, 0.053440616), (6, 0.04724776), (12, 0.042431854), (9, 0.040600665), (19, 0.03880761), (11, 0.03551189), (20, 0.03448599), (7, 0.03177102), (16, 0.028411347), (1, 0.02583002), (0, 0.0225086), (14, 0.021515336), (10, 0.02058558), (13, 0.018438345), (8, 0.015197829)]
Predicted Topic: 2


[("'maybe',", 0.02533247),
 ("'kind',", 0.01516013),
 ("'say,',", 0.012839129),
 ("'things',", 0.0116827525),
 ("'lot',", 0.009117131),
 ("'is,',", 0.008418298),
 ("'know,',", 0.008062178),
 ("'whole',", 0.007447749),
 ("'essentially',", 0.0072026076),
 ("'it,',", 0.006912621)]