In [1]:
# !nvidia-smi

# Add this in a Google Colab cell to install the correct version of PyTorch Geometric.
import torch

def format_pytorch_version(version):
  """Return the release part of a PyTorch version string.

  Everything from the first ``+`` onwards (for example a ``+cu116`` build
  tag) is discarded. A string without ``+`` is returned unchanged.
  """
  release, _, _ = version.partition('+')
  return release

# Full version string reported by the installed torch package.
TORCH_version = torch.__version__
# Same version with any '+<build>' suffix removed by format_pytorch_version.
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Turn a CUDA version such as ``'11.6'`` into a tag such as ``'cu116'``.

  Args:
    version: CUDA version string, or ``None`` when the PyTorch build has no
      CUDA support (``torch.version.cuda`` is ``None`` on CPU-only builds).

  Returns:
    ``'cu'`` followed by the version with its dots removed, or ``'cpu'``
    when ``version`` is ``None``.
  """
  # Without this guard a CPU-only install fails with AttributeError on
  # ``None.replace``.
  if version is None:
    return 'cpu'
  return 'cu' + version.replace('.', '')

# CUDA version value read from the installed torch package.
CUDA_version = torch.version.cuda
# NOTE(review): the tag computation below is left disabled, presumably because
# CUDA_version is not always a string (e.g. a CPU-only build) - TODO confirm.
# CUDA = format_cuda_version(CUDA_version)

In [6]:
import json
import os
# import uuid  # https://docs.python.org/3/library/uuid.html
# import structlog  # for event logging
# # from dotenv import load_dotenv # enviornment vars if we want

# from pygtail import Pygtail
# import boto3
# from minio import Minio
# from dotenv import load_dotenv
from datetime import timedelta, datetime
from time import sleep
from sys import argv
# import threading
# from smart_open import smart_open
import gensim
import gensim.corpora as corpora

# setup to import the preprocessor
import sys
# from botocore.errorfactory import ClientError
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
import torch_geometric
from functools import reduce
from tqdm import tqdm

from Preprocessing import Preprocessor
preprocessor = Preprocessor(0)
tqdm.pandas()


In [None]:
# Reuse the joined Khan Academy frame cached by an earlier run when the file
# is present; otherwise start from an empty frame that later cells fill in.
joined_exists = os.path.exists('khan_joined.csv')
khan = pd.read_csv('khan_joined.csv') if joined_exists else pd.DataFrame([])
khan.head(5)

In [None]:
if not joined_exists:
    # Build the path with os.path.join so the load works with the host's
    # separator (Linux/macOS, e.g. Colab) instead of only on Windows.
    computing = pd.read_csv(os.path.join("Datasets", "KhanAcademy", "Computing.csv"))
    # Discard rows that have any missing field.
    computing = computing.dropna()
    computing.info()

In [None]:
if not joined_exists:
    # Build the path with os.path.join so the load works with the host's
    # separator (Linux/macOS, e.g. Colab) instead of only on Windows.
    economics = pd.read_csv(os.path.join("Datasets", "KhanAcademy", "Economics.csv"))
    # Discard rows that have any missing field.
    economics = economics.dropna()
    economics.info()

In [None]:
if not joined_exists:
    # Build the path with os.path.join so the load works with the host's
    # separator (Linux/macOS, e.g. Colab) instead of only on Windows.
    humanities = pd.read_csv(os.path.join("Datasets", "KhanAcademy", "Humanities.csv"))
    # Discard rows that have any missing field.
    humanities = humanities.dropna()
    humanities.info()

In [None]:
if not joined_exists:
    # Build the path with os.path.join so the load works with the host's
    # separator (Linux/macOS, e.g. Colab) instead of only on Windows.
    math = pd.read_csv(os.path.join("Datasets", "KhanAcademy", "Math.csv"))
    # Discard rows that have any missing field.
    math = math.dropna()
    math.info()

In [None]:
if not joined_exists:
    # Build the path with os.path.join so the load works with the host's
    # separator (Linux/macOS, e.g. Colab) instead of only on Windows.
    science = pd.read_csv(os.path.join("Datasets", "KhanAcademy", "Science.csv"))
    # Discard rows that have any missing field.
    science = science.dropna()
    science.info()

In [None]:
# Build the paths with os.path.join so the loads work with the host's
# separator (Linux/macOS, e.g. Colab) instead of only on Windows.
ted_main = pd.read_csv(os.path.join("Datasets", "TEDTalksDataset", "ted_main.csv"))
transcripts = pd.read_csv(os.path.join("Datasets", "TEDTalksDataset", "transcripts.csv"))
# Pair each talk with its transcript by URL. DataFrame.join aligns on the row
# index, which only pairs the right rows if both files are in the same order
# with no gaps, and identical lsuffix/rsuffix values give the overlapping
# column the same name twice.
# NOTE(review): assumes both CSVs carry a `url` column identifying the talk
# (the original 'url' suffixes imply that column overlaps) - confirm against
# the file headers.
validation = ted_main.merge(transcripts, on='url', how='inner')
# Drop incomplete rows, then renumber so the first remaining row has label 0.
validation = validation.dropna().reset_index(drop=True)
validation.info(verbose=True, show_counts=True)

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(computing['course'].value_counts())

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(economics['course'].value_counts())

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(humanities['course'].value_counts())

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(math['course'].value_counts())

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(science['course'].value_counts())

In [None]:
if not joined_exists:
    # Stack the five subject frames into one.
    khan_dfs = [computing, economics, humanities, math, science]
    # ignore_index=True renumbers the rows 0..n-1. Without it each source
    # frame keeps its own labels, so a label such as 0 occurs several times and
    # a later `khan[col][0]` lookup returns several rows instead of one. It
    # also matches the index produced when the cached CSV is read back.
    khan = pd.concat(khan_dfs, axis=0, ignore_index=True)
    khan.info()

In [None]:
if not joined_exists:
    # remap the courses to more broad categories: https://stackoverflow.com/a/16476974
    labels = ['physics', 'chemistry', 'biology', 'algebra', 'geometry', 'statistics', 'calculus', 'history', 'macroeconomics', 'microeconomics']
    for lbl in labels:
        # Rows yielded by iterrows() are copies, so assigning to them never
        # changes the frame. Write through a boolean mask with .loc instead.
        # The mask is recomputed on every pass so a course already collapsed
        # to an earlier label is tested against its new value, as in the
        # row-by-row version.
        matches = khan['course'].str.lower().str.contains(lbl, regex=False, na=False)
        # A plain array keeps the mask positional, so repeated index labels
        # cannot interfere with the assignment.
        khan.loc[matches.to_numpy(), 'course'] = lbl
    # Inside an `if` block the result must be passed to display() to be shown.
    display(khan['course'].value_counts())

In [None]:
if not joined_exists:
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(khan.tail(2))

# Dataset Identification
The next step is to identify the components of the dataset that can be used for graph machine learning: nodes, edges, node features, and labels. Edge weights and edge features can optionally be included as well; for the sake of simplicity I will forgo them.

I am essentially attempting topic modeling, but without the keywords and topics that are customary in Latent Dirichlet Allocation. Instead, the goal is to train a graph ML model on the Khan Academy data.

There are two basic routes I could take. I could perform node-level prediction by treating each transcript individually and seeing which one is the closest match at prediction time. The other option is graph-level prediction: store all of the similarly labeled transcripts together and use the shape of the resulting graph for comparison.

- Nodes (Items, People, Locations, Cars, etc.)
- Edges (Connections, Interactions, Similarity, etc.)
    - Levenshtein distance over titles?
    - Number of similar named entities from SpaCy NER?
- Node Features (Attributes)
- Labels (Node-Level, Edge-Level, Graph-Level, etc)
    - I am going to first try node-level prediction as it makes more sense to me. For this I am going to use the 'course' feature in the above pandas dataframe. This will be the target that I try to predict.

In [None]:
if not joined_exists:
    # It takes 21 min to run SpaCy preprocessing over each record in the data
    khan['transcript_cleaned'] = khan['transcript'].progress_apply(preprocessor.clean)
    # Show the first row by position. A label lookup (`[0]`) returns every
    # row labelled 0 when index labels repeat.
    print(khan['transcript_cleaned'].iloc[0])

In [None]:
def str_to_list(s: str) -> list:
    """Split the text form of a list of strings back into its items.

    Every ``[``, ``]`` and single-quote character is removed, and the
    remainder is split on ``', '``.

    Args:
        s: Text such as ``"['a', 'b']"``.

    Returns:
        The items as a list of strings. Input that is empty once the
        brackets and quotes are removed (``""`` or ``"[]"``) yields ``[]``
        rather than a list holding one empty string.
    """
    stripped = s.replace('[', '').replace(']', '').replace('\'', '')
    # ''.split(', ') is [''], which would count as one (empty) token.
    if not stripped:
        return []
    return stripped.split(', ')

In [None]:
if not joined_exists:
    # clear up the lists to be unique only.
    # NOTE(review): this reads the raw 'transcript' column, so it replaces the
    # 'transcript_cleaned' values written by the preprocessing cell - confirm
    # that bypassing the cleaned text is intended.
    # dict.fromkeys drops repeats while keeping first-seen order, so the
    # column (and the CSV saved from it) is the same on every run. The order
    # of list(set(...)) can change between kernel sessions.
    khan['transcript_cleaned'] = khan['transcript'].progress_apply(lambda x: list(dict.fromkeys(str_to_list(x))))
    # Inspect the first row by position. A label lookup (`[0]`) returns every
    # row labelled 0 when index labels repeat.
    print(len(khan['transcript_cleaned'].iloc[0]))
    print(khan['transcript_cleaned'].iloc[0])

In [None]:
if not joined_exists:
    # Number of entries in each row's list.
    khan['transcript_n_entries'] = khan['transcript_cleaned'].progress_apply(len)
    # The notebook only auto-renders a top-level trailing expression; inside
    # an `if` block the result must be passed to display() to be shown.
    display(khan.head(5))

In [None]:
# khan = khan.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])

In [None]:
if not joined_exists:
    # Cache the assembled frame so later runs can load it instead of
    # rebuilding it; the row index is not written.
    khan.to_csv(path_or_buf="khan_joined.csv", index=False)

In [None]:
# https://www.kaggle.com/code/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk/notebook

"""
    nodes closer to yourself are more important. Take the incoming embeddings and dotproduct them with your own embedding.
    Softmax across all the dot products of the nodes coming in to have it be a probability.
    
    Need to build the corpus of all input words.
    
    Use spacy to get corpus for each doc with stopwords stripped and bigrams / trigrams joined. 
    
    Could use skipgram or bag of words with word2vec to embed words.
"""

In [None]:
# Preview the first five rows of the Khan Academy frame.
khan.head(5)

In [None]:
# Number of rows per course value.
khan['course'].value_counts()

In [None]:
# khan['transcript_cleaned'] = khan['transcript']

In [None]:
# Flatten every row's entries into one list.
# Rows hold real lists when computed in this session and the text form of a
# list when loaded from the cached CSV, so text rows are parsed with
# str_to_list. Each row is parsed on its own: concatenating the row strings
# first would butt "...last']" against "['first..." and fuse the two
# neighbouring entries into one once brackets and quotes are removed.
corpus = [
    token
    for doc in khan['transcript_cleaned']
    for token in (str_to_list(doc) if isinstance(doc, str) else doc)
]
print(corpus[0: 25])
print(len(corpus))
print(type(corpus))

In [None]:
# Reduce the corpus to its distinct entries, since it should hold no
# duplicate values. Sorting fixes the order: iterating a set of strings can
# give a different order in every kernel session, which would change the
# document order seen by later cells from run to run.
corpus = sorted(set(corpus))
print(len(corpus))
print(corpus[0:25])

In [None]:
# One topic per distinct course value in the Khan Academy frame.
num_topics = khan['course'].nunique()
print(num_topics)

In [None]:
%%time
# LDA Normal
# Split every corpus entry on whitespace, build the gensim dictionary from
# the result, trim it with filter_extremes, and convert each entry to
# bag-of-words form.
# NOTE(review): `corpus` is rebound from strings to doc2bow output here, so
# re-running this cell without rebuilding `corpus` first will fail on
# `.split` - verify before re-executing.
temp = list(map(str.split, corpus))
print(type(temp))
words = corpora.Dictionary(temp)
words.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
corpus = list(map(words.doc2bow, temp))

In [None]:
%%time
# Fit the LDA model on the bag-of-words corpus with one topic per course.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=words,
    num_topics=num_topics,
    random_state=2,  # fixed seed
    update_every=1,
    passes=15,
    alpha='auto',
)

In [None]:
# Prepare the pyLDAvis data for the fitted model.
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    words,
    mds='mmds',
    R=10,
)

In [None]:
# Render the prepared pyLDAvis object as the cell output.
vis

In [None]:
# Preview the first five rows of the TED validation frame.
validation.head(5)

In [None]:
# Score one TED transcript against the fitted LDA model.
# Take the first remaining row by position. A label lookup (`[0]`) fails if
# the row labelled 0 was removed by dropna().
sample_transcript = validation['transcript'].iloc[0]
new_text_corpus = words.doc2bow(sample_transcript.split())
print(len(new_text_corpus))
# (topic id, probability) pairs, most probable topic first.
topic_scores = sorted(
    lda_model.get_document_topics(new_text_corpus),
    key=lambda pair: pair[1],
    reverse=True,
)
print(topic_scores)
prediction = topic_scores[0][0]
print("Predicted Topic: %d" % prediction)
# Show the terms of the topic that was actually predicted, not a fixed id.
lda_model.show_topic(prediction)