In [1]:
import pandas as pd
import json
import pandas_profiling
import os
from collections import defaultdict
os.chdir('../../')

In [None]:
def generate_final_report(df, version='1.0'):
    """Profile ``df`` with pandas_profiling and save the report as HTML.

    The report is written to ``reports/<version>-courses.html``, relative to
    the current working directory.

    Args:
        df: Object passed unchanged to ``pandas_profiling.ProfileReport``.
        version: Prefix of the output file name. It is formatted with
            ``str.format``, so non-string values (e.g. the float ``1.0``)
            are accepted as well as strings.
    """
    report_dir = "reports"
    report_path = os.path.join(report_dir, "{}-courses.html".format(version))
    processed_report = pandas_profiling.ProfileReport(df)
    # Make sure the target directory exists so the write below does not fail
    # on a checkout where reports/ has not been created yet.
    os.makedirs(report_dir, exist_ok=True)
    processed_report.to_file(report_path)

In [9]:
FILE_PATH = "data/raw/courses_new.json"
# Read through a context manager so the file handle is closed as soon as the
# JSON has been parsed.
with open(FILE_PATH, encoding="utf-8") as courses_file:
    data = json.load(courses_file)
# Fields copied from every course record into the frame.
heading = ("title", "instructors", "text", "mooc_list_url",
           "start_date", "university_url", "university", "provider",
           "language", "subtitles", "short_description",
           "average_rating", "votes", "categories", "tags")
# Column name -> list of values, one entry per course. Every heading is a key
# even when `data` is empty, so the resulting frame always has all columns.
# A course that lacks one of the headings still raises KeyError.
dict_data = {head: [course[head] for course in data] for head in heading}
dataframe = pd.DataFrame(data=dict_data)
dataframe.head()

Unnamed: 0,average_rating,categories,instructors,language,mooc_list_url,provider,short_description,start_date,subtitles,tags,text,title,university,university_url,votes
0,4.0,Sci: Chemistry,Allison Soult; Kim Woodrum,English,https://www.mooc-list.com/course/chemistry-1-e...,Canvas Network,Many students leave high school unprepared for...,Jun 20th 2016,,Chemistry; Chemical Elements; Compounds; Atomi...,Instructors Allison Soult and Kim Woodrum brin...,Chemistry 1: Elements and Compounds and their...,University of Kentucky,https://www.mooc-list.com/university-entity/un...,3
1,10.0,"Art, Architecture & Design",Corey D'Augustine,English,https://www.mooc-list.com/course/studio-postwa...,Coursera,Want to know how some of the 20th century’s mo...,Jan 29th 2018,,Art; Abstract Painting; Willem de Kooning; Yay...,Through studio demonstrations and gallery walk...,In the Studio: Postwar Abstract Painting (Cour...,The Museum of Modern Art,https://www.mooc-list.com/university-entity/mu...,1
2,10.0,Humanities; Social Sciences,Alyssa Goodman,English,https://www.mooc-list.com/course/predictionx-d...,EdX,This course is an overview of divination syste...,Self Paced,English,Prediction; Prediction Systems; Culture; Histo...,"Join us for this one-week, immersive learning ...",PredictionX: Diviner’s Guide (edX),Harvard University,https://www.mooc-list.com/university-entity/ha...,1
3,8.3,"Art, Architecture & Design; Humanities",Beverly Mayne Kienzle,English,https://www.mooc-list.com/course/monasteries-s...,EdX,Get an introduction to European handwriting of...,Self Paced,English,Monasteries; Gothic Script; Manuscripts; Trans...,Thousands upon thousands of manuscripts writte...,"Monasteries, Schools, and Notaries, Part 2: In...",Harvard University,https://www.mooc-list.com/university-entity/ha...,3
4,10.0,Education; Teacher Professional Development,Various Instructors,English,https://www.mooc-list.com/course/design-and-de...,Canvas Network,This course introduces Blended Learning to the...,Mar 23rd 2015,,Blended Learning; Education; Learning,There are 4 primary learning units of about 30...,Design and Deliver Blended Learning (Canvas net),Institute for Adult Learning,https://www.mooc-list.com/university-entity/in...,1


In [10]:
# Remove rows that are exact duplicates, keeping the first occurrence.
dedup_dataframe = dataframe.drop_duplicates(keep='first')
# To profile this stage, run: generate_final_report(dedup_dataframe)

In [11]:
# Keep only the courses whose language field is exactly 'English'.
is_english = dedup_dataframe['language'] == 'English'
english_dataframe = dedup_dataframe[is_english]
# To profile this stage, run: generate_final_report(english_dataframe)

In [12]:
# Produce one row per (course, category) pair.
# Split the '; '-separated category string into columns, then stack the
# columns into a single long Series indexed by (row label, position).
split_categories = english_dataframe['categories'].str.split('; ', expand=True)
stacked_categories = split_categories.stack()
# The outer index level holds the original row label, repeated once per category.
source_row_labels = stacked_categories.index.get_level_values(0)
unrolled_categories_dataframe = english_dataframe.loc[source_row_labels].copy()
unrolled_categories_dataframe['categories'] = stacked_categories.values
unrolled_categories_dataframe['categories'].unique()

array(['Sci: Chemistry', 'Art, Architecture & Design', 'Humanities',
       'Social Sciences', 'Education', 'Teacher Professional Development',
       'CS: Software Engineering', 'Engineering', 'Languages & Literature',
       'Management & Leadership', 'CS: Systems, Security & Networking',
       'CS: Artificial Intelligence, Robotics & Computer Vision',
       'Personal and Professional Development', 'Health & Society',
       'Business', 'Statistics & Data Analysis', 'CS: Programming',
       'Music, Film & Audio', 'Sci: Mathematics',
       'Sci: Biology & Life Sciences', 'Food & Nutrition', 'Law',
       'Sci: Energy & Earth Sciences', 'Sci: Environment',
       'CS: Information & Technology', 'Medicine & Pharmacology',
       'Sci: Physics', 'Sports', 'Marketing & Communication',
       'Sci: Physical & Earth Sciences', 'Economics & Finance',
       'CS: Theory', 'Art and Culture', 'History', 'Science',
       'Data Science', 'CS: Design & Product', 'Eng: Electronics',
       'Co

In [None]:
# Profile the one-row-per-category frame under the default report version.
generate_final_report(unrolled_categories_dataframe, version='1.0')

In [None]:
# Group the unrolled courses by their single category value.
grouped_dataframe = unrolled_categories_dataframe.groupby(by='categories')

In [None]:
def _group_to_records_json(group):
    """Serialize one category's rows as a JSON array string (orient='records')."""
    return group.to_json(orient='records')


# One JSON string per category; the outer to_json call then writes that Series
# to disk, so each value in the output file is itself a JSON-encoded string.
records_json_by_category = grouped_dataframe.apply(_group_to_records_json)
records_json_by_category.to_json('data/processed/grouped_courses.json')

### Course embeddings

In [2]:
from src.models.courses.recommender import Recommender
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter

from matplotlib.pyplot import cm
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# %matplotlib notebook

In [3]:
# Path to the word-vector binary handed to Recommender (the file name suggests
# the GoogleNews word2vec vectors). Set the WORD2VEC_PATH environment variable
# to point at a local copy; the original machine-specific location remains the
# fallback so existing setups keep working.
w2v_path = os.environ.get(
    "WORD2VEC_PATH",
    "/Users/roman/Word2Vec/GoogleNews-vectors-negative300.bin",
)
recommender = Recommender(w2v_path)

In [4]:
def plot_embedding(X, id2word, title=None):
    """Draw each embedded point as its text label on a new pyplot figure.

    Every column of ``X`` is min-max scaled to [0, 1] before plotting; only
    the first two columns are used as the x and y positions.

    Args:
        X: 2-D array-like of coordinates, one row per point.
        id2word: Indexable by row number; ``id2word[i]`` is the text drawn
            for row ``i``.
        title: Optional figure title; nothing is set when ``None``.
    """
    X = np.asarray(X, dtype=float)
    if X.shape[0]:
        x_min, x_max = np.min(X, 0), np.max(X, 0)
        span = x_max - x_min
        # A constant column has zero span; dividing by it would produce NaN
        # coordinates, so use 1 there and place that column at 0.
        span = np.where(span == 0, 1.0, span)
        X = (X - x_min) / span

    plt.figure()
    plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], id2word[i],
                 fontdict={'weight': 'bold', 'size': 5})
    # Hide the ticks on both axes.
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)

In [33]:
# Embeddings of every course filed under the 'Computer Science' category.
computer_science_id = recommender.category2id['Computer Science']
category_course_ids = recommender.category2courses[computer_science_id]
category_course_embeddings = recommender.course_embeddings[category_course_ids]
# Project to three dimensions; random_state is fixed for repeatable layouts.
tsne = TSNE(
    n_components=3,
    perplexity=10,
    init='pca',
    random_state=0,
)
X_tsne = tsne.fit_transform(category_course_embeddings)

In [34]:
%matplotlib tk
course_names = [recommender.id2course[c].title.split('(')[0] for c in category_course_ids]
# plot_embedding(X_tsne[1000:1100], course_names, "T-SNE data")
n = len(course_names)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
colors = cm.Spectral(np.linspace(0, 1, n))

xx = X_tsne[:n, 0]
yy = X_tsne[:n, 1]
zz = X_tsne[:n, 2]

# plot the 3D data points
ax.scatter(xx, yy, zz, color=colors, label=course_names, s=10)
for i in range(n):
    ax.text(xx[i], yy[i], zz[i], course_names[i], size=8, zorder=1, color='k')

ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
ax.zaxis.set_major_formatter(NullFormatter())

ax.grid(False)

# Hide axes ticks
ax.set_xticks([])
ax.set_yticks([])
ax.set_zticks([])

ax.axis('off')

plt.axis('tight')
plt.legend(loc='best', scatterpoints=1, fontsize=5)
plt.savefig('reports/figures/tsne_courses.pdf', format='pdf', dpi=600)
plt.show()

In [45]:
# Embeddings of every course filed under the 'Computer Science' category.
computer_science_id = recommender.category2id['Computer Science']
category_course_ids = recommender.category2courses[computer_science_id]
category_course_embeddings = recommender.course_embeddings[category_course_ids]
# Project to two dimensions; random_state is fixed for repeatable layouts.
tsne = TSNE(
    n_components=2,
    perplexity=5,
    init='pca',
    random_state=0,
)
X_tsne = tsne.fit_transform(category_course_embeddings)

In [46]:
%matplotlib tk
course_names = [recommender.id2course[c].title.split('(')[0] for c in category_course_ids]
# plot_embedding(X_tsne[1000:1100], course_names, "T-SNE data")
n = len(course_names)
fig = plt.figure()
ax = fig.add_subplot(111)
colors = cm.Spectral(np.linspace(0, 1, n))

xx = X_tsne[:n, 0]
yy = X_tsne[:n, 1]

# plot the 3D data points
ax.scatter(xx, yy, color=colors, label=course_names, s=10)
for i in range(n):
    ax.text(xx[i], yy[i], course_names[i], size=8, zorder=1, color='k')

ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())


ax.grid(False)

# Hide axes ticks
ax.set_xticks([])
ax.set_yticks([])

ax.axis('off')

plt.axis('tight')
plt.legend(loc='best', scatterpoints=1, fontsize=5)
plt.savefig('reports/figures/tsne_courses.pdf', format='pdf', dpi=600)
plt.show()