In [42]:
import kagglehub
import time
import math
import spacy
import nltk
import ssl

import pandas as pd
import numpy as np
import networkx as nx

from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

In [11]:
# Load the course table, keep only English / Indonesian rows, then drop
# every row that still has a missing value in any column.
course = (
    pd.read_csv('Course_info.csv')
    .loc[lambda frame: frame['language'].isin({'English', 'Indonesian'})]
    .dropna()
)

In [None]:
def attributes(data, shift='avg_rating'):
  """Split the column names of `data` into categorical and numerical groups.

  Args:
    data: DataFrame to inspect.
    shift: name of a column that is reported as categorical instead of
      numerical, whatever its dtype.

  Returns:
    (categorical, numerical): two lists of column names.
      categorical holds every object/bool column (in frame order) followed by
      `shift`; numerical holds every numeric column except `id` and `shift`.
  """
  categorical = data.select_dtypes(include=['object', 'bool']).columns.tolist()
  # Avoid listing `shift` twice when it is already an object/bool column.
  if shift not in categorical:
    categorical.append(shift)

  # Filter `id` by name instead of dropping it, so a frame without a numeric
  # `id` column no longer raises KeyError.
  numerical = [
      col for col in data.select_dtypes(include='number').columns
      if col != 'id' and col != shift]

  return categorical, numerical

def outliers_handling(data, features, alpha=0.1):
  """Drop rows that fall outside the [alpha, 1 - alpha] quantile band.

  A row is removed when, for at least one column in `features`, its value is
  strictly below the `alpha` quantile or strictly above the `1 - alpha`
  quantile of that column (quantiles are taken over the full `data`).

  Args:
    data: source DataFrame (not modified).
    features: list of numeric column names to check and to log-transform.
    alpha: tail fraction cut on each side.

  Returns:
    (trim, log_trim): the filtered frame, and a copy of it in which the
      `features` columns are replaced by `log1p` of their values.
  """
  flagged = set()

  for name in features:
    column = data[name]
    low_cut = column.quantile(alpha)
    high_cut = column.quantile(1 - alpha)

    is_outlier = (column < low_cut) | (column > high_cut)
    flagged.update(column.index[is_outlier.to_numpy()])

  # Rows are removed by index label, so every row sharing a flagged label goes.
  trim = data.drop(index=flagged)

  log_trim = trim.copy()
  log_trim[features] = np.log1p(trim[features])

  return trim, log_trim

def features_type(data):
  """Return the fixed mapping from feature family to column name(s).

  `data` is accepted but not read; the mapping is a constant.
  """
  text_columns = ['title', 'headline']
  nominal_columns = ['is_paid', 'category', 'subcategory']
  date_columns = ['published_time', 'last_update_date']

  return dict(
      semantic=text_columns,
      nominal=nominal_columns,
      datetime=date_columns,
      high_cardinal='instructor_name',
      ordinal='avg_rating')

def calc_smoothed_instructor_rating(data, feature, rating='avg_rating', subscriber='num_subscribers', weight=50):
  """Score each row by its instructor's subscriber-weighted, smoothed rating.

  For every group of `feature` the subscriber-weighted mean rating is pulled
  towards the global subscriber-weighted mean:

      smoothed = (sum(rating * subs) + weight * global_avg) / (sum(subs) + weight)

  Args:
    data: DataFrame holding the `feature`, `rating` and `subscriber` columns.
    feature: column to group by (e.g. the instructor name).
    rating: rating column name.
    subscriber: subscriber-count column name.
    weight: pseudo-count of subscribers assigned to the global average.

  Returns:
    A new DataFrame with columns [`rating`, 'instructor_score'], both cast to
    int64 (the cast truncates the fractional part).
    NOTE(review): truncation discards most of the rating resolution - confirm
    it is intended rather than rounding.

  Side effects:
    Adds / overwrites the 'engagement' and 'instructor_score' columns on
    `data`. The `rating` column of `data` is left untouched, so calling the
    function twice gives the same result.

  Raises:
    Whatever `astype('int64')` raises when a score is missing, e.g. for rows
    whose `feature` value is NaN (such rows are not part of any group).
  """
  engagement = data[rating] * data[subscriber]

  totals = pd.DataFrame({
      'total_rating': engagement,
      'total_subs': data[subscriber]}).groupby(data[feature]).sum()

  overall_subs = data[subscriber].sum()
  # With no subscribers at all the weighted mean is undefined; fall back to
  # the plain mean rating.
  global_avg = engagement.sum() / overall_subs if overall_subs else data[rating].mean()

  # total_subs * weighted_avg equals total_rating, so use total_rating
  # directly: this avoids the 0/0 NaN for instructors without subscribers.
  smoothed = (
      (totals['total_rating'] + weight * global_avg) /
      (totals['total_subs'] + weight)).fillna(global_avg)

  scores = pd.DataFrame({
      rating: data[rating],
      'instructor_score': data[feature].map(smoothed)}).astype('int64')

  # Keep the helper columns on the caller's frame, but do not truncate the
  # caller's rating column.
  data['engagement'] = engagement
  data['instructor_score'] = scores['instructor_score']

  return scores

In [13]:
# Partition the column names, trim outlier rows on the numerical columns
# (plus a log1p-transformed copy), and fetch the feature-family mapping.
categorical, numerical = attributes(data=course)
course_clean, course_clean_scaled = outliers_handling(data=course, features=numerical)
types = features_type(data=course)

In [None]:
def semantic_preprocessing(data, features, n_neighbors=10):
  """Find each row's nearest neighbours by TF-IDF cosine similarity of its text.

  Every column in `features` is lower-cased, tokenised with NLTK, stripped of
  English stop words and non-alphabetic tokens, and lemmatised. The cleaned
  columns are joined per row with a space, vectorised with TF-IDF (uni- and
  bi-grams, at most 5000 terms) and indexed with a cosine-metric
  nearest-neighbour model.

  Args:
    data: DataFrame holding the text columns (not modified). Values must be
      strings; missing values are expected to have been dropped upstream.
    features: list of text column names.
    n_neighbors: number of neighbours returned per row.

  Returns:
    (similarities, indices): arrays of shape (n_rows, n_neighbors).
      similarities is `1 - cosine distance`; indices are positional row
      numbers. A row is never listed as its own neighbour.
  """
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  def clean(raw):
    # One pass per cell: tokens are already lower-cased, so they can be
    # compared against the stop-word set directly.
    tokens = nltk.word_tokenize(raw.lower())
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens
        if token not in stop_words and token.isalpha())

  cleaned = data[features].apply(lambda col: col.map(clean))
  combined_text = cleaned.apply(' '.join, axis=1)

  vectorizer = TfidfVectorizer(max_features=5000, min_df=3, max_df=0.85, ngram_range=(1,2), use_idf=True, smooth_idf=True)
  tfidf_matrix = vectorizer.fit_transform(combined_text)

  knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
  knn.fit(tfidf_matrix)
  # Querying without X makes scikit-learn exclude each training point from its
  # own neighbour list. Slicing off column 0 is only correct when the point
  # itself sorts first, which fails for duplicate or all-zero TF-IDF rows.
  distances, indices = knn.kneighbors()
  cosine_similarities = 1 - distances

  return cosine_similarities, indices


def numerical_preprocessing(data, features, n_neighbors=10):
  """Find each row's nearest neighbours on the numerical columns.

  The selected columns are passed through scikit-learn's `normalize` with its
  default arguments and indexed with a Euclidean nearest-neighbour model.

  Args:
    data: DataFrame holding the numerical columns.
    features: list of numerical column names.
    n_neighbors: number of neighbours returned per row.

  Returns:
    (similarities, indices): arrays of shape (n_rows, n_neighbors).
      similarities is `1 - euclidean distance`, so it can be negative when the
      distance exceeds 1; indices are positional row numbers. A row is never
      listed as its own neighbour.
  """
  normalized_data = normalize(data[features])

  knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
  knn.fit(normalized_data)
  # Querying without X makes scikit-learn exclude each training point from its
  # own neighbour list; dropping column 0 by hand is wrong when duplicates tie
  # with the point itself.
  distances, indices = knn.kneighbors()
  euclidean_similarities = 1 - distances

  return euclidean_similarities, indices


def nominal_preprocessing(data, features, n_neighbors=10):
  """Find each row's nearest neighbours on the nominal columns.

  `is_paid` is cast to uint8 and `category` / `subcategory` are one-hot
  encoded; the encoded matrix is reduced with PCA (`n_components=0.95`) and
  indexed with a cosine-metric nearest-neighbour model.

  Args:
    data: DataFrame holding the nominal columns (not modified).
    features: columns to select from `data`; must include 'is_paid',
      'category' and 'subcategory', which are the only columns encoded.
    n_neighbors: number of neighbours returned per row.

  Returns:
    (similarities, indices): arrays of shape (n_rows, n_neighbors).
      similarities is `1 - cosine distance`; indices are positional row
      numbers. A row is never listed as its own neighbour.
  """
  nominal = data[features].copy()
  encoded = pd.concat([nominal['is_paid'].astype('uint8'),
                       pd.get_dummies(nominal['category'], prefix='category', dtype='uint8'),
                       pd.get_dummies(nominal['subcategory'], prefix='sub_category', dtype='uint8')],
                      axis=1)

  pca_result = PCA(n_components=0.95).fit_transform(encoded)
  knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
  knn.fit(pca_result)
  # One-hot rows are heavily duplicated, so the point itself is often not the
  # first hit. Querying without X lets scikit-learn exclude it reliably
  # instead of blindly dropping column 0.
  distances, indices = knn.kneighbors()
  cosine_similarities = 1 - distances

  return cosine_similarities, indices


def ordinal_preprocessing(data, n_neighbors=10):
  """Find each row's nearest neighbours on rank-transformed ordinal columns.

  Every column of `data` is replaced by its average rank, each row is scaled
  to unit L2 norm, and the result is indexed with a cosine-metric
  nearest-neighbour model.

  Args:
    data: DataFrame whose columns are all ordinal scores.
    n_neighbors: number of neighbours returned per row.

  Returns:
    (similarities, indices): arrays of shape (n_rows, n_neighbors).
      similarities is `1 - cosine distance`; indices are positional row
      numbers. A row is never listed as its own neighbour.
  """
  ordinal_data = data.rank(axis=0, method='average')

  normalized_data = normalize(ordinal_data, norm='l2', axis=1)
  nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
  nbrs.fit(normalized_data)

  # Tied ranks produce many identical rows. Querying without X makes
  # scikit-learn exclude the point itself rather than whichever duplicate
  # happens to sort first.
  distances, indices = nbrs.kneighbors()
  cosine_similarities = 1 - distances

  return cosine_similarities, indices


def datetime_preprocessing(data, features, due='2022-10-10', decay=0.01, weights=None, n_neighbors=10):
  """Find each row's nearest neighbours on exponentially decayed date ages.

  For each date column the age in whole days relative to `due` is mapped to
  `exp(-decay * age)`; the resulting matrix is indexed with a cosine-metric
  nearest-neighbour model.

  Args:
    data: DataFrame holding the date columns (not modified).
    features: list of date column names; values must be parseable by
      `pd.to_datetime`. Time-zone information is dropped.
    due: reference date the ages are measured from.
    decay: exponential decay rate per day.
    weights: optional dict mapping '<column>_duration' to a multiplier for
      that decayed column; missing keys default to 1.0.
    n_neighbors: number of neighbours returned per row. The model was
      previously built with a fixed 10 and the first hit was then dropped,
      which gave 9 neighbours.

  Returns:
    (similarities, indices): arrays of shape (n_rows, n_neighbors).
      similarities is `1 - cosine distance`; indices are positional row
      numbers. A row is never listed as its own neighbour.
  """
  reference = pd.to_datetime(due)
  decayed = pd.DataFrame(index=data.index)

  for col in features:
    stamps = pd.to_datetime(data[col]).dt.tz_localize(None)
    age_days = (reference - stamps).dt.days
    decayed[f"{col}_duration"] = np.exp(-decay * age_days)

  date_matrix = decayed.to_numpy(dtype=float, copy=True)

  if weights:
    for i, col in enumerate(decayed.columns):
      date_matrix[:, i] *= weights.get(col, 1.0)

  model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
  model.fit(date_matrix)
  # Many rows share identical dates. Querying without X makes scikit-learn
  # exclude the point itself instead of relying on it being the first hit.
  distances, indices = model.kneighbors()
  cosine_similarities = 1 - distances

  return cosine_similarities, indices

In [32]:
# Text similarities from the semantic columns; neighbour indices are discarded.
G_semantic, _ = semantic_preprocessing(data=course_clean, features=types['semantic'])

In [33]:
# Similarities from the nominal columns; neighbour indices are discarded.
G_nominal, _ = nominal_preprocessing(data=course_clean, features=types['nominal'])

In [36]:
# Similarities from the log1p-transformed numerical columns; indices are discarded.
G_numeric, _ = numerical_preprocessing(data=course_clean_scaled, features=numerical)

In [34]:
# Build the rating / instructor-score table, then take its neighbour
# similarities; neighbour indices are discarded.
ordinal_mod = calc_smoothed_instructor_rating(data=course_clean_scaled, feature=types['high_cardinal'])
G_ordinal, _ = ordinal_preprocessing(data=ordinal_mod)

In [35]:
# Similarities from the decayed date columns; neighbour indices are discarded.
G_datetime, _ = datetime_preprocessing(data=course_clean, features=types['datetime'])