In [41]:
import spacy
import nltk
import pickle

import pandas as pd
import numpy as np
import networkx as nx

from sklearn.preprocessing import normalize, StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from scipy.sparse import csr_matrix

In [42]:
course = pd.read_csv('Course_info.csv')
course = course[course['language'].isin({'English', 'Indonesian'})].dropna()

In [None]:
#Trimming Outliers

'''
def outliers_handling(data, features, alpha=0.1):
  outliers_indices = set()

  for col in features:
    upper = data[col].quantile(1-alpha)
    lower = data[col].quantile(alpha)

    outside = data[(data[col] < lower) | (data[col] > upper)]
    outliers_indices.update(outside.index)

  trim = data.drop(index=outliers_indices)
  log_trim = trim.copy()
  log_trim[features] = np.log1p(trim[features])

  return trim, log_trim
'''

In [44]:
def attributes(data, shift='avg_rating'):
  categorical = []
  numerical = []

  for i, cat in enumerate(data.select_dtypes(include = ['object', 'bool']).columns.values):
    categorical.append(cat)
  categorical.append(shift)

  for i, num in enumerate(data.select_dtypes(include = 'number').drop(columns='id').columns.values):
    if num != shift:
      numerical.append(num)

  return categorical, numerical


def data_cleaning(data, features, par=0.9):
  outliers_indices = set()

  for col in features:
    exclude = data[col].quantile(par)
    outliers = data[data[col] > exclude]
    outliers_indices.update(outliers.index)
    
  trim = data.drop(index=outliers_indices)
  
  pt = PowerTransformer(method='yeo-johnson')
  transformed = trim.copy()
  transformed[features] = pt.fit_transform(transformed[features])

  return trim, transformed


def features_type(data):
  return {
      'semantic': ['title', 'headline'],
      'nominal': ['is_paid', 'category', 'subcategory'],
      'datetime': ['published_time', 'last_update_date'],
      'high_cardinal': 'instructor_name',
      'ordinal': 'avg_rating'}


def calc_smoothed_instructor_rating(data, feature, rating='avg_rating', subscriber='num_subscribers', weight=50):
  data['engagement'] = data[rating] * data[subscriber]

  instructor_stats = data.groupby(feature).agg(
      total_rating=('engagement', 'sum'),
      total_subs=(subscriber, 'sum'))

  instructor_stats['weighted_avg'] = instructor_stats['total_rating'] / instructor_stats['total_subs']
  global_avg = data['engagement'].sum() / data[subscriber].sum()
  instructor_stats['smoothed'] = (
      (instructor_stats['total_subs'] * instructor_stats['weighted_avg'] + weight * global_avg) /
      (instructor_stats['total_subs'] + weight))

  data['instructor_score'] = data[feature].map(instructor_stats['smoothed'])
  data.loc[(data[rating] == 0) | (data[subscriber] == 0), 'instructor_score'] = 0
  data[['avg_rating', 'instructor_score']] = data[['avg_rating', 'instructor_score']].astype('int64')

  return data[['avg_rating', 'instructor_score']]

In [45]:
categorical, numerical = attributes(course)
course_clean, course_clean_scaled = data_cleaning(course, numerical)
types = features_type(course_clean)
ordinal_mod = calc_smoothed_instructor_rating(course_clean, types['high_cardinal'])

In [46]:
def semantic_preprocessing(data, features, n_neighbors=10):
  text = data.copy()
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  english_features = text[features].apply(lambda col: col.apply(lambda text: text.lower()))

  for col in features:
    english_features[col] = english_features[col].apply(lambda text: nltk.word_tokenize(text))
    english_features[col] = english_features[col].apply(lambda text: [word for word in text if word.lower() not in stop_words])
    english_features[col] = english_features[col].apply(lambda text: [stemmer.stem(word) for word in text])
    english_features[col] = english_features[col].apply(lambda text: ' '.join(text))

  combined_text = english_features.apply(lambda row: ' '.join(row), axis=1)
  vectorizer = TfidfVectorizer(max_features=5000, min_df=3, max_df=0.85, ngram_range=(1,2), use_idf=True, smooth_idf=True)
  tfidf_matrix = vectorizer.fit_transform(combined_text)

  knn = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine')
  knn.fit(tfidf_matrix)
  distances, indices = knn.kneighbors(tfidf_matrix)
  cosine_similarities = 1 - distances

  return cosine_similarities[:, 1:], indices[:, 1:]


def numerical_preprocessing(data, features, n_neighbors=10):
  normalized_data = normalize(data[features])

  knn = NearestNeighbors(n_neighbors=n_neighbors+1, metric='euclidean')
  knn.fit(normalized_data)
  distances, indices = knn.kneighbors(normalized_data)
  euclidean_similarities = 1 - distances

  return euclidean_similarities[:, 1:], indices[:, 1:]


def nominal_preprocessing(data, features, n_neighbors=10):
  data = data[features].copy()
  categorical = pd.concat([data['is_paid'].astype('uint8'), 
                           pd.get_dummies(data['category'], prefix='category', dtype='uint8'), 
                           pd.get_dummies(data['subcategory'], prefix='sub_category', dtype='uint8')],
                           axis=1)

  pca_result = PCA(n_components=0.95).fit_transform(categorical)
  knn = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine')
  knn.fit(pca_result)
  distances, indices = knn.kneighbors(pca_result)
  cosine_similarities = 1 - distances

  return cosine_similarities[:, 1:], indices[:, 1:]


def ordinal_preprocessing(data, n_neighbors=10):
  ordinal_data = data.rank(axis=0, method='average')

  normalized_data = normalize(ordinal_data, norm='l2', axis=1)
  nbrs = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine')
  nbrs.fit(normalized_data)

  distances, indices = nbrs.kneighbors(normalized_data)
  cosine_similarities = 1 - distances

  return cosine_similarities[:, 1:], indices[:, 1:]


def datetime_preprocessing(data, features, due='2022-10-10', decay=0.01, weights=None, n_neighbors=10):
  data = data[features].copy()

  for col in features:
    data[col] = pd.to_datetime(data[col]).dt.tz_localize(None)
    col_name = f"{col}_duration"
    data[col_name] = (pd.to_datetime(due) - data[col]).dt.days
    data[col_name] = data[col_name].apply(lambda x: np.exp(-decay * x))

  decay_cols = [f"{col}_duration" for col in features]
  date_matrix = data[decay_cols].values

  if weights:
      for i, col in enumerate(decay_cols):
          date_matrix[:, i] *= weights.get(col, 1.0)

  model = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine')
  model.fit(date_matrix)
  distances, indices = model.kneighbors(date_matrix)
  cosine_similarities = 1 - distances

  return cosine_similarities[:, 1:], indices[:, 1:]

In [47]:
G_semantic, i_semantic = semantic_preprocessing(course_clean, types['semantic'])
G_nominal, i_nominal = nominal_preprocessing(course_clean, types['nominal'])
G_numeric, i_numeric = numerical_preprocessing(course_clean_scaled, features=numerical)
G_ordinal, i_ordinal = ordinal_preprocessing(ordinal_mod)
G_datetime, i_datetime= datetime_preprocessing(course_clean, types['datetime'])

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump({
        'G_semantic': G_semantic,
        'i_semantic': i_semantic,
        'G_nominal': G_nominal,
        'i_nominal': i_nominal,
        'G_numeric': G_numeric,
        'i_numeric': i_numeric,
        'ordinal_mod': ordinal_mod,
        'G_ordinal': G_ordinal,
        'i_ordinal': i_ordinal,
        'G_datetime': G_datetime,
        'i_datetime': i_datetime
    }, f)

In [56]:
with open('preprocessed_data.pkl', 'rb') as f:
    data = pickle.load(f)

#similarities
G_semantic = data['G_semantic']
G_nominal = data['G_nominal']
G_numeric = data['G_numeric']
G_ordinal = data['G_ordinal']
G_datetime = data['G_datetime']

#indices 
i_semantic = data['i_semantic']
i_nominal = data['i_nominal']
i_numeric = data['i_numeric']
i_ordinal = data['i_ordinal']
i_datetime = data['i_datetime']

In [59]:
all_features = {
    'semantic': (G_semantic, i_semantic),
    'nominal': (G_nominal, i_nominal),
    'ordinal': (G_numeric, i_numeric),
    'datetime': (G_ordinal, i_ordinal),
    'numeric': (G_datetime, i_datetime)
}

weights = {
    'semantic': 0.4,
    'nominal': 0.3,
    'ordinal': 0.15,
    'datetime': 0.10,
    'numeric': 0.05
}

In [50]:
def aggregate_similarities(all_features, weights, num_items=len(course_clean), n_neighbors=10):
    combined_scores = {}

    for feature_type, (sim_matrix, idx_matrix) in all_features.items():
        weight = weights.get(feature_type, 0)

        for i in range(num_items):
            for j in range(idx_matrix.shape[1]):
                neighbor_idx = idx_matrix[i, j]
                similarity_score = sim_matrix[i, j]
                
                key1 = (min(i, neighbor_idx), max(i, neighbor_idx))
                
                combined_scores[key1] = combined_scores.get(key1, 0.0) + (similarity_score * weight)

    rows, cols, data = [], [], []
    for (r, c), val in combined_scores.items():
        rows.append(r)
        cols.append(c)
        data.append(val)

        if r != c:
            rows.append(c)
            cols.append(r)
            data.append(val)

    final_similarity_matrix = csr_matrix((data, (rows, cols)), shape=(num_items, num_items))

    return final_similarity_matrix


final_sim_matrix = aggregate_similarities(all_features, weights)

print(f"Shape of final similarity matrix: {final_sim_matrix.shape}")
print(f"Number of non-zero elements: {final_sim_matrix.nnz}")

with open('sparse_matrix.pkl', 'wb') as f:
    pickle.dump(final_sim_matrix, f)

Shape of final similarity matrix: (91566, 91566)
Number of non-zero elements: 7939306


In [51]:
def build_item_similarity_graph(similarity_matrix, threshold=0.5, top_n_edges_per_node=30):
    G = nx.Graph()

    for i, item_id in enumerate(course_clean['id']):
        G.add_node(i, item_id=item_id)

    num_items = similarity_matrix.shape[0]

    for i in range(num_items):
        row_sims = similarity_matrix.getrow(i).toarray().flatten()
        candidate_indices = np.where(row_sims > 0)[0] 
        candidate_indices = candidate_indices[candidate_indices != i] 
        
        if len(candidate_indices) == 0:
            continue

        sorted_indices = candidate_indices[np.argsort(-row_sims[candidate_indices])]
        
        if top_n_edges_per_node is not None:
            edges_to_add = sorted_indices[:top_n_edges_per_node]
        else:
            edges_to_add = sorted_indices 

        for j in edges_to_add:
            sim_score = row_sims[j]
            if sim_score > threshold:
                if not G.has_edge(i, j):
                    G.add_edge(i, j, weight=sim_score)
    
    return G

In [60]:
G = build_item_similarity_graph(final_sim_matrix)

print(f"Number of nodes in graph: {G.number_of_nodes()}")
print(f"Number of edges in graph: {G.number_of_edges()}")

with open('network_resource.pkl', 'wb') as g:
     pickle.dump(G, g)

Number of nodes in graph: 91566
Number of edges in graph: 103668


In [None]:
#basic tracing declarations
def numerical_tracing(data, features):
  normalized_data = normalize(data[features])
  numerical_df = pd.DataFrame(normalized_data)

  return numerical_df.describe().T

def nominal_tracing(data, features):
  categorical = pd.concat([data['is_paid'].astype('uint8'), 
                           pd.get_dummies(data['category'], prefix='category', dtype='uint8'), 
                           pd.get_dummies(data['subcategory'], prefix='sub_category', dtype='uint8')],
                           axis=1)

  pca_result = PCA(n_components=0.95).fit_transform(categorical)
  pca_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])

  return pca_df.describe().T


def ordinal_tracing(data):
  ordinal_data = data.rank(axis=0, method='average')
  normalized_data = normalize(ordinal_data, norm='l2', axis=1)
  ordinal_df = pd.DataFrame(normalized_data)

  return ordinal_df.describe().T


def datetime_tracing(data, features, due='2022-10-10', decay=0.01, weights=None):
  for col in features:
    data[col] = pd.to_datetime(data[col]).dt.tz_localize(None)
    col_name = f"{col}_duration"
    data[col_name] = (pd.to_datetime(due) - data[col]).dt.days
    data[col_name] = data[col_name].apply(lambda x: np.exp(-decay * x))
  decay_cols = [f"{col}_duration" for col in features]

  return data[decay_cols].describe().T


#basic tracing executions
def tracing_all():
    ordinal_mod = calc_smoothed_instructor_rating(course_clean_scaled, types['high_cardinal'])
    ordinal = ordinal_tracing(ordinal_mod)
    print('Ordinal Data')
    print(ordinal.to_string())

    numerics = numerical_tracing(course_clean_scaled, numerical)
    print('\nNumerical Data')
    print(numerics.to_string())

    nominal = nominal_tracing(course_clean_scaled, types['nominal'])
    print('\nNominal Data')
    print(nominal.to_string())

    datetime = datetime_tracing(course_clean_scaled, types['datetime'])
    print('\nDatetime Data')
    print(datetime.to_string())  

tracing_all()

In [55]:
#similarity tracing
def similarity_stats(sim_matrix):
    flat_sim = sim_matrix.flatten()
    stats = {
        "Min": np.min(flat_sim),
        "Max": np.max(flat_sim),
        "Mean": np.mean(flat_sim),
        "Std": np.std(flat_sim),
        "25%": np.percentile(flat_sim, 25),
        "50% (Median)": np.percentile(flat_sim, 50),
        "75%": np.percentile(flat_sim, 75)}
    
    return stats

def similarity_tracing():
    results = {}
    results['semantic'] = similarity_stats(G_semantic)
    results['numeric'] = similarity_stats(G_numeric)
    results['nominal'] = similarity_stats(G_nominal)
    results['ordinal'] = similarity_stats(G_ordinal)
    results['datetime'] = similarity_stats(G_datetime)

    df_stats = pd.DataFrame(results).T  # Transpose for readability
    print("\nSimilarity Statistics Summary:")
    
    return df_stats

similarity_tracing()


Similarity Statistics Summary:


Unnamed: 0,Min,Max,Mean,Std,25%,50% (Median),75%
semantic,0.0,1.0,0.545709,0.137603,0.443216,0.529759,0.632516
numeric,0.3424422,1.0,0.880258,0.067472,0.842815,0.889016,0.925392
nominal,0.8122366,1.0,0.999994,0.001075,1.0,1.0,1.0
ordinal,0.9978384,1.0,1.0,1.2e-05,1.0,1.0,1.0
datetime,2.220446e-16,1.0,0.998198,0.042411,1.0,1.0,1.0
