# Course Recommender System - Model Training
This notebook trains models for a course recommender system. It includes an Artificial Neural Network (ANN) for collaborative filtering and generates course embeddings for content-based recommendations. The trained models and data are saved for use in a Streamlit app.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

2025-05-15 18:53:18.944003: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747335199.136538      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747335199.190680      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load Data
Load your user interaction and course data. Adjust the file paths as needed based on your dataset location.

In [None]:
# Load user interaction data
ratings_file = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/ratings.csv"
user_interactions = pd.read_csv(ratings_file)

# Load course data
courses_file = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
courses = pd.read_csv(courses_file)

# Rename columns to match expected format
user_interactions = user_interactions.rename(columns={'user': 'user_id', 'item': 'course_id', 'rating': 'rating'})
courses = courses.rename(columns={'COURSE_ID': 'course_id', 'TITLE': 'title', 'DESCRIPTION': 'description'})

# Ensure courses DataFrame has genre if needed, or create placeholder column
if 'genre' not in courses.columns:
    courses['genre'] = 'Unknown'

# Ensure all course IDs in user_interactions are in courses DataFrame and vice versa for consistency
interaction_course_ids = set(user_interactions['course_id'].unique())
course_ids = set(courses['course_id'].unique())
missing_in_courses = interaction_course_ids - course_ids
if missing_in_courses:
    print(f"Warning: {len(missing_in_courses)} course IDs from interactions not found in courses data. Adding placeholders.")
    missing_df = pd.DataFrame({
        'course_id': list(missing_in_courses),
        'title': [f'Unknown Course {cid}' for cid in missing_in_courses],
        'description': [f'Description for unknown course {cid}' for cid in missing_in_courses],
        'genre': 'Unknown'
    })
    courses = pd.concat([courses, missing_df], ignore_index=True)

missing_in_interactions = course_ids - interaction_course_ids
if missing_in_interactions:
    print(f"Note: {len(missing_in_interactions)} course IDs from courses data not in interactions. These will be included for content-based recommendations.")

# Update course_ids after adding missing ones
course_ids = set(courses['course_id'].unique())
# Filter user_interactions to only include course IDs present in courses after update
user_interactions = user_interactions[user_interactions['course_id'].isin(course_ids)]

# Display data shapes
print("User Interactions Shape:", user_interactions.shape)
print("Courses Shape:", courses.shape)
user_interactions.head()

Note: 181 course IDs from courses data not in interactions. These will be included for content-based recommendations.
User Interactions Shape: (233306, 3)
Courses Shape: (307, 4)


Unnamed: 0,user_id,course_id,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0


## Preprocess Data for Collaborative Filtering (ANN)

In [4]:
# Encode user_id and course_id to continuous integers
user_encoder = LabelEncoder()
course_encoder = LabelEncoder()

user_interactions['user_id_encoded'] = user_encoder.fit_transform(user_interactions['user_id'])
user_interactions['course_id_encoded'] = course_encoder.fit_transform(user_interactions['course_id'])

# Prepare features and target
X = user_interactions[['user_id_encoded', 'course_id_encoded']].values
y = user_interactions['rating'].values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of unique users and courses
num_users = len(user_encoder.classes_)
num_courses = len(course_encoder.classes_)
embedding_size = 100  # Increased embedding size for potentially better representation

print("Number of Users:", num_users)
print("Number of Courses:", num_courses)

Number of Users: 33901
Number of Courses: 126


## Build and Train ANN Model

In [5]:
# Define the ANN model
def build_ann_model(num_users, num_courses, embedding_size):
    user_input = tf.keras.layers.Input(shape=(1,), name='user_input')
    course_input = tf.keras.layers.Input(shape=(1,), name='course_input')
    
    user_embedding = tf.keras.layers.Embedding(num_users, embedding_size, name='user_embedding')(user_input)
    course_embedding = tf.keras.layers.Embedding(num_courses, embedding_size, name='course_embedding')(course_input)
    
    user_vecs = tf.keras.layers.Flatten()(user_embedding)
    course_vecs = tf.keras.layers.Flatten()(course_embedding)
    
    concat = tf.keras.layers.Concatenate()([user_vecs, course_vecs])
    dense = tf.keras.layers.Dense(128, activation='relu')(concat)
    dense = tf.keras.layers.Dropout(0.2)(dense)
    dense = tf.keras.layers.Dense(64, activation='relu')(dense)
    output = tf.keras.layers.Dense(1, activation='linear')(dense)
    
    model = tf.keras.Model(inputs=[user_input, course_input], outputs=output)
    return model

# Build the model
ann_model = build_ann_model(num_users, num_courses, embedding_size)

# Compile the model
ann_model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

# Summary of the model
ann_model.summary()

I0000 00:00:1747335293.393499      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [6]:
# Train the model
history = ann_model.fit(
    [X_train[:, 0], X_train[:, 1]], y_train,
    validation_data=([X_test[:, 0], X_test[:, 1]], y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)

Epoch 1/10


I0000 00:00:1747335301.970255      97 service.cc:148] XLA service 0x78da6400b090 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747335301.970892      97 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1747335302.215644      97 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  71/5833[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12s[0m 2ms/step - loss: 5.3534 - root_mean_squared_error: 2.2738

I0000 00:00:1747335303.257702      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5833/5833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - loss: 0.2547 - root_mean_squared_error: 0.4202 - val_loss: 0.0214 - val_root_mean_squared_error: 0.1461
Epoch 2/10
[1m5833/5833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - loss: 0.0148 - root_mean_squared_error: 0.1212 - val_loss: 0.0100 - val_root_mean_squared_error: 0.1000
Epoch 3/10
[1m5833/5833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - loss: 0.0069 - root_mean_squared_error: 0.0831 - val_loss: 0.0141 - val_root_mean_squared_error: 0.1189
Epoch 4/10
[1m5833/5833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - loss: 0.0064 - root_mean_squared_error: 0.0797 - val_loss: 0.0141 - val_root_mean_squared_error: 0.1188
Epoch 5/10
[1m5833/5833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - loss: 0.0059 - root_mean_squared_error: 0.0770 - val_loss: 0.0170 - val_root_mean_squared_error: 0.1305
Epoch 6/10
[1m5833/5833[0m [32m━━

## Generate Course Embeddings for Content-Based Recommendations

In [None]:
# Use TF-IDF to create embeddings from course descriptions
tfidf = TfidfVectorizer(max_features=100)
course_embeddings = tfidf.fit_transform(courses['description']).toarray()

print("Course Embeddings Shape:", course_embeddings.shape)

Course Embeddings Shape: (307, 100)


## Save Models and Data for Streamlit App

In [8]:
# Create a directory to save models and data
os.makedirs('model_data', exist_ok=True)

# Save the ANN model
ann_model.save('model_data/ann_model.h5')

# Save encoders
with open('model_data/user_encoder.pkl', 'wb') as f:
    pickle.dump(user_encoder, f)
with open('model_data/course_encoder.pkl', 'wb') as f:
    pickle.dump(course_encoder, f)

# Save course embeddings
np.save('model_data/course_embeddings.npy', course_embeddings)

# Save datasets
courses.to_csv('model_data/courses.csv', index=False)
user_interactions.to_csv('model_data/user_interactions.csv', index=False)

print("Models and data saved to 'model_data' directory.")

Models and data saved to 'model_data' directory.
