####  1. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

#### 2. Load the dataset

In [2]:
file_path = r"C:\Users\kavya\Downloads\online_course_recommendation_v2.xlsx"
df = pd.read_excel(file_path)

####  3. Basic Data Checks

In [3]:
print("Shape of dataset:", df.shape)
print("\nColumns:\n", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())
print("\nData types:\n", df.dtypes)
display(df.head())

Shape of dataset: (100000, 14)

Columns:
 ['user_id', 'course_id', 'course_name', 'instructor', 'course_duration_hours', 'certification_offered', 'difficulty_level', 'rating', 'enrollment_numbers', 'course_price', 'feedback_score', 'study_material_available', 'time_spent_hours', 'previous_courses_taken']

Missing values:
 user_id                     0
course_id                   0
course_name                 0
instructor                  0
course_duration_hours       0
certification_offered       0
difficulty_level            0
rating                      0
enrollment_numbers          0
course_price                0
feedback_score              0
study_material_available    0
time_spent_hours            0
previous_courses_taken      0
dtype: int64

Data types:
 user_id                       int64
course_id                     int64
course_name                  object
instructor                   object
course_duration_hours       float64
certification_offered        object
difficulty_le

Unnamed: 0,user_id,course_id,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken
0,15796,9366,Python for Beginners,Emma Harris,39.1,Yes,Beginner,5.0,21600,317.5,0.797,Yes,17.6,4
1,861,1928,Cybersecurity for Professionals,Alexander Young,36.3,Yes,Beginner,4.3,15379,40.99,0.77,Yes,28.97,9
2,38159,9541,DevOps and Continuous Deployment,Dr. Mia Walker,13.4,Yes,Beginner,3.9,6431,380.81,0.772,Yes,52.44,4
3,44733,3708,Project Management Fundamentals,Benjamin Lewis,58.3,Yes,Beginner,3.1,48245,342.8,0.969,No,22.29,6
4,11285,3361,Ethical Hacking Masterclass,Daniel White,30.8,Yes,Beginner,2.8,34556,381.01,0.555,Yes,22.01,5


#### 4. Quick EDA

In [4]:
# Check some statistics
display(df.describe())


Unnamed: 0,user_id,course_id,course_duration_hours,rating,enrollment_numbers,course_price,feedback_score,time_spent_hours,previous_courses_taken
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,24935.66357,5006.86356,52.38354,3.959859,25052.82285,261.073369,0.746865,20.691054,4.99267
std,14406.960108,2882.085456,27.42347,0.729958,14388.188356,139.013758,0.143683,13.669148,2.237259
min,1.0,1.0,5.0,1.0,50.0,20.0,0.092,1.0,0.0
25%,12487.75,2523.0,28.6,3.5,12583.75,140.1,0.649,9.9,3.0
50%,24793.5,5012.0,52.4,4.0,25057.5,262.31,0.75,19.92,5.0
75%,37382.25,7502.0,76.1,4.5,37522.0,381.7025,0.851,30.08,6.0
max,49999.0,9999.0,100.0,5.0,49999.0,500.0,1.0,84.15,19.0


In [5]:
# Check distribution of key categorical columns (if any)
if 'course_name' in df.columns:
    print("\nTop courses:\n", df['course_name'].value_counts().head())


Top courses:
 course_name
Networking and System Administration    5107
Cybersecurity for Professionals         5078
Advanced Machine Learning               5063
Ethical Hacking Masterclass             5062
Graphic Design with Canva               5057
Name: count, dtype: int64


### 5. Data Preprocessing

In [6]:

# Define correct columns
numerical_features = ['rating', 'course_duration_hours', 'enrollment_numbers', 'course_price', 'feedback_score', 'time_spent_hours']
categorical_features = ['certification_offered', 'difficulty_level', 'study_material_available']

# Build preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

##  6. Build the Collaborative Filtering Model

In [7]:
# Build pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Preprocess features
X = pipeline.fit_transform(df)

# Convert to sparse matrix
X_sparse = csr_matrix(X)

# Build KNN model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
knn_model.fit(X_sparse)


###  7. Function to Get Recommendations

In [8]:
def recommend_courses(course_index, n_recommendations=5):
    distances, indices = knn_model.kneighbors(X_sparse[course_index], n_neighbors=n_recommendations+1)
    print("\nRecommendations for:", df.iloc[course_index]['course_name'])
    print("--------------------------------------------------")
    recommended_indices = indices.flatten()[1:]  # skip first as it will be the same course
    for idx in recommended_indices:
        print(df.iloc[idx]['course_name'])

# Example: Get recommendations for a random course
sample_course_index = 10
recommend_courses(sample_course_index)



Recommendations for: Graphic Design with Canva
--------------------------------------------------
Project Management Fundamentals
Fitness and Nutrition Coaching
Mobile App Development with Swift
DevOps and Continuous Deployment
Graphic Design with Canva


### 8. Save the Model (Optional)

In [9]:

joblib.dump(knn_model, 'course_recommendation_model.pkl')
joblib.dump(pipeline, 'preprocessing_pipeline.pkl')

print("\nModel and preprocessing pipeline saved successfully!")


Model and preprocessing pipeline saved successfully!


### 9. Evaluation: Precision based on difficulty_level similarity

In [10]:
import random

def precision_at_k(course_index, k=5):
    """
    Calculate Precision@k for a given course index based on matching difficulty_level.
    """
    # Get true difficulty level of the selected course
    true_level = df.iloc[course_index]['difficulty_level']
    
    # Get recommendations
    distances, indices = knn_model.kneighbors(X_sparse[course_index], n_neighbors=k+1)
    recommended_indices = indices.flatten()[1:]  # skip self

    # Count how many recommended courses have same difficulty_level
    relevant = 0
    for idx in recommended_indices:
        recommended_level = df.iloc[idx]['difficulty_level']
        if recommended_level == true_level:
            relevant += 1

    precision = relevant / k
    return precision

# Evaluate on 10 random courses
sample_indices = random.sample(range(len(df)), 10)
precisions = []

for idx in sample_indices:
    p_at_5 = precision_at_k(idx, k=5)
    precisions.append(p_at_5)

# Final Average Precision@5
avg_precision_at_5 = np.mean(precisions)
print(f"\nAverage Precision@5 over 10 random courses: {avg_precision_at_5:.2f}")



Average Precision@5 over 10 random courses: 1.00


In [12]:
# ===========================================
# 1. Build Collaborative Filtering Matrices
# ===========================================

import pandas as pd
from scipy.sparse import csr_matrix



# Create user-item rating matrix
user_item_matrix = pd.pivot_table(
    df,
    values='rating',
    index='user_id',
    columns='course_name'
).fillna(0)

# Create sparse version
user_item_sparse = csr_matrix(user_item_matrix.values)

print("✅ Collaborative filtering matrices created!")


✅ Collaborative filtering matrices created!


In [13]:

# ===========================================
# Save models for Collaborative Filtering
# ===========================================

import joblib

# Save your user-item matrix
joblib.dump(user_item_sparse, 'user_item_sparse.pkl')

# (Optional) Save user_item_matrix DataFrame too
joblib.dump(user_item_matrix, 'user_item_matrix.pkl')

print("✅ Collaborative filtering models saved successfully!")


✅ Collaborative filtering models saved successfully!
