#### 1.Import Libraries

In [9]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

#### 2.Load Dataset

In [10]:
file_path = r"C:\Users\kavya\Downloads\online_course_recommendation_v2.xlsx"
df = pd.read_excel(file_path)

#### 3.Basic Data Ckecks

In [11]:
print("Shape of dataset:", df.shape)
print("\nColumns:\n", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())
print("\nData types:\n", df.dtypes)
display(df.head())

Shape of dataset: (100000, 14)

Columns:
 ['user_id', 'course_id', 'course_name', 'instructor', 'course_duration_hours', 'certification_offered', 'difficulty_level', 'rating', 'enrollment_numbers', 'course_price', 'feedback_score', 'study_material_available', 'time_spent_hours', 'previous_courses_taken']

Missing values:
 user_id                     0
course_id                   0
course_name                 0
instructor                  0
course_duration_hours       0
certification_offered       0
difficulty_level            0
rating                      0
enrollment_numbers          0
course_price                0
feedback_score              0
study_material_available    0
time_spent_hours            0
previous_courses_taken      0
dtype: int64

Data types:
 user_id                       int64
course_id                     int64
course_name                  object
instructor                   object
course_duration_hours       float64
certification_offered        object
difficulty_le

Unnamed: 0,user_id,course_id,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken
0,15796,9366,Python for Beginners,Emma Harris,39.1,Yes,Beginner,5.0,21600,317.5,0.797,Yes,17.6,4
1,861,1928,Cybersecurity for Professionals,Alexander Young,36.3,Yes,Beginner,4.3,15379,40.99,0.77,Yes,28.97,9
2,38159,9541,DevOps and Continuous Deployment,Dr. Mia Walker,13.4,Yes,Beginner,3.9,6431,380.81,0.772,Yes,52.44,4
3,44733,3708,Project Management Fundamentals,Benjamin Lewis,58.3,Yes,Beginner,3.1,48245,342.8,0.969,No,22.29,6
4,11285,3361,Ethical Hacking Masterclass,Daniel White,30.8,Yes,Beginner,2.8,34556,381.01,0.555,Yes,22.01,5


#### 4. Quick EDA

In [12]:
# Check some statistics
display(df.describe())


Unnamed: 0,user_id,course_id,course_duration_hours,rating,enrollment_numbers,course_price,feedback_score,time_spent_hours,previous_courses_taken
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,24935.66357,5006.86356,52.38354,3.959859,25052.82285,261.073369,0.746865,20.691054,4.99267
std,14406.960108,2882.085456,27.42347,0.729958,14388.188356,139.013758,0.143683,13.669148,2.237259
min,1.0,1.0,5.0,1.0,50.0,20.0,0.092,1.0,0.0
25%,12487.75,2523.0,28.6,3.5,12583.75,140.1,0.649,9.9,3.0
50%,24793.5,5012.0,52.4,4.0,25057.5,262.31,0.75,19.92,5.0
75%,37382.25,7502.0,76.1,4.5,37522.0,381.7025,0.851,30.08,6.0
max,49999.0,9999.0,100.0,5.0,49999.0,500.0,1.0,84.15,19.0


In [13]:
# Check distribution of key categorical columns (if any)
if 'course_name' in df.columns:
    print("\nTop courses:\n", df['course_name'].value_counts().head())


Top courses:
 course_name
Networking and System Administration    5107
Cybersecurity for Professionals         5078
Advanced Machine Learning               5063
Ethical Hacking Masterclass             5062
Graphic Design with Canva               5057
Name: count, dtype: int64


#### 5.Data Preprocessing

In [14]:
# Define numerical and categorical columns
numerical_features = ['course_duration_hours', 'rating', 'enrollment_numbers', 
                      'course_price', 'feedback_score', 'time_spent_hours']
categorical_features = ['certification_offered', 'difficulty_level', 'study_material_available']

# Build preprocessing pipeline
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


#### 6.Build the KNN Model


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# Example: build X_sparse
# Text feature
vectorizer = TfidfVectorizer()
course_name_vec = vectorizer.fit_transform(df['course_name'])

# Categorical features
ohe = OneHotEncoder()
categorical_features = df[['certification_offered', 'difficulty_level', 'study_material_available']]
categorical_encoded = ohe.fit_transform(categorical_features)

# Combine all features
X_sparse = hstack([course_name_vec, categorical_encoded])

print("✅ X_sparse created successfully!")


✅ X_sparse created successfully!


In [17]:
# Initialize KNN model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(X_sparse)


### 7.Recommendation Function

In [None]:
def recommend_similar_courses_by_name(course_name, n_recommendations=5):
    """
    Recommend similar courses based on course name.
    Shows results in a nice DataFrame with similarity scores.
    """
    # Check if the course exists
    if course_name not in df['course_name'].values:
        print(f"❌ Course '{course_name}' not found!")
        return
    
    # Find the index of the course
    course_index = df[df['course_name'] == course_name].index[0]
    
    # Get nearest neighbors
    distances, indices = knn_model.kneighbors(X_sparse[course_index], n_neighbors=n_recommendations + 1)
    
    # Prepare recommendations
    recommendations = []
    for idx, dist in zip(indices.flatten()[1:], distances.flatten()[1:]):  # Skip itself
        recommendations.append({
            'Course Name': df.iloc[idx]['course_name'],
            'Difficulty Level': df.iloc[idx]['difficulty_level'],
            'Similarity Score': round(1 - dist, 2)  # 1 - cosine distance = similarity
        })
    
    recommendations_df = pd.DataFrame(recommendations)
    
    print(f"\n🎯 Courses similar to '{course_name}':\n")
    return recommendations_df


In [None]:
recommend_similar_courses_by_name('Machine Learning for Beginners')


### 8.Try a Sample Recommendation

In [None]:

# Set seed for reproducibility
random.seed(42)

# Choose a random course
random_course_index = random.choice(range(len(df)))

# Recommend similar courses
recommend_similar_courses(random_course_index)


#### 9. Evaluation: Precision

In [None]:
def precision_at_k(course_index, k=5):
    """
    Precision@k based on matching difficulty level.
    """
    true_level = df.iloc[course_index]['difficulty_level']
    distances, indices = knn_model.kneighbors(X_sparse[course_index], n_neighbors=k+1)
    recommended_indices = indices.flatten()[1:]  # skip self

    relevant = 0
    for idx in recommended_indices:
        if df.iloc[idx]['difficulty_level'] == true_level:
            relevant += 1

    precision = relevant / k
    return precision

# Example evaluation
precision = precision_at_k(random_course_index, k=5)
print(f"\nPrecision@5 for selected course: {precision:.2f}")


#### 10.Save the Model and Pipeline(optional)

In [19]:
# Save the KNN model
joblib.dump(knn_model, 'knn_model.pkl')

# Save the preprocessing pipeline
joblib.dump(pipeline, 'preprocessing_pipeline.pkl')

print("\n✅ Model and pipeline saved successfully!")


✅ Model and pipeline saved successfully!


In [18]:
# ===========================================
# Save models for Content-Based Filtering
# ===========================================

import joblib

# Save your fitted pipeline
joblib.dump(pipeline, 'content_pipeline.pkl')

# Save your trained KNN model
joblib.dump(knn_model, 'content_knn_model.pkl')

print("✅ Content-based models saved successfully!")


✅ Content-based models saved successfully!


#  content-based.ipynb (tune n_neighbors & metric of your KNN model)

In [20]:
# ===========================================
# 🔍 Hyperparameter Tuning: Content-Based KNN
# ===========================================

from sklearn.neighbors import NearestNeighbors
import numpy as np

# 1) Define ranges to search
neighbors_range = [3, 5, 10, 15, 20]
metrics = ['cosine', 'euclidean']

best_score = 0.0
best_params = {}

# 2) Precision@K helper (based on difficulty_level match)
def precision_at_k_cb(course_index, model, k=5):
    distances, indices = model.kneighbors(X_sparse[course_index], n_neighbors=k+1)
    rec_idxs = indices.flatten()[1:]  # skip itself
    true_level = df.iloc[course_index]['difficulty_level']
    rec_levels = [df.iloc[i]['difficulty_level'] for i in rec_idxs]
    return sum(1 for lvl in rec_levels if lvl == true_level) / k

# 3) Sample a set of courses for validation
np.random.seed(42)
sample_indices = np.random.choice(len(df), size=30, replace=False)

# 4) Grid-search
for n in neighbors_range:
    for metric in metrics:
        model = NearestNeighbors(n_neighbors=n, metric=metric, algorithm='brute')
        model.fit(X_sparse)
        
        scores = [precision_at_k_cb(idx, model, k=5) for idx in sample_indices]
        avg_score = np.mean(scores)
        
        if avg_score > best_score:
            best_score = avg_score
            best_params = {'n_neighbors': n, 'metric': metric}

print(f"✅ Best Content-Based KNN params: {best_params} → Precision@5 = {best_score:.2f}")


✅ Best Content-Based KNN params: {'n_neighbors': 3, 'metric': 'cosine'} → Precision@5 = 1.00
