# **1. Import Library**

In [24]:
# Standard Libraries
import os
import time
import random
from tqdm import tqdm # type: ignore
import pickle
from collections import Counter

# Seed Setup
import numpy as np
np.random.seed(0)  # Set seed sebelum pengacakan

# Data Manipulation & scrapping
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Pandas Configuration
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.float_format", lambda x: "%.4f" % x)

# Warnings Configuration
import warnings
warnings.filterwarnings("ignore")

In [25]:
# Setup logging configuration
import logging
# Setup logging configuration with emojis and structured output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# **2. Load Dataset**

In [26]:
mainPath =  'C:/Users/rendi/ITTS DATA SCIENCE/Semester 8/Laskar AI - Dicoding x NVIDIA/Capstone Project/Capstone - Course Recommender Systems/' # Path of the main project
dataPath = os.path.join(mainPath, 'Dataset/') # Path of the dataset

In [27]:
# Load the CSV file from the URL
df_courses = pd.read_csv(dataPath + 'udemy_courses_new.csv')
df_interactions = pd.read_csv(dataPath + 'synthetic_user_interactions.csv')
df_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
 12  total_interactions   3672 non-null   int64  
 13  total_users          3672 non-null   int64  
dtypes: bool(1), float64(1), int64(7), object(5)
memory usage: 376.7+ KB


# **3. Scraping Link Banner Courses Here**

In [28]:
# Function to get image URL with error handling, logging, and progress
def get_image_url(course_url, course_index, total_courses):
    start_time = time.time()  # Start timing the request
    
    try:
        response = requests.get(course_url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            img_tag = soup.find('span', class_='intro-asset--img-aspect--3gluH').find('img')
            
            # Calculate the elapsed time
            elapsed_time = time.time() - start_time
            
            # Log successful image retrieval with emoji and elapsed time
            logger.info(f"✅ Course {course_index}/{total_courses}: Image found in {elapsed_time:.2f}s")
            return img_tag['src'] if img_tag else None
        else:
            # Log the error with a warning emoji
            logger.warning(f"⚠️ Course {course_index}/{total_courses}: Failed to retrieve image (HTTP {response.status_code})")
            return None
    
    except Exception as e:
        # Log the error with a red warning emoji
        logger.error(f"❌ Course {course_index}/{total_courses}: Error retrieving image - {e}")
        return None

In [29]:
start_time = time.time()  # Start overall timing
total_courses = len(df_courses)

# Use tqdm for progress bar
tqdm_bar = tqdm(df_courses.iterrows(), total=total_courses, desc="Scraping Courses", unit="course")

image_urls = []

# Loop through all courses and apply the get_image_url function
for idx, row in tqdm_bar:
    course_url = row['url']
    image_url = get_image_url(course_url, idx + 1, total_courses)
    image_urls.append(image_url)
    
    # Update progress bar description with current status
    tqdm_bar.set_postfix({"Course": f"{idx + 1}/{total_courses}"})

# Add the image URLs to the DataFrame
df_courses['image_banner_url'] = image_urls

# Calculate and log the overall execution time
execution_time = time.time() - start_time
logger.info(f"⏱️ Total execution time: {execution_time:.2f}s")

Scraping Courses:   0%|          | 0/3672 [00:00<?, ?course/s]2025-06-03 07:11:36,163 - INFO - ✅ Course 1/3672: Image found in 1.56s
Scraping Courses:   0%|          | 1/3672 [00:01<1:35:36,  1.56s/course, Course=1/3672]2025-06-03 07:11:38,596 - INFO - ✅ Course 2/3672: Image found in 2.43s
Scraping Courses:   0%|          | 2/3672 [00:03<2:06:52,  2.07s/course, Course=2/3672]2025-06-03 07:11:42,191 - INFO - ✅ Course 3/3672: Image found in 3.59s
Scraping Courses:   0%|          | 3/3672 [00:07<2:49:17,  2.77s/course, Course=3/3672]2025-06-03 07:11:43,787 - INFO - ✅ Course 4/3672: Image found in 1.59s
Scraping Courses:   0%|          | 4/3672 [00:09<2:21:00,  2.31s/course, Course=4/3672]2025-06-03 07:11:46,110 - INFO - ✅ Course 5/3672: Image found in 2.32s
Scraping Courses:   0%|          | 5/3672 [00:11<2:21:18,  2.31s/course, Course=5/3672]2025-06-03 07:11:47,326 - ERROR - ❌ Course 6/3672: Error retrieving image - 'NoneType' object has no attribute 'find'
Scraping Courses:   0%|       

In [30]:
# Display the dataframe with the new column
display(df_courses.head())

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,total_interactions,total_users,image_banner_url
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,10,10,https://img-c.udemycdn.com/course/240x135/1070...
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,15,14,https://img-c.udemycdn.com/course/240x135/1113...
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,11,11,https://img-c.udemycdn.com/course/240x135/1006...
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance,30,30,https://img-c.udemycdn.com/course/240x135/1210...
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance,50,46,https://img-c.udemycdn.com/course/240x135/1011...


In [None]:
# # Fungsi untuk mendapatkan gambar dari URL kursus
# def get_image_url(course_url):
#     try:
#         response = requests.get(course_url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             img_tag = soup.find('span', class_='intro-asset--img-aspect--3gluH').find('img')
#             return img_tag['src']
#         else:
#             return None
#     except Exception as e:
#         print(f"Error retrieving image for {course_url}: {e}")
#         return None

# # Menambahkan kolom "image_banner_url" ke DataFrame dengan iterasi
# df_courses['image_banner_url'] = df_courses['url'].apply(get_image_url)

In [31]:
df_courses[["course_title", "url"]].head()

Unnamed: 0,course_title,url
0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...
1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/
2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...
3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...
4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...


# **4.  Save Scraping Result to Dataframe**

In [32]:
df_courses.to_csv(dataPath + '[Before_Fillna] udemy_courses_with_images.csv', index=False)

# **5. Load df with URL Img**

In [33]:
# Load the CSV file from the URL
df_with_url_img = pd.read_csv(dataPath + '[Before_Fillna] udemy_courses_with_images.csv')
df_with_url_img.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
 12  total_interactions   3672 non-null   int64  
 13  total_users          3672 non-null   int64  
 14  image_banner_url     1588 non-null   object 
dtypes: bool(1), float64(1), int64(7), obje

In [34]:
df_with_url_img["image_banner_url"].isnull().sum()

np.int64(2084)

In [35]:
df_with_url_img["image_banner_url"].fillna("https://raw.githubusercontent.com/LAI25-SM017/NextCourses-RecommenderSystem/refs/heads/main/assets/if_courses_don't_have_banner_image.jpg", inplace=True)

In [36]:
df_with_url_img.to_csv(dataPath + '[After_Fillna] udemy_courses_with_images.csv', index=False)