# **1. Import Library**

In [51]:
# Standard Libraries
import os
import time
import random
from tqdm import tqdm # type: ignore
import pickle
from collections import Counter

# Seed Setup
import numpy as np
np.random.seed(0)  # Set seed sebelum pengacakan

# Data Manipulation & scrapping
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Pandas Configuration
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.float_format", lambda x: "%.4f" % x)

# Warnings Configuration
import warnings
warnings.filterwarnings("ignore")

In [52]:
# Setup logging configuration
import logging
# Setup logging configuration with emojis and structured output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# **2. Load Dataset**

In [53]:
mainPath =  'C:/Users/rendi/ITTS DATA SCIENCE/Semester 8/Laskar AI - Dicoding x NVIDIA/Capstone Project/Capstone - Course Recommender Systems/' # Path of the main project
dataPath = os.path.join(mainPath, 'Dataset/') # Path of the dataset

In [54]:
# Load the CSV file from the URL
df_courses = pd.read_csv(dataPath + 'udemy_courses_new.csv')
df_interactions = pd.read_csv(dataPath + 'synthetic_user_interactions.csv')
df_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
 12  total_interactions   3672 non-null   int64  
 13  total_users          3672 non-null   int64  
dtypes: bool(1), float64(1), int64(7), object(5)
memory usage: 376.7+ KB


In [55]:
links_to_try = (df_courses["url"].sample(4, random_state=42)).tolist()  # Display 10 random URLs from the dataset
links_to_try.extend([
    'https://www.udemy.com/rails-ecommerce-app-with-html-template-from-themeforest/',
    'https://www.udemy.com/introduction-to-json-javascript-object-notation-tutorial/'
]) # Add a specific course URL to the list
display(links_to_try)  # Display the list of URLs to try

['https://www.udemy.com/how-to-play-guitar-really-understand-music/',
 'https://www.udemy.com/wordpress-website-for-beginners/',
 'https://www.udemy.com/the-most-popular-techniques-in-photoshop/',
 'https://www.udemy.com/beginner-blues-guitar/',
 'https://www.udemy.com/rails-ecommerce-app-with-html-template-from-themeforest/',
 'https://www.udemy.com/introduction-to-json-javascript-object-notation-tutorial/']

In [56]:
def get_image_url_with_bs4(course_urls):
    for course_url in course_urls:
        try:
            response = requests.get(course_url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # img_tag = soup.select_one('span.intro-asset--img-aspect--3gluH img')
                img_tag = soup.find('span', class_='intro-asset--img-aspect--3gluH').find('img')
                if img_tag:
                    print(f"Image URL from {course_url}: {img_tag.get('src')}")
                else:
                    print(f"Image not found in {course_url}")
            else:
                print(f"Failed to retrieve page {course_url}. Status code: {response.status_code}")
        except Exception as e:
            print(f"Error retrieving image from {course_url}: {e}")

# Run test with bs4
get_image_url_with_bs4(links_to_try)

Image URL from https://www.udemy.com/how-to-play-guitar-really-understand-music/: https://img-c.udemycdn.com/course/240x135/26648_a8f3_3.jpg
Error retrieving image from https://www.udemy.com/wordpress-website-for-beginners/: 'NoneType' object has no attribute 'find'
Error retrieving image from https://www.udemy.com/the-most-popular-techniques-in-photoshop/: 'NoneType' object has no attribute 'find'
Image URL from https://www.udemy.com/beginner-blues-guitar/: https://img-c.udemycdn.com/course/240x135/476988_a197_20.jpg
Image URL from https://www.udemy.com/rails-ecommerce-app-with-html-template-from-themeforest/: https://img-c.udemycdn.com/course/240x135/858762_9ebd_3.jpg
Image URL from https://www.udemy.com/introduction-to-json-javascript-object-notation-tutorial/: https://img-c.udemycdn.com/course/240x135/570416_fa18_5.jpg


# **3. Scraping Link Banner Courses Here**

In [57]:
# # Function to get image URL with error handling, logging, and progress
# def get_image_url(course_url, course_index, total_courses):
#     start_time = time.time()  # Start timing the request
    
#     try:
#         response = requests.get(course_url)
        
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             img_tag = soup.find('span', class_='intro-asset--img-aspect--3gluH').find('img')
            
#             # Calculate the elapsed time
#             elapsed_time = time.time() - start_time
            
#             # Log successful image retrieval with emoji and elapsed time
#             logger.info(f"✅ Course {course_index}/{total_courses}: Image found in {elapsed_time:.2f}s")
#             return img_tag['src'] if img_tag else None
#         else:
#             # Log the error with a warning emoji
#             logger.warning(f"⚠️ Course {course_index}/{total_courses}: Failed to retrieve image (HTTP {response.status_code})")
#             return None
    
#     except Exception as e:
#         # Log the error with a red warning emoji
#         logger.error(f"❌ Course {course_index}/{total_courses}: Error retrieving image - {e}")
#         return None

In [58]:
# start_time = time.time()  # Start overall timing
# total_courses = len(df_courses)

# # Use tqdm for progress bar
# tqdm_bar = tqdm(df_courses.iterrows(), total=total_courses, desc="Scraping Courses", unit="course")

# image_urls = []

# # Loop through all courses and apply the get_image_url function
# for idx, row in tqdm_bar:
#     course_url = row['url']
#     image_url = get_image_url(course_url, idx + 1, total_courses)
#     image_urls.append(image_url)
    
#     # Update progress bar description with current status
#     tqdm_bar.set_postfix({"Course": f"{idx + 1}/{total_courses}"})

# # Add the image URLs to the DataFrame
# df_courses['image_banner_url'] = image_urls

# # Calculate and log the overall execution time
# execution_time = time.time() - start_time
# logger.info(f"⏱️ Total execution time: {execution_time:.2f}s")

In [59]:
# Function to get image URL with error handling and timing
def get_image_url(course_url, course_index, total_courses):
    start_time = time.time()  # Start timing the request
    
    try:
        response = requests.get(course_url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            img_tag = soup.find('span', class_='intro-asset--img-aspect--3gluH').find('img')
            
            # Calculate the elapsed time
            elapsed_time = time.time() - start_time
            print(f"Course {course_index}/{total_courses}: Image found in {elapsed_time:.2f}s")
            return img_tag['src'] if img_tag else None
        else:
            print(f"Course {course_index}/{total_courses}: Failed to retrieve image (HTTP {response.status_code})")
            return None
    
    except Exception as e:
        print(f"Course {course_index}/{total_courses}: Error retrieving image - {e}")
        return None

In [60]:
# Start overall timing
start_time = time.time()
total_courses = len(df_courses)

image_urls = []

# Loop through all courses and apply the get_image_url function
for idx, row in df_courses.iterrows():
    course_url = row['url']
    image_url = get_image_url(course_url, idx + 1, total_courses)
    image_urls.append(image_url)
    
    # Monitor and print the current course status
    print(f"Processing: Course {idx + 1}/{total_courses}")

# Add the image URLs to the DataFrame
df_courses['image_banner_url'] = image_urls

# Calculate and print the total execution time
execution_time = time.time() - start_time
print(f"Total execution time: {execution_time:.2f}s")

Course 1/3672: Image found in 2.03s
Processing: Course 1/3672
Course 2/3672: Image found in 2.27s
Processing: Course 2/3672
Course 3/3672: Image found in 3.05s
Processing: Course 3/3672
Course 4/3672: Image found in 1.65s
Processing: Course 4/3672
Course 5/3672: Image found in 2.07s
Processing: Course 5/3672
Course 6/3672: Error retrieving image - 'NoneType' object has no attribute 'find'
Processing: Course 6/3672
Course 7/3672: Image found in 1.96s
Processing: Course 7/3672
Course 8/3672: Image found in 2.06s
Processing: Course 8/3672
Course 9/3672: Error retrieving image - 'NoneType' object has no attribute 'find'
Processing: Course 9/3672
Course 10/3672: Image found in 2.86s
Processing: Course 10/3672
Course 11/3672: Image found in 3.02s
Processing: Course 11/3672
Course 12/3672: Error retrieving image - 'NoneType' object has no attribute 'find'
Processing: Course 12/3672
Course 13/3672: Image found in 2.38s
Processing: Course 13/3672
Course 14/3672: Image found in 1.83s
Processing:

In [61]:
df_courses[df_courses["url"] == "https://www.udemy.com/introduction-to-json-javascript-object-notation-tutorial/"]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,total_interactions,total_users,image_banner_url
2899,570416,JSON Faster Sleeker & Easier Discover the bene...,https://www.udemy.com/introduction-to-json-jav...,True,200,8577,69,31,All Levels,1.5,2015-08-12T18:33:31Z,Web Development,42,42,https://img-c.udemycdn.com/course/240x135/5704...


In [62]:
# Display the dataframe with the new column
display(df_courses.head())

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,total_interactions,total_users,image_banner_url
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,10,10,https://img-c.udemycdn.com/course/240x135/1070...
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,15,14,https://img-c.udemycdn.com/course/240x135/1113...
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,11,11,https://img-c.udemycdn.com/course/240x135/1006...
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance,30,30,https://img-c.udemycdn.com/course/240x135/1210...
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance,50,46,https://img-c.udemycdn.com/course/240x135/1011...


In [63]:
df_courses[["course_title", "url"]].head()

Unnamed: 0,course_title,url
0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...
1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/
2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...
3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...
4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...


# **4.  Save Scraping Result to Dataframe**

In [64]:
df_courses.to_csv(dataPath + '[Before_Fillna] udemy_courses_with_images.csv', index=False)

# **5. Load df with URL Img**

In [65]:
# Load the CSV file from the URL
df_with_url_img = pd.read_csv(dataPath + '[Before_Fillna] udemy_courses_with_images.csv')
df_with_url_img.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
 12  total_interactions   3672 non-null   int64  
 13  total_users          3672 non-null   int64  
 14  image_banner_url     2062 non-null   object 
dtypes: bool(1), float64(1), int64(7), obje

In [72]:
# links_to_courses_dont_have_banner_image (List of URLs from df_with_url_img)
urls_to_open = df_with_url_img[df_with_url_img['image_banner_url'].isna()]["url"].tolist()
df_with_url_img["image_banner_url"].isnull().sum()

np.int64(1610)

In [79]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random

# Set up Chrome options (no headless mode, browser will open)
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Remove this line to display the browser

# Set up WebDriver (assuming chromedriver is installed and in your PATH)
driver = webdriver.Chrome(options=chrome_options)

# Open each URL with a delay between them
for idx, url in enumerate(urls_to_open):
    driver.get(url)
    # Print the current URL with its index and total number of URLs
    print(f"Opened {idx + 1}/{len(urls_to_open)} {url}")
    
    # Adding a random delay between 1 and 3 seconds to avoid making too many requests in quick succession
    delay_time = random.uniform(1, 3)  # Random delay between 1 and 3 seconds
    time.sleep(0)

# Close the driver after opening all URLs
# driver.quit()  # Uncomment this line if you want to close the browser at the end

Opened 1/1610 https://www.udemy.com/trading-penny-stocks-a-guide-for-all-levels/
Opened 2/1610 https://www.udemy.com/day-trading-stock-options-3/
Opened 3/1610 https://www.udemy.com/trading-options-using-money-flow/
Opened 4/1610 https://www.udemy.com/hedge-fund-strategy-trading-with-sentiment-analysis/
Opened 5/1610 https://www.udemy.com/learn-basic-technical-analysis/
Opened 6/1610 https://www.udemy.com/7-deadly-mistakes-of-investing/
Opened 7/1610 https://www.udemy.com/intro-to-financial-statements-for-entrepreneurs/
Opened 8/1610 https://www.udemy.com/forexmacross/
Opened 9/1610 https://www.udemy.com/launch-your-amazing-forex-robot-in-30-minutes-no-coding/
Opened 10/1610 https://www.udemy.com/work-from-home-buying-penny-stocks/
Opened 11/1610 https://www.udemy.com/ethereum/
Opened 12/1610 https://www.udemy.com/learn-mql5/
Opened 13/1610 https://www.udemy.com/algorithmic-stock-trading-bootcamp-automate-your-trading/
Opened 14/1610 https://www.udemy.com/the-forex-robot-1000-annual-pr

In [80]:
df_with_url_img["image_banner_url"].fillna("https://raw.githubusercontent.com/Rendika7/NextCourses-RecommenderSystem/refs/heads/main/assets/if_courses_don't_have_banner_image.jpg", inplace=True)

In [81]:
df_with_url_img.to_csv(dataPath + '[After_Fillna] udemy_courses_with_images.csv', index=False)