# YouTube Data Collection with Selenium

This notebook collects comments related to **iPhone 17** using Selenium automation.
The output is stored as a raw CSV file for further NLP analysis.


In [2]:
# Selenium core modules for browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Chrome browser configuration
from selenium.webdriver.chrome.options import Options

# Tools for waiting until elements are loaded
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Exception handling
from selenium.common.exceptions import NoSuchElementException

# Data processing
import pandas as pd

# Time handling
import time
from datetime import datetime


In [3]:
# Chrome browser options
options = Options()
options.add_argument("--start-maximized")  # Open browser in full screen
options.add_argument("--disable-blink-features=AutomationControlled")  # Reduce bot detection

# Initialize Chrome driver with options
driver = webdriver.Chrome(options=options)

try:
    # Open YouTube video page
    driver.get("https://www.youtube.com/watch?v=ZUydETmYCgw")

    # Wait until the video title is loaded
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h1.title"))
    )

    print("YouTube video page loaded successfully.")

except Exception as e:
    # Handle any loading or network errors
    print("Error loading YouTube video:", e)



YouTube video page loaded successfully.


In [4]:
# Prompt user to log in manually for security reasons
print("Please log in to YouTUbe manually.")

try:
    # Wait until YOUtube feed is loaded after login
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, "global-nav-search"))
    )
    print("Login successful, YouTube feed is ready.")

except Exception as e:
    # Handle login timeout or page structure changes
    print("Login timeout or error:", e)



Please log in to LinkedIn manually.
Login timeout or error: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7454c8895
	0x7ff7454c88f0
	0x7ff7452a165d
	0x7ff7452f9a33
	0x7ff7452f9d3c
	0x7ff74534df67
	0x7ff74534ac97
	0x7ff7452eac29
	0x7ff7452eba93
	0x7ff7457e05f0
	0x7ff7457daf30
	0x7ff7457f9696
	0x7ff7454e5d94
	0x7ff7454eed3c
	0x7ff7454d1fb4
	0x7ff7454d2165
	0x7ff7454b7e92
	0x7ffacf32e8d7
	0x7ffad010c53c



In [5]:
# Open YouTube video page
driver.get("https://www.youtube.com/watch?v=ZUydETmYCgw")

try:
    # Wait until the video title is present in the DOM
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h1.title"))
    )
    print("YouTube video page loaded successfully.")

except Exception as e:
    # Handle page loading errors
    print("Error loading YouTube video:", e)


YouTube video page loaded successfully.


In [6]:
# Use a set to avoid duplicate comments
comments = set()

# Scroll page multiple times to load more comments
for i in range(15):
    driver.execute_script("window.scrollBy(0, 1000);")
    time.sleep(3)

    try:
        # Locate all visible comment texts
        elements = driver.find_elements(By.CSS_SELECTOR, "#content-text")
        before_count = len(comments)

        for el in elements:
            text = el.text.strip()
            if len(text) > 10:  # Filter very short comments
                comments.add(text)

        after_count = len(comments)
        print(f"Iteration {i+1}: Added {after_count - before_count} new comments")

    except Exception as e:
        print(f"Error in iteration {i+1}: {e}")

print(f"Total collected comments: {len(comments)}")

# Convert to DataFrame
df = pd.DataFrame(list(comments), columns=["comment"])
df["date"] = datetime.now().date()

# Filter comments related to iPhone 17
df_filtered = df[df["comment"].str.contains(
    r"(iphone\s*17|آیفون\s*17)",
    case=False,
    regex=True
)]

# Save final dataset
df_filtered.to_csv("youtube_iphone17_comments.csv", index=False)




Iteration 1: Added 20 new comments
Iteration 2: Added 0 new comments
Iteration 3: Added 20 new comments
Iteration 4: Added 0 new comments
Iteration 5: Added 20 new comments
Iteration 6: Added 0 new comments
Iteration 7: Added 0 new comments
Iteration 8: Added 14 new comments
Iteration 9: Added 0 new comments
Iteration 10: Added 0 new comments
Iteration 11: Added 0 new comments
Iteration 12: Added 0 new comments
Iteration 13: Added 0 new comments
Iteration 14: Added 0 new comments
Iteration 15: Added 0 new comments
Total collected comments: 74
Collected 8 comments about iPhone 17


  df_filtered = df[df["comment"].str.contains(r"(iphone\s*17|آیفون\s*17)", case=False, regex=True)]


Unnamed: 0,comment,date
5,Sitting here watching on my 13 mini with 71% b...,2025-12-24
7,The best base iPhone Apple has ever released. ...,2025-12-24
22,Currently switching from my 13 mini to an iPho...,2025-12-24
28,Why did you get the black iPhone 17 when you h...,2025-12-24
39,Traded in my 13 Pro Max and got $320 off a 512...,2025-12-24


In [9]:
# Convert collected comments to DataFrame (raw data)
df = pd.DataFrame(list(comments), columns=["content"])

# Add data collection date
df["date"] = datetime.now().date()

# Define output file path for raw data
output_path = "youtube_raw.csv"

# Save raw data to CSV
df.to_csv(output_path, index=False)

# Log successful save
print(f"Saved {len(df)} posts to {output_path}")

# Preview data
df.head()



Saved 74 posts to youtube_raw.csv


Unnamed: 0,content,date
0,Be careful with how many treats the cat gets.....,2025-12-24
1,I’m hoping they bring back the plus. Im using ...,2025-12-24
2,"Got my I Phone 17 2 days ago, my battery exper...",2025-12-24
3,Watching on my 17.,2025-12-24
4,"Adding 12 GB of RAM, studio-quality mics, and ...",2025-12-24
