In [2]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.15.2-py3-none-any.whl (10.2 MB)
     --------------------------------------- 10.2/10.2 MB 27.1 MB/s eta 0:00:00
Collecting trio~=0.17
  Downloading trio-0.23.1-py3-none-any.whl (448 kB)
     ------------------------------------- 448.3/448.3 kB 13.7 MB/s eta 0:00:00
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting sniffio>=1.3.0
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.3-py3-none-any.whl (14 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.3/58.3 kB 3.0 MB/s eta 0:00:00
Installing collected packages: sniffio, outcome, h11, exceptiongroup, wsproto, trio, trio-websocket, seleni

In [14]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service

In [16]:
service = Service(executable_path=r'C:\Users\Guillaume\Desktop\chromedriver-win64\chromedriver.exe')
data = []
youtube_video_url = "https://www.youtube.com/watch?v=kuhhT_cBtFU&t=2s"

with webdriver.Chrome(service=service) as driver:
    wait = WebDriverWait(driver, 15)
    driver.get(youtube_video_url)

    # Scroll to load comments
    for item in range(200):  # Adjust the range as needed to load more comments
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
        time.sleep(2)  # Adjust timing based on your internet speed and response time

    # Collect comment data
    comments = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//ytd-comment-thread-renderer")))
    
    for comment in comments:
        # Username
        try:
            author = comment.find_element(By.XPATH, ".//span[contains(text(),'@')]").text.strip()
        except NoSuchElementException:  # In case there is no username found
            author = "Unknown"

        # Comment text
        try:
            comment_text = comment.find_element(By.ID, 'content-text').text.strip()
        except NoSuchElementException:  # In case there is no text found
            comment_text = "No Text"

        # Comment date
        try:
            comment_date = comment.find_element(By.CSS_SELECTOR, "a.yt-simple-endpoint.style-scope.yt-formatted-string").text.strip()
        except NoSuchElementException:  # In case there is no date found
            comment_date = "Unknown Date"

        # Likes
        try:
            comment_likes = comment.find_element(By.ID, 'vote-count-middle').get_attribute('aria-label')
        except NoSuchElementException:  # In case there is no like count found
            comment_likes = '0'

        # Aggregate data in a dictionary and append to list
        data.append({
            'username': author,
            'comment': comment_text,
            'date': comment_date,
            'likes': comment_likes,
        })
        
        

In [33]:
# Create DataFrame and save to CSV
df = pd.DataFrame(data, columns=['username', 'comment', 'date', 'likes'])

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,username,comment,date,likes
0,@jackjolly789,I'm not from the US but I sure as hell know no...,il y a 3 ans,"160 ""J'aime"""
1,@datofficial6062,"We dont need his ""help to understand what we'r...",il y a 3 ans,"289 ""J'aime"""
2,@panthercub10,You know it odd but the more this happens the ...,il y a 3 ans,"1 mention ""J'aime"""
3,@jhartmggc,It’s tragic how this all unfolded. Unfortunate...,il y a 3 ans,
4,@DenverEight,"I remember in Captain Planet, Mati’s power of ...",il y a 3 ans,


In [24]:
# Save to CSV file
df.to_csv('youtube_comments.csv', index="username")

In [35]:
# Convert 'likes' column to string to use string methods
df['likes'] = df['likes'].astype(str)

# Now extract just the digits
df['likes'] = df['likes'].str.extract('(\d+)')[0]  # Extract numbers and get the first match series

# Convert 'likes' column to integer
df['likes'] = pd.to_numeric(df['likes'], errors='coerce').fillna(0).astype(int)  # Fill NaN with 0 and convert to int

# Handle transformation for the 'date' column - replacing "il y a" with "ago" and "ans" with "years"
# Using a regex replacement to handle both "an" (singular) and "ans" (plural)
df['date'] = df['date'].str.replace(r'il y a (\d+) (an|ans)', r'\1 years ago', regex=True)


In [36]:
# Now, the DataFrame should have 'date' in the format "X years ago" and 'likes' as integers
df.head()

Unnamed: 0,username,comment,date,likes
0,@jackjolly789,I'm not from the US but I sure as hell know no...,3 years agos,160
1,@datofficial6062,"We dont need his ""help to understand what we'r...",3 years agos,289
2,@panthercub10,You know it odd but the more this happens the ...,3 years agos,1
3,@jhartmggc,It’s tragic how this all unfolded. Unfortunate...,3 years agos,0
4,@DenverEight,"I remember in Captain Planet, Mati’s power of ...",3 years agos,0


In [25]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('youtube_comments_cleaned.csv', index=False)

In [27]:
# Display the cleaned DataFrame
df.head()

# Save to CSV file with the cleaned data

Unnamed: 0,username,comment,date,likes
0,@jackjolly789,I'm not from the US but I sure as hell know no...,3 ans years ago,160
1,@datofficial6062,"We dont need his ""help to understand what we'r...",3 ans years ago,289
2,@panthercub10,You know it odd but the more this happens the ...,3 ans years ago,1
3,@jhartmggc,It’s tragic how this all unfolded. Unfortunate...,3 ans years ago,0
4,@DenverEight,"I remember in Captain Planet, Mati’s power of ...",3 ans years ago,0
