# Import Required Libraries

In [11]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os

# Twitter Login Credentials

In [21]:
# Read the credentials from the environment so real secrets are never saved in
# the notebook; the placeholders are used only when the variables are unset.
username = os.environ.get("TWITTER_USERNAME", "your_username")
password = os.environ.get("TWITTER_PASSWORD", "your_password")

In [22]:
# Path of the input CSV file that is read (and later rewritten) by the driver cell.
csv_name = "your_csv_name.csv"

# Function to Log in to Twitter

In [14]:
def twitter_login(username, password, driver):
    """Submit the given credentials on the Twitter login page.

    Args:
        username: Text typed into the ``text`` input of the login form.
        password: Text typed into the ``password`` input of the login form.
        driver: Selenium WebDriver instance used to drive the browser.

    Returns:
        bool: True if both fields were found and submitted without an error,
        False if any step raised (the error is printed).
    """
    # Navigate to Twitter login page
    driver.get("https://twitter.com/login")

    try:
        # Wait for the username input to be clickable and input the username
        username_field = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@name='text']"))
        )
        username_field.send_keys(username)
        username_field.send_keys(Keys.RETURN)

        # Wait for the password input to be clickable and input the password
        password_field = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@name='password']"))
        )
        password_field.send_keys(password)
        password_field.send_keys(Keys.RETURN)

        # Fixed pause that gives the post-login page time to load.
        time.sleep(10)  # Adjust as needed for the login process to complete

    except Exception as e:
        print(f"Error during login: {e}")
        # Report the failure to the caller instead of only printing it.
        return False

    # NOTE(review): this only means the form was submitted without a Selenium
    # error; the page is not inspected to confirm the session is authenticated.
    print("Login successful!")
    return True

# Function to Scrape Comments from a Tweet

In [19]:
def _split_iso_datetime(iso_datetime):
    """Split a timestamp such as ``2024-01-02T03:04:05.000Z`` into ``(date, time)``.

    Fractional seconds and a trailing ``Z`` are removed from the time part, and
    any ``/`` in the date part is replaced with ``-``.

    Raises:
        ValueError: If ``iso_datetime`` does not contain exactly one ``T``.
    """
    date_part, time_part = iso_datetime.split('T')
    # Splitting on '.' only drops the 'Z' when fractional seconds are present,
    # so strip it explicitly for values such as "03:04:05Z".
    time_part = time_part.split('.')[0].rstrip('Z')
    return date_part.replace("/", "-"), time_part


def scrape_tweet_comments(tweet_url, original_post_text, driver):
    """Scrape the replies shown on a tweet page into a per-tweet CSV file.

    The output file is ``<user_profile>/<tweet_id>_<main_post_date>.csv``, where
    the profile and tweet ID are taken from ``tweet_url``. The page is scrolled
    in 1000px steps until the scroll position stops changing.

    Args:
        tweet_url: URL of the tweet; element 3 of the '/'-split URL is used as
            the profile name and the last element (minus query/fragment) as
            the tweet ID.
        original_post_text: Text of the original post, written unchanged into
            every output row.
        driver: Selenium WebDriver instance used to load and scroll the page.

    Returns:
        bool: True if the page was scraped without an error escaping the
        per-comment handler, False otherwise (the error is printed).
    """
    try:
        # Extract the user profile and tweet ID from the URL
        url_parts = tweet_url.split('/')
        user_profile = url_parts[3]
        tweet_id = url_parts[-1].split('?')[0].split('#')[0]  # Remove any query parameters or hash fragments

        # Create a folder named after the user profile to store results
        os.makedirs(user_profile, exist_ok=True)

        # Navigate to the tweet's page
        driver.get(tweet_url)

        # Wait for the tweet page to load
        time.sleep(17)

        # The post text is passed in as original_post_text. To read it from the
        # page instead, uncomment these lines and drop the parameter:
        # original_post_text = WebDriverWait(driver, 20).until(
        #   EC.presence_of_element_located((By.XPATH, '//article[@data-testid="tweet"]//div[@data-testid="tweetText"]'))
        # ).text

        # Get the main post's datetime and split into date and time
        main_post_datetime = driver.find_element(By.XPATH, '//article[@data-testid="tweet"]//time').get_attribute('datetime')
        main_post_date, main_post_time = _split_iso_datetime(main_post_datetime)

        # Create a unique filename using tweet ID and date
        output_filename = os.path.join(user_profile, f"{tweet_id}_{main_post_date}.csv")

        # Open the CSV file in write mode
        with open(output_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Write the header
            writer.writerow(["tweet_link", "post_text", "main_post_date", "main_post_time", "comment_user_id", "comment_text", "comment_date", "comment_time"])

            last_position = driver.execute_script("return window.pageYOffset;")

            # Keys of the comments that have already been written to the file
            seen_comments = set()

            while True:
                # Scrape comments
                comments = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')

                # Skip the first element since it is the original post
                # NOTE(review): assumes the first article in the DOM is always
                # the original post, including after scrolling -- confirm.
                for comment in comments[1:]:
                    try:
                        comment_datetime = comment.find_element(By.XPATH, './/time').get_attribute('datetime')
                        user_id = comment.find_element(By.XPATH, './/div[@dir="ltr"]//span').text
                        tweet_text_element = comment.find_elements(By.XPATH, './/div[@data-testid="tweetText"]')

                        # Check if there is text content, otherwise handle media or retweets
                        if tweet_text_element:
                            comment_text = tweet_text_element[0].text
                        else:
                            comment_text = "[No text found, possibly media or retweet]"

                        # The timestamp alone is not unique (two replies can
                        # share the same second), so key on author and text too.
                        comment_key = (comment_datetime, user_id, comment_text)
                        if comment_key in seen_comments:
                            continue

                        # Split comment datetime into date and time
                        comment_date, comment_time = _split_iso_datetime(comment_datetime)

                        # Print the user ID and comment text to the console
                        print(f"User ID: {user_id}, Comment: {comment_text}")

                        # Write the data to the CSV file
                        writer.writerow([tweet_url, original_post_text, main_post_date, main_post_time, user_id, comment_text, comment_date, comment_time])

                        # Mark as seen only after a successful write, so a
                        # comment whose extraction failed is retried on the
                        # next pass instead of being lost.
                        seen_comments.add(comment_key)

                    except Exception as e:
                        print(f"Error extracting comment: {e}")

                # Scroll down the page to load more comments
                driver.execute_script("window.scrollBy(0, 1000);")  # Adjust scroll amount as needed
                time.sleep(3)  # Adjust time to wait for comments to load

                # Stop scrolling once the page no longer moves
                new_position = driver.execute_script("return window.pageYOffset;")
                if new_position == last_position:
                    break
                last_position = new_position

    except Exception as e:
        print(f"Error scraping tweet {tweet_url}: {e}")
        return False

    return True  # Return True if scraping was successful

# Log In, Then Read and Write the CSV Files

In [18]:
# Initialize the WebDriver
driver = webdriver.Chrome()

# Everything that uses the browser runs inside try/finally so the browser is
# closed even when login, scraping or CSV handling raises.
try:
    # Login to Twitter
    twitter_login(username, password, driver)

    #*****
    with open(csv_name, mode='r', newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.reader(csvfile)
        # None when the input file is empty, instead of raising StopIteration
        header = next(reader, None)
        rows_to_keep = []

        with open('processed_tweets.csv', mode='a', newline='', encoding='utf-8-sig') as processed_file:
            writer = csv.writer(processed_file)

            for row in reader:
                # Blank or short rows have no URL/text columns; keep them
                # untouched in the original file rather than crashing on them.
                if len(row) < 3:
                    rows_to_keep.append(row)
                    continue

                tweet_url = row[1]  # Assuming the tweet URL is in the second column you can change accordingly
                original_post_text = row[2]  # Assuming the tweet text is in the third column you can change accordingly

                # Attempt to scrape comments for the tweet
                if scrape_tweet_comments(tweet_url, original_post_text, driver):
                    # Scraped successfully: record the row as processed
                    writer.writerow(row)
                else:
                    # Scraping failed: leave the row in the original file
                    rows_to_keep.append(row)

    # Overwrite Original CSV File with Unprocessed Rows
    # ----------------------------
    with open(csv_name, mode='w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        if header is not None:
            writer.writerow(header)
        writer.writerows(rows_to_keep)

    #*****

finally:
    # Close the Browser
    # ----------------------------
    driver.quit()

Login successful!
User ID: vuwevuwe, Comment: Yg lain cs nya gercep, ini lama bgt. Rata rata 5 menit 40 detik sekali bales chat.  Udh ketolong sama pas pertama itu, setelah itu 7 menit keatas sekali bales. M702 KODE NYA
User ID: AXIS, Comment: Halo, Kak. Maaf yaa atas keterlambatan responsnya. Mohon kirimkan nomor kakak dan nomor laporan kembali ke AXIS Care melalui aplikasi AXISnet atau LiveChat di http://axis.co.id/LC yaa. Kakak juga bisa menghubungi call center di 838. Terima kasih 
User ID: Martini Novalina, Comment: Dari kemarin sampai sekarang ketika saya isi kuata bronet di axisnet  masih aja di bilang terjadi kendala ketika aktivasi,trus dari kemarin juga  datang pesan. permintaan aktivasi sedang di proses sampai sekarang gak adatuh bisa saya isi paket,tolong dong min,saya udh isi kuota
User ID: AXIS, Comment: Halo, Kak. Maaf yaa atas kendala yang dialami. Kakak bisa kirimkan nomor kakak dan detail pembelian tersebut ke AXIS Care di aplikasi AXISnet atau melalui fitur LiveChat 

In [20]:
"""
with open('XYZ.csv', mode='r', newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip the header if there is one
    for row in reader:
        tweet_url = row[1]  # Assuming the tweet URL is in the second column
        original_post_text = row[2]  # Assuming the tweet text is in the third column
        scrape_tweet_comments(tweet_url, original_post_text)
"""
#If you donot want above section of csv write and read to delete the processed tweets from original file and write in new processed_tweets csv file 
#you can simple change above cell code start from (#*****) to (#*****) with this code.
#this code simply process all rows from csv file without affecting the original csv file.

"\nwith open('XYZ.csv', mode='r', newline='', encoding='utf-8-sig') as csvfile:\n    reader = csv.reader(csvfile)\n    next(reader, None)  # Skip the header if there is one\n    for row in reader:\n        tweet_url = row[1]  # Assuming the tweet URL is in the second column\n        original_post_text = row[2]  # Assuming the tweet text is in the third column\n        scrape_tweet_comments(tweet_url, original_post_text)\n"