In [3]:
pip install selenium beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [1]:
import csv
import getpass
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Function to scrape post URLs from the user's profile
def scrape_profile_posts(driver, username, scroll_pause=3, max_posts=None):
    """Collect the post URLs shown on an Instagram profile page.

    Loads ``https://www.instagram.com/<username>/`` and keeps scrolling to the
    bottom of the page until the document height stops growing, harvesting
    every anchor whose ``href`` contains ``/p/`` before the first scroll and
    after each subsequent one.

    Args:
        driver: Selenium WebDriver (any object exposing ``get``,
            ``execute_script`` and ``find_elements`` works).
        username: Profile handle appended to the Instagram base URL.
        scroll_pause: Seconds to wait after the initial load and after every
            scroll so newly requested content has time to appear.
        max_posts: Optional cap. Scrolling stops as soon as this many URLs are
            known and at most this many are returned. ``None`` (default)
            scrolls until the page height no longer changes.

    Returns:
        list[str]: Unique post URLs in the order they were first seen in the
        page, so slicing the result is deterministic.
    """
    driver.get("https://www.instagram.com/" + username + "/")
    time.sleep(scroll_pause)

    # Dict keys de-duplicate like a set but remember first-seen order; a plain
    # set would hand the caller the URLs in arbitrary order.
    post_urls = {}

    def harvest_visible_links():
        """Add every post link currently present in the DOM to ``post_urls``."""
        try:
            post_links = driver.find_elements(By.XPATH, '//a[contains(@href, "/p/")]')
        except Exception as exc:  # best effort: report and keep what we have
            print(f"Could not list post links for {username}: {exc}")
            return
        for link in post_links:
            try:
                href = link.get_attribute('href')
            except Exception:
                # A single element that disappeared mid-iteration must not
                # discard the remaining links of this pass.
                continue
            if href:  # get_attribute may return None
                post_urls.setdefault(href, None)

    # Harvest once before scrolling so links that are only present at the top
    # of the page are not lost if the page drops them from the DOM later.
    harvest_visible_links()
    last_height = driver.execute_script("return document.body.scrollHeight")

    while max_posts is None or len(post_urls) < max_posts:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)  # Wait for the page to load

        harvest_visible_links()

        # Stop when scrolling no longer makes the page taller
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    urls = list(post_urls)
    return urls if max_posts is None else urls[:max_posts]

# Function to scrape comments for a given post
def scrape_comments(driver, post_url, scroll_rounds=3, scroll_pause=2):
    """Scrape the visible comments of a single post.

    Opens ``post_url``, scrolls to the bottom ``scroll_rounds`` times to let
    more comments load, then reads every ``div._a9zr`` comment container.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        post_url: URL of the post to open.
        scroll_rounds: How many times to scroll to the bottom of the page
            before reading the comments.
        scroll_pause: Seconds to wait after the page load and after each
            scroll.

    Returns:
        list[dict]: One ``{'username': ..., 'comment': ...}`` dict per comment
        whose author and text could both be located. Comments that cannot be
        parsed are skipped.
    """
    driver.get(post_url)
    time.sleep(scroll_pause)

    for _ in range(scroll_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # behaviour while still letting Ctrl-C / kernel interrupts through.
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, 'div._a9zr')
    except Exception as exc:
        print(f"Could not read comments for {post_url}: {exc}")
        return []

    comments_data = []
    for comment in comments:
        try:
            username = comment.find_element(By.CSS_SELECTOR, 'h3._a9zc a').text
            text = comment.find_element(By.CSS_SELECTOR, 'div._a9zs > span._ap3a').text
        except Exception:
            # Containers without the expected author/text nodes are skipped.
            continue
        comments_data.append({'username': username, 'comment': text})

    return comments_data

# Function to scrape likes and hashtags from a post
def scrape_post_details(driver, post_url, load_pause=3):
    """Scrape likes, hashtags and comments for one post.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        post_url: URL of the post to open.
        load_pause: Seconds to wait after opening the post before querying it.

    Returns:
        dict with three keys:
            ``'likes'``: first space-separated token of the likes element's
                text, or the string ``"Not Available"`` if it cannot be read.
            ``'hashtags'``: list of the texts of all ``/explore/tags/`` links,
                or the string ``"Not Available"`` if the lookup fails.
            ``'comments'``: whatever ``scrape_comments`` returns for the post.
    """
    driver.get(post_url)
    time.sleep(load_pause)  # Wait for the post page to load

    details = {}

    # ``except Exception`` keeps the "Not Available" fallbacks but, unlike a
    # bare ``except``, does not swallow KeyboardInterrupt / SystemExit.
    try:
        likes_element = driver.find_element(
            By.XPATH,
            '//span[contains(@class, "xdj266r")]/ancestor::span[contains(@class, "x193iq5w")]/span',
        )
        details['likes'] = likes_element.text.split(' ')[0]  # keep only the count token
    except Exception:
        details['likes'] = "Not Available"

    try:
        hashtag_elements = driver.find_elements(By.XPATH, '//a[contains(@href, "/explore/tags/")]')
        details['hashtags'] = [hashtag.text for hashtag in hashtag_elements]
    except Exception:
        details['hashtags'] = "Not Available"

    # scrape_comments navigates to post_url again itself, so likes and
    # hashtags are read first, before that call navigates the driver again.
    details['comments'] = scrape_comments(driver, post_url)

    return details

# Read Instagram IDs from a CSV file
def read_instagram_ids_from_csv(file_path):
    """Read Instagram usernames from the first column of a CSV file.

    Blank lines and rows whose first cell is empty are skipped, and
    surrounding whitespace is stripped from each username. A UTF-8 byte-order
    mark at the start of the file is ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[str]: Usernames in file order. If the file cannot be opened or
        decoded, a warning is printed and the usernames read so far (possibly
        none) are returned instead of raising.
    """
    instagram_ids = []
    try:
        # newline='' is what the csv module expects; utf-8-sig also accepts
        # plain UTF-8 and drops a leading BOM.
        with open(file_path, mode='r', newline='', encoding='utf-8-sig') as file:
            for row in csv.reader(file):
                # csv.reader yields [] for a blank line; indexing it would
                # raise and used to abort the whole read silently.
                if not row:
                    continue
                username = row[0].strip()
                if username:
                    instagram_ids.append(username)
    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        print(f"Could not read Instagram IDs from {file_path}: {exc}")
    return instagram_ids

# --- Configuration -----------------------------------------------------------
PROFILES_CSV = 'fitness_vloggers.csv'  # CSV with one Instagram username per row
MAX_POSTS_PER_PROFILE = 10             # how many posts to scrape for each profile

# Credentials are taken from the environment, falling back to an interactive
# prompt, so that no account secret is stored in the notebook itself.
ig_username = os.environ.get('INSTAGRAM_USERNAME') or input('Instagram username: ')
ig_password = os.environ.get('INSTAGRAM_PASSWORD') or getpass.getpass('Instagram password: ')

all_profiles_data = []

# Initialize WebDriver
driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get("https://www.instagram.com")

    # Instagram login (best effort: scraping continues even if it fails)
    time.sleep(5)
    try:
        username_input = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input')
        username_input.send_keys(ig_username)

        password_input = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input')
        password_input.send_keys(ig_password)

        login_button = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button')
        login_button.click()

        # Wait for login to complete
        time.sleep(5)
    except Exception as exc:
        print(f"Login step failed; continuing without a confirmed login: {exc}")

    # Read Instagram profiles from a CSV file
    instagram_ids = read_instagram_ids_from_csv(PROFILES_CSV)
    if not instagram_ids:
        print(f"No Instagram IDs found in {PROFILES_CSV}; nothing to scrape.")

    for username in instagram_ids:
        # Scrape post URLs from the profile
        post_urls = scrape_profile_posts(driver, username)

        post_data = {}

        # Scrape details for each post
        for index, post_url in enumerate(post_urls[:MAX_POSTS_PER_PROFILE]):
            details = scrape_post_details(driver, post_url)
            post_data[f'post_{index + 1}'] = {
                "post_url": post_url,
                **details  # Include likes, hashtags, and comments
            }

        profile_data = {"posts": post_data, "username": username}

        all_profiles_data.append(profile_data)

        time.sleep(3)  # brief pause between profiles
finally:
    # Always close the browser, even if scraping is interrupted or fails.
    driver.quit()

def save_to_csv(data, csv_file):
    """Write the scraped profile data to a CSV file, one row per post.

    Args:
        data: Iterable of profile dicts shaped like
            ``{'username': str, 'posts': {post_number: post_info}}`` where
            ``post_info`` may hold ``'post_url'``, ``'likes'``, ``'hashtags'``
            (list of strings or a placeholder string) and ``'comments'``
            (list of ``{'username', 'comment'}`` dicts or a placeholder string).
        csv_file: Path of the CSV file to create or overwrite (UTF-8).

    Missing values are written as ``'N/A'`` for the username, URL and likes
    columns and as an empty string for hashtags and comments.
    """
    # Define the headers for the CSV
    headers = ['username', 'post_number', 'post_url', 'likes', 'hashtags', 'comments']

    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()

        for profile in data:
            username = profile.get('username', 'N/A')

            posts = profile.get('posts', {})
            for post_num, post_info in posts.items():
                # A placeholder string such as "Not Available" must be written
                # as-is; str.join on a string would join its characters.
                hashtags = post_info.get('hashtags') or []
                if not isinstance(hashtags, str):
                    hashtags = ', '.join(hashtags)

                comments = post_info.get('comments') or []
                if not isinstance(comments, str):
                    comments = '; '.join(
                        f"{comment['username']}: {comment['comment']}" for comment in comments
                    )

                # Write each row corresponding to a post
                writer.writerow({
                    'username': username,
                    'post_number': post_num,
                    'post_url': post_info.get('post_url', 'N/A'),
                    'likes': post_info.get('likes', 'N/A'),
                    'hashtags': hashtags,
                    'comments': comments
                })

# Persist everything that was scraped: one CSV row per post.
save_to_csv(
    data=all_profiles_data,
    csv_file='instagram_profiles_full_data.csv',
)


In [4]:
from IPython.display import FileLink

# Show a link to the exported CSV as this cell's output.
FileLink('instagram_profiles_full_data.csv')