# Assignment 3:WebScraping of  Marketing Automation Software Using Selenium

In [89]:
#MAIN CODE

import csv
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to your ChromeDriver executable
driver_path = '/Users/mouriya/Downloads/chromedriver-mac-arm64/chromedriver'
# Create a Service object with the path to ChromeDriver
service = Service(driver_path)

# Initialize the WebDriver with the service object
driver = webdriver.Chrome(service=service)

# Open the website
website = 'https://www.g2.com/categories/marketing-automation'
driver.get(website)

data = []

try:
    # Wait for the elements to be visible
    product_elements = WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.product-card'))
    )

    # Iterate over each product listing element
    for product_element in product_elements:
        try:
            # Extract software name
            product_name = product_element.find_element(By.CSS_SELECTOR, '.product-card__product-name a div').text

            # Extract users
            try:
                user_elements = product_element.find_elements(By.XPATH, './/div[contains(text(), "Users")]/following-sibling::ul/li')
                users = ', '.join([user.text for user in user_elements])
            except:
                users = 'No Users Info'

            # Extract market segment
            try:
                segment_elements = product_element.find_elements(By.XPATH, './/div[contains(text(), "Market Segment")]/following-sibling::ul/li')
                segment = ', '.join([segment.text for segment in segment_elements])
            except:
                segment = 'No Market Segment Info'

            # Extract rating
            try:
                rating_element = product_element.find_element(By.CSS_SELECTOR, '.c-midnight-90.pl-4th')
                rating = rating_element.text.strip()
            except:
                rating = 'No Rating Info'

            # Extract product link
            try:
                product_link_element = product_element.find_element(By.CSS_SELECTOR, '.product-card__product-name a')
                product_link = product_link_element.get_attribute('href')
            except:
                product_link = 'No Product Link'

            # Extract pricing information
            try:
                price_span = product_element.find_element(By.XPATH, './/span[contains(text(), "Entry Level Price:")]')
                pricing_element = price_span.find_element(By.XPATH, './following-sibling::a')
                pricing = pricing_element.text.strip()
            except:
                pricing = 'No Pricing'

            # Extract industries
            try:
                industries_elements = product_element.find_elements(By.XPATH, './/div[contains(text(), "Industries")]/following-sibling::ul/li')
                industries = ', '.join([industry.text for industry in industries_elements])
            except:
                industries = 'No Industries Info'

            # Append data to the list
            data.append({
                'Software Name': product_name,
                'Users': users,
                'Market Segment': segment,
                'Rating': rating,
                'Product Link': product_link,
                'Pricing': pricing,
                'Industries': industries
            })

            print(f"Product Name: {product_name}")
            print(f"Users: {users}")
            print(f"Market Segment: {segment}")
            print(f"Rating: {rating}")
            print(f"Product Link: {product_link}")
            print(f"Pricing: {pricing}")
            print(f"Industries: {industries}")

        except Exception as e:
            print(f"Error processing product: {e}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

# Close the browser
driver.quit()

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
df.to_csv('marketing_automation.csv', index=False, encoding='utf-8')

# Save the DataFrame as a JSON file
df.to_json('marketing_automation.json', orient='records', lines=True)


Product Name: HubSpot Marketing Hub
Users: , , Marketing Manager, Marketing Coordinator
Market Segment: , , 54% Small-Business, 40% Mid-Market
Rating: 4.4 out of 5
Product Link: https://www.g2.com/products/hubspot-marketing-hub/reviews
Pricing: No Pricing
Industries: , , Computer Software, Marketing and Advertising
Product Name: Insider
Users: , , Digital Marketing Manager, Digital Marketing Specialist
Market Segment: , , 45% Mid-Market, 29% Small-Business
Rating: 4.8 out of 5
Product Link: https://www.g2.com/products/insider/reviews
Pricing: No Pricing
Industries: , , Retail, Apparel & Fashion
Product Name: ActiveCampaign
Users: , , Owner, CEO
Market Segment: , , 93% Small-Business, 7% Mid-Market
Rating: 4.5 out of 5
Product Link: https://www.g2.com/products/activecampaign/reviews
Pricing: No Pricing
Industries: , , Marketing and Advertising, Health, Wellness and Fitness
Product Name: Braze
Users: , , CRM Manager, Marketing Manager
Market Segment: , , 58% Mid-Market, 23% Enterprise
Ra

# Data Cleaning

In [90]:
import pandas as pd

In [91]:
data= pd.read_csv('marketing_automation.csv')


In [92]:
df= data.copy()
df.head()

Unnamed: 0,Software Name,Users,Market Segment,Rating,Product Link,Pricing,Industries
0,HubSpot Marketing Hub,", , Marketing Manager, Marketing Coordinator",", , 54% Small-Business, 40% Mid-Market",4.4 out of 5,https://www.g2.com/products/hubspot-marketing-...,No Pricing,", , Computer Software, Marketing and Advertising"
1,Insider,", , Digital Marketing Manager, Digital Marketi...",", , 45% Mid-Market, 29% Small-Business",4.8 out of 5,https://www.g2.com/products/insider/reviews,No Pricing,", , Retail, Apparel & Fashion"
2,ActiveCampaign,", , Owner, CEO",", , 93% Small-Business, 7% Mid-Market",4.5 out of 5,https://www.g2.com/products/activecampaign/rev...,No Pricing,", , Marketing and Advertising, Health, Wellnes..."
3,Braze,", , CRM Manager, Marketing Manager",", , 58% Mid-Market, 23% Enterprise",4.5 out of 5,https://www.g2.com/products/braze/reviews,No Pricing,", , Financial Services, Marketing and Advertising"
4,Mailchimp All-in-One Marketing Platform,", , Owner, Marketing Manager",", , 72% Small-Business, 20% Mid-Market",4.4 out of 5,https://www.g2.com/products/mailchimp-all-in-o...,Free,", , Marketing and Advertising, Information Tec..."


In [104]:
df['Users'] = df['Users'].str.replace(', ,', '', regex=False)
df['Market Segment'] = df['Market Segment'].str.replace(', ,', '', regex=False)
df['Industries'] = df['Industries'].str.replace(', ,', '', regex=False)

In [106]:
df.head()

Unnamed: 0,Software Name,Users,Market Segment,Rating,Product Link,Pricing,Industries,User,MarketSegment
0,HubSpot Marketing Hub,"Marketing Manager, Marketing Coordinator","54% Small-Business, 40% Mid-Market",4.4 out of 5,https://www.g2.com/products/hubspot-marketing-...,No Pricing,"Computer Software, Marketing and Advertising",,"54% Small-Business, 40% Mid-Market"
1,Insider,"Digital Marketing Manager, Digital Marketing ...","45% Mid-Market, 29% Small-Business",4.8 out of 5,https://www.g2.com/products/insider/reviews,No Pricing,"Retail, Apparel & Fashion",,"45% Mid-Market, 29% Small-Business"
2,ActiveCampaign,"Owner, CEO","93% Small-Business, 7% Mid-Market",4.5 out of 5,https://www.g2.com/products/activecampaign/rev...,No Pricing,"Marketing and Advertising, Health, Wellness an...",,"93% Small-Business, 7% Mid-Market"
3,Braze,"CRM Manager, Marketing Manager","58% Mid-Market, 23% Enterprise",4.5 out of 5,https://www.g2.com/products/braze/reviews,No Pricing,"Financial Services, Marketing and Advertising",,"58% Mid-Market, 23% Enterprise"
4,Mailchimp All-in-One Marketing Platform,"Owner, Marketing Manager","72% Small-Business, 20% Mid-Market",4.4 out of 5,https://www.g2.com/products/mailchimp-all-in-o...,Free,"Marketing and Advertising, Information Technol...",,"72% Small-Business, 20% Mid-Market"


In [107]:
# Export DataFrame to CSV
df.to_csv('Maindata.csv', index=False)  

# Export DataFrame to JSON
df.to_json('Maindata.json', orient='records')


In [108]:
df.replace({'No Pricing': 'Pricing Unavailable'}, inplace=True)

In [109]:
df.head()

Unnamed: 0,Software Name,Users,Market Segment,Rating,Product Link,Pricing,Industries,User,MarketSegment
0,HubSpot Marketing Hub,"Marketing Manager, Marketing Coordinator","54% Small-Business, 40% Mid-Market",4.4 out of 5,https://www.g2.com/products/hubspot-marketing-...,Pricing Unavailable,"Computer Software, Marketing and Advertising",,"54% Small-Business, 40% Mid-Market"
1,Insider,"Digital Marketing Manager, Digital Marketing ...","45% Mid-Market, 29% Small-Business",4.8 out of 5,https://www.g2.com/products/insider/reviews,Pricing Unavailable,"Retail, Apparel & Fashion",,"45% Mid-Market, 29% Small-Business"
2,ActiveCampaign,"Owner, CEO","93% Small-Business, 7% Mid-Market",4.5 out of 5,https://www.g2.com/products/activecampaign/rev...,Pricing Unavailable,"Marketing and Advertising, Health, Wellness an...",,"93% Small-Business, 7% Mid-Market"
3,Braze,"CRM Manager, Marketing Manager","58% Mid-Market, 23% Enterprise",4.5 out of 5,https://www.g2.com/products/braze/reviews,Pricing Unavailable,"Financial Services, Marketing and Advertising",,"58% Mid-Market, 23% Enterprise"
4,Mailchimp All-in-One Marketing Platform,"Owner, Marketing Manager","72% Small-Business, 20% Mid-Market",4.4 out of 5,https://www.g2.com/products/mailchimp-all-in-o...,Free,"Marketing and Advertising, Information Technol...",,"72% Small-Business, 20% Mid-Market"


In [110]:
df.to_csv('Maindata.csv', index=False)  

# Export DataFrame to JSON
df.to_json('Maindata.json', orient='records')
