# Carpet_rug_cleaners

# Web scraping

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

# Function to extract cleaned domain from a URL
def get_cleaned_domain(url):
    """Return the bare host name of *url*.

    Leading/trailing whitespace, a leading ``http://`` / ``https://`` / ``//``
    prefix (matched case-insensitively) and a leading ``www.`` label are
    removed, then everything from the first ``/``, ``?`` or ``#`` onward is
    cut off.  The remaining text is returned with its original letter case.

    Args:
        url: URL or host string; may be ``None`` or empty.

    Returns:
        The cleaned host, or ``''`` when *url* is falsy.
    """
    if not url:
        return ''

    host = url.strip()

    # Strip the scheme only when it is a true prefix; a blanket replace()
    # would also eat matching text that appears later in the URL.
    lowered = host.lower()
    for prefix in ('http://', 'https://', '//'):
        if lowered.startswith(prefix):
            host = host[len(prefix):]
            break

    # Likewise drop "www." only as the leading label, so hosts such as
    # "mywww.example.com" are not mangled into "myexample.com".
    if host.lower().startswith('www.'):
        host = host[len('www.'):]

    # Path, query string and fragment are not part of the domain.
    for separator in ('/', '?', '#'):
        host = host.split(separator, 1)[0]

    return host

# Base URL of the listing to scrape; the pagination loop appends "?page=N" to it
base_url = "https://www.superpages.com/new-york-ny/carpet-rug-cleaners"

# Module-level accumulator: scrape_page() appends one dict per listing to this list
data_list = []

# Function to scrape a single page
def scrape_page(url, timeout=30):
    """Scrape one results page and append each listing to ``data_list``.

    Downloads *url*, parses the HTML and, for every element matching
    ``.organic .result``, appends a dict with the keys ``Name``,
    ``Cleaned Domain``, ``Phone Number``, ``Address`` and ``Keywords`` to the
    module-level ``data_list``.  A text field whose element is missing is
    stored as ``'N/A'``; ``Keywords`` is an empty list when no category links
    are found.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests.get`` may block indefinitely.

    Raises:
        requests.HTTPError: If the response carries an error status code.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = soup.select('.organic .result')

    for listing in listings:
        name_element = listing.select_one('.business-name')
        name = name_element.get_text(strip=True) if name_element else 'N/A'

        domain_element = listing.select_one('.weblink-button')
        if domain_element and domain_element.has_attr('href'):
            domain = get_cleaned_domain(domain_element['href'])
        else:
            domain = 'N/A'

        # Keep digits only.  The 'N/A' placeholder is applied *after* the
        # digit filter; filtering the placeholder itself would turn it
        # into an empty string.
        phone_element = listing.select_one('.phone')
        phone_text = phone_element.get_text(strip=True) if phone_element else ''
        phone = re.sub(r'\D', '', phone_text) or 'N/A'

        address_element = listing.select_one('.adr')
        address = address_element.get_text(strip=True) if address_element else 'N/A'

        # An empty selection already yields an empty list.
        keywords = [
            keyword.get_text(strip=True)
            for keyword in listing.select('.categories a')
        ]

        data_list.append({
            'Name': name,
            'Cleaned Domain': domain,
            'Phone Number': phone,
            'Address': address,
            'Keywords': keywords,
        })

# Walk the result pages in order and stop after the first page that
# contributes no new rows to data_list.
page_number = 0
found_new_rows = True
while found_new_rows:
    page_number += 1
    page_url = f"{base_url}?page={page_number}"
    print(f"Scraping page {page_number}: {page_url}")

    previous_data_length = len(data_list)
    scrape_page(page_url)
    found_new_rows = len(data_list) != previous_data_length

# Build a table from the collected records
df = pd.DataFrame(data_list)

# Persist the table twice: JSON as a list of records, CSV without the index
df.to_json('carpet_rug_cleaners.json', orient='records', indent=4)
df.to_csv('carpet_rug_cleaners.csv', index=False)

print("Data successfully scraped and saved to 'carpet_rug_cleaners.json' and 'carpet_rug_cleaners.csv'")

Scraping page 1: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=1
Scraping page 2: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=2
Scraping page 3: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=3
Scraping page 4: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=4
Scraping page 5: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=5
Scraping page 6: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=6
Scraping page 7: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=7
Scraping page 8: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=8
Scraping page 9: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=9
Scraping page 10: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=10
Scraping page 11: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=11
Scraping page 12: https://www.superpages.com/new-york-ny/carpet-rug-cleaners?page=12

# Data cleaning

In [1]:
import pandas as pd

In [3]:
# Load the scraped table back from the CSV written by the scraping step.
# NOTE(review): read_csv treats the string 'N/A' as missing by default, so the
# scraper's 'N/A' placeholders presumably come back as NaN here — confirm.
data = pd.read_csv('carpet_rug_cleaners.csv')
# Preview the first five rows
data.head(5)

Unnamed: 0,Name,Cleaned Domain,Phone Number,Address,Keywords
0,Blue Chip Building Maintenance Inc,bluechipclean.com,2125640100,"242 W 30th St Rm 700, New York, NY, 10001","['Carpet & Rug Cleaners', 'Janitorial Service'..."
1,Delmont Carpet Cleaning Inc,delmontcleanny.com,9292434945,"1636 3rd Ave, New York, NY, 10128","['Carpet & Rug Cleaners', 'Upholstery Cleaners..."
2,All Pro Cleaning & Restoration,allprorestoration.com,9143726244,"13 Haven St, Elmsford, NY, 10523","['Carpet & Rug Cleaners', 'Cleaning Contractor..."
3,Delmont Carpet Cleaning Inc,api.superpages.com,6464026590,"New York, NY, 10128",['Carpet & Rug Cleaners']
4,Cleantex - New York,cleantexny.com,2122831200,"3711 48th Ave, Long Island City, NY, 11101","['Carpet & Rug Cleaners', 'Cleaning Contractor..."


In [4]:
# Summarise the columns: dtype and non-null count for each
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            783 non-null    object
 1   Cleaned Domain  385 non-null    object
 2   Phone Number    783 non-null    int64 
 3   Address         783 non-null    object
 4   Keywords        783 non-null    object
dtypes: int64(1), object(4)
memory usage: 30.7+ KB


In [5]:
# Number of missing values in each column
data.isnull().sum()

Name                0
Cleaned Domain    398
Phone Number        0
Address             0
Keywords            0
dtype: int64

In [6]:
# Fraction of rows with a missing value, per column
data.isnull().sum()/len(data)

Name              0.000000
Cleaned Domain    0.508301
Phone Number      0.000000
Address           0.000000
Keywords          0.000000
dtype: float64

**About 51% of the values in the `Cleaned Domain` column are missing (398 of 783 rows); they are left in place rather than dropped, because dropping them would significantly affect the dataset.**

In [8]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
data.duplicated().sum()

0

**There are no duplicate rows in the dataset.**