# Birmingham Movers

# Web Scraping

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

# Function to extract cleaned domain from a URL
def get_cleaned_domain(url):
    """Return the bare host name of ``url``.

    The scheme (``http://``, ``https://`` or a protocol-relative ``//``) and
    a leading ``www.`` are removed, and everything from the first ``/``,
    ``?`` or ``#`` onward is discarded.

    Parameters
    ----------
    url : str or None
        The URL to clean. Surrounding whitespace is ignored.

    Returns
    -------
    str
        The cleaned host, or ``''`` when ``url`` is empty or ``None``.
    """
    if not url:
        return ''

    cleaned = url.strip()

    # Strip the scheme only at the very start; a blanket str.replace would
    # also mangle any later occurrence (e.g. a URL embedded in a query string).
    cleaned = re.sub(r'^(?:https?:)?//', '', cleaned, flags=re.IGNORECASE)

    # The host ends at the first path, query or fragment delimiter.
    host = re.split(r'[/?#]', cleaned, maxsplit=1)[0]

    # Drop "www." only as a prefix so hosts such as "awww.example.com"
    # are left intact.
    if host.lower().startswith('www.'):
        host = host[len('www.'):]

    return host

# Base URL of the listing to scrape; the pagination loop below appends
# "?page=<n>" to it to build the URL of each results page.
base_url = "https://www.superpages.com/birmingham-al/movers"

# Module-level accumulator: scrape_page() appends one dict per listing here,
# and the collected records are converted to a DataFrame once scraping ends.
data_list = []

# Function to scrape a single page
def scrape_page(url, timeout=30):
    """Scrape one results page and append every listing to ``data_list``.

    Parameters
    ----------
    url : str
        Full URL of the results page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    None
        Each listing is appended to the module-level ``data_list`` as a dict
        with the keys 'Name', 'Cleaned Domain', 'Phone Number', 'Address'
        and 'Keywords'. Missing text fields are recorded as 'N/A'; missing
        keywords as an empty list.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # Without an explicit timeout, requests can wait forever on a stalled
    # connection and hang the whole notebook.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for listing in soup.select('.organic .result'):
        name_element = listing.select_one('.business-name')
        name = name_element.get_text(strip=True) if name_element is not None else 'N/A'

        domain_element = listing.select_one('.weblink-button')
        if domain_element is not None and domain_element.has_attr('href'):
            domain = get_cleaned_domain(domain_element['href'])
        else:
            domain = 'N/A'

        phone_element = listing.select_one('.phone')
        if phone_element is not None:
            # Keep digits only. This is applied to real phone text only, so
            # the 'N/A' sentinel is not reduced to an empty string.
            phone = re.sub(r'\D', '', phone_element.get_text(strip=True))
        else:
            phone = 'N/A'

        address_element = listing.select_one('.adr')
        address = address_element.get_text(strip=True) if address_element is not None else 'N/A'

        # select() yields an empty result when nothing matches, so this is
        # simply [] for listings without categories.
        keywords = [link.get_text(strip=True) for link in listing.select('.categories a')]

        data_list.append({
            'Name': name,
            'Cleaned Domain': domain,
            'Phone Number': phone,
            'Address': address,
            'Keywords': keywords
        })

# Walk the paginated results, stopping at the first page that yields nothing new
page_number = 1
while True:
    page_url = f"{base_url}?page={page_number}"
    print(f"Scraping page {page_number}: {page_url}")

    rows_before = len(data_list)
    scrape_page(page_url)

    # A page that contributed no listings marks the end of the results
    if len(data_list) == rows_before:
        break
    page_number += 1

# Build a DataFrame from the accumulated records
df = pd.DataFrame(data_list)

# Persist the results as both JSON and CSV
df.to_json('movers.json', orient='records', indent=4)
df.to_csv('movers.csv', index=False)

print("Data successfully scraped and saved to 'movers.json' and 'movers.csv'")

Scraping page 1: https://www.superpages.com/birmingham-al/movers?page=1
Scraping page 2: https://www.superpages.com/birmingham-al/movers?page=2
Scraping page 3: https://www.superpages.com/birmingham-al/movers?page=3
Scraping page 4: https://www.superpages.com/birmingham-al/movers?page=4
Scraping page 5: https://www.superpages.com/birmingham-al/movers?page=5
Scraping page 6: https://www.superpages.com/birmingham-al/movers?page=6
Scraping page 7: https://www.superpages.com/birmingham-al/movers?page=7
Data successfully scraped and saved to 'movers.json' and 'movers.csv'


# Data Cleaning

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Reload the scraped listings from disk and preview the first five rows
data = pd.read_csv('movers.csv')
data.head()

Unnamed: 0,Name,Cleaned Domain,Phone Number,Address,Keywords
0,New Latitude Movers,newlatitudemovers.com,2058553737,"130 Industrial Drive, Birmingham, AL, 35211","['Movers', 'Relocation Service']"
1,Oakdale Moving & Storage,,8446814827,Serving the Birmingham Area,"['Movers', 'Movers-Commercial & Industrial', '..."
2,AIR 7 SEAS Transport Logistics Inc,air7seas.com,8884048124,Serving the Birmingham Area,"['Movers', 'Customs Consultants', 'Moving Serv..."
3,We Will Transport It,wewilltransportit.com,8443157733,Serving the Birmingham Area,"['Movers', 'Boat Equipment & Supplies', 'Trans..."
4,Good Moves Moving Systems Inc,,2054108527,"5821 Walnut Grove Rd, Birmingham, AL, 35215","['Movers', 'Movers & Full Service Storage', 'M..."


In [5]:
# Summarise column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            173 non-null    object
 1   Cleaned Domain  89 non-null     object
 2   Phone Number    173 non-null    int64 
 3   Address         173 non-null    object
 4   Keywords        173 non-null    object
dtypes: int64(1), object(4)
memory usage: 6.9+ KB


In [6]:
# Number of missing values in each column
data.isna().sum()

Name               0
Cleaned Domain    84
Phone Number       0
Address            0
Keywords           0
dtype: int64

In [7]:
# Fraction of missing values in each column
data.isna().sum().div(len(data))

Name              0.000000
Cleaned Domain    0.485549
Phone Number      0.000000
Address           0.000000
Keywords          0.000000
dtype: float64

**About 48.6% of the values in the `Cleaned Domain` column are null.
I am not dropping these rows: doing so would discard nearly half of the listings, and a missing domain most likely just means the listing has no website link.**

In [9]:
# Count rows that are exact duplicates of an earlier row
data.duplicated().sum()

0

**There are no duplicate rows in the dataset.**