# Uber Ride Optimization - Alternative Scraping Methods

This notebook provides alternative methods to scrape or access Uber data, addressing issues like 406 status codes due to anti-bot measures. We avoid modifying the original `Data_Scrapping.ipynb`.

## Alternatives Covered:
1. **Uber Developer API**: Official and ethical way to get ride estimates (requires API key).
2. **Undetected-Chromedriver**: Enhanced Selenium to bypass bot detection.
3. **Direct HTTP Requests**: Mimic browser requests with proper headers.

Install dependencies: `pip install requests undetected-chromedriver selenium pandas numpy beautifulsoup4`

## 1. Undetected-Chromedriver

Uses undetected-chromedriver to make Selenium less detectable. Install: `pip install undetected-chromedriver`

In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import time, random

# --- Credentials ---
# The Google account used for the Uber login is read from the environment so
# that no secret is stored in the notebook (or in anything exported from it).
google_email = os.environ.get("UBER_GOOGLE_EMAIL")
google_password = os.environ.get("UBER_GOOGLE_PASSWORD")
if not google_email or not google_password:
    raise RuntimeError(
        "Set the UBER_GOOGLE_EMAIL and UBER_GOOGLE_PASSWORD environment "
        "variables before running this cell."
    )

# XPath matching the text node of the ride options this notebook tracks.
RIDE_LABEL_XPATH = "//*[contains(text(), 'Uber Go') or contains(text(), 'Go Non AC')]"
# Indian rupee sign; written as an escape so the literal cannot be damaged by
# a wrong file encoding.
RUPEE_SIGN = "\u20b9"

# Launch undetected Chrome
driver = uc.Chrome()

time.sleep(3)
# Navigate to the Uber price-estimate page
driver.get('https://www.uber.com/global/en/price-estimate/')
driver.refresh()
# Wait for page to load
time.sleep(5)

# Click the login button
login_button = driver.find_element(By.XPATH, "//a[contains(text(), 'Log in')]")
login_button.click()
time.sleep(3)

# Click "Continue with Google" using the provided XPath
google_button = driver.find_element(By.XPATH, "//*[@id=\"google-login-btn\"]/div/div[2]/div/div[2]/p")
google_button.click()
time.sleep(3)

print("clicked continue with google..." )
# Switch to the Google login popup (if it's a new window)
driver.switch_to.window(driver.window_handles[-1])  # Switch to the last opened window
time.sleep(3)
print("window changed...." )

# Enter email
email_field = driver.find_element(By.ID, 'identifierId')
email_field.send_keys(google_email)
time.sleep(2)

print("entered the mail id..." )

# Click next
next_button = driver.find_element(By.ID, 'identifierNext')
next_button.click()
time.sleep(3)
print("clicked next..." )

# Enter password
password_field = driver.find_element(By.NAME, 'Passwd')
password_field.send_keys(google_password)
time.sleep(2)
print("entered the password..." )

# Click next
password_next_button = driver.find_element(By.ID, 'passwordNext')
password_next_button.click()
print("clicked next..." )

# Wait for login to complete and page to load
time.sleep(10)

# Switch back to main window if needed
driver.switch_to.window(driver.window_handles[0])

print('Logged in. Page title:', driver.title)

# One explicit-wait helper for the whole run (20 s per condition).
wait = WebDriverWait(driver, 20)

# --- Your 5 locations ---
locations = [
    "kempegowda international airport bengaluru",
    "Jayanagar",
    "Whitefield",
    "Indira Nagar",
    "Hebbal"
]

# --- Generate all ordered (source, destination) pairs, in random order ---
all_pairs = [(src, dest) for src in locations for dest in locations if dest != src]
random.shuffle(all_pairs)
print(f"Total pairs to scrape: {len(all_pairs)}")

# --- Prepare empty results list ---
records = []

# The save step lives in `finally` so that rows already collected are written
# even when a later route fails (e.g. the webdriver connection times out).
try:
    for i, (source, destination) in enumerate(all_pairs, start=1):
        print(f"\n🔄 [{i}] Source: {source} → Destination: {destination}")

        # The page is not reloaded here; the form is reused after driver.back().
        time.sleep(random.uniform(5, 7))

        # --- Enter Pickup ---
        pickup_label = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//label[contains(., 'Pickup location')]")
        ))
        pickup_input = pickup_label.find_element(By.XPATH, ".//following::input[1]")
        pickup_input.send_keys(source)
        time.sleep(2)  # give the autocomplete suggestions time to appear
        pickup_input.send_keys(Keys.RETURN)

        # --- Enter Dropoff ---
        drop_label = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//label[contains(., 'Dropoff location')]")
        ))
        drop_input = drop_label.find_element(By.XPATH, ".//following::input[1]")
        drop_input.send_keys(destination)
        time.sleep(3)
        drop_input.send_keys(Keys.RETURN)
        time.sleep(2)

        # --- Click 'See prices' ---
        see_prices_btn = wait.until(
            EC.element_to_be_clickable((By.LINK_TEXT, "See prices"))
        )
        see_prices_btn.click()
        print("Clicked 'See prices'. Waiting for results...")

        # --- Wait for new page load ---
        time.sleep(8)

        # --- Wait for the ride options to load ---
        wait.until(EC.presence_of_element_located((By.XPATH, RIDE_LABEL_XPATH)))

        # Each ride card is taken as the third <div> ancestor of the label.
        ride_elements = driver.find_elements(By.XPATH, RIDE_LABEL_XPATH + "/ancestor::div[3]")

        for ride in ride_elements:
            full_text = ride.text.split('\n')

            # For every field keep the first matching line, or 'N/A'.
            car_name = next((x for x in full_text if 'Uber Go' in x or 'Go Non AC' in x), 'N/A')
            price = next((x for x in full_text if RUPEE_SIGN in x), 'N/A')
            eta = next((x for x in full_text if 'away' in x), 'N/A')
            duration = next((x for x in full_text if re.match(r'\d{2}:\d{2}', x)), 'N/A')

            records.append({
                "Source": source,
                "Destination": destination,
                "Car_Type": car_name,
                "ETA": eta,
                "Trip_Duration": duration,
                "Price": price,
                "Scraped_At": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print(f"✅ Found {len(records)} rides so far.")

        # Go back one step in browser history (presumably to the estimate
        # form) so the next route can be entered.
        driver.back()
        time.sleep(5)
finally:
    # --- Convert to DataFrame ---
    df = pd.DataFrame(records)

    # --- Create a date-based filename ---
    today = datetime.now().strftime("%Y-%m-%d")
    csv_file = f"uber_price_data_{today}.csv"

    if df.empty:
        # Writing an empty frame would leave a header-less file that
        # pd.read_csv cannot parse on the next run.
        print("No records collected; nothing written.")
    else:
        # --- Append if file exists, else create new ---
        if os.path.exists(csv_file):
            df_existing = pd.read_csv(csv_file)
            df_combined = pd.concat([df_existing, df], ignore_index=True)
            df_combined.to_csv(csv_file, index=False)
            print(f"✅ Appended {len(df)} new records to existing {csv_file}")
        else:
            df.to_csv(csv_file, index=False)
            print(f"🆕 Created new file {csv_file} with {len(df)} records")

        print("💾 Data saved successfully.")


clicked continue with google...
window changed....
entered the mail id...
clicked next...
entered the password...
clicked next...
Logged in. Page title: Explore the Uber Platform | Earn & Ride | Uber
Total pairs to scrape: 20

üîÑ [1] Source: Whitefield ‚Üí Destination: Hebbal
Clicked 'See prices'. Waiting for results...
‚úÖ Found 1 rides so far.

üîÑ [2] Source: Jayanagar ‚Üí Destination: kempegowda international airport bengaluru
Clicked 'See prices'. Waiting for results...
‚úÖ Found 2 rides so far.

üîÑ [3] Source: kempegowda international airport bengaluru ‚Üí Destination: Whitefield
Clicked 'See prices'. Waiting for results...
‚úÖ Found 3 rides so far.

üîÑ [4] Source: Hebbal ‚Üí Destination: Indira Nagar
Clicked 'See prices'. Waiting for results...
‚úÖ Found 4 rides so far.

üîÑ [5] Source: Indira Nagar ‚Üí Destination: Jayanagar
Clicked 'See prices'. Waiting for results...
‚úÖ Found 5 rides so far.

üîÑ [6] Source: Indira Nagar ‚Üí Destination: kempegowda international air

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=52445): Read timed out. (read timeout=120)

## Function-based method

In [37]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import time, random

def _extract_ride_fields(card_text):
    """Return the car type, ETA, duration and price found in one ride card.

    ``card_text`` is the visible text of a ride-option element. Each field is
    the first line of that text matching the field's marker, or ``'N/A'``
    when no line matches.
    """
    lines = card_text.split('\n')

    def first_line(matches):
        return next((line for line in lines if matches(line)), 'N/A')

    return {
        "Car_Type": first_line(lambda s: 'Uber Go' in s or 'Go Non AC' in s),
        "ETA": first_line(lambda s: 'away' in s),
        "Trip_Duration": first_line(lambda s: re.match(r'\d{2}:\d{2}', s)),
        # "\u20b9" is the Indian rupee sign; the escape keeps the literal
        # independent of the file's encoding.
        "Price": first_line(lambda s: '\u20b9' in s),
    }


def _append_records_to_csv(records, csv_file):
    """Append ``records`` to ``csv_file``, creating the file when it is missing."""
    df = pd.DataFrame(records)
    if os.path.exists(csv_file):
        df_existing = pd.read_csv(csv_file)
        df_combined = pd.concat([df_existing, df], ignore_index=True)
        df_combined.to_csv(csv_file, index=False)
        print(f"✅ Appended {len(df)} new records to master file: {csv_file}")
    else:
        df.to_csv(csv_file, index=False)
        print(f"🆕 Created new master file: {csv_file} with {len(df)} records")

    print("💾 All data saved to single master file successfully.")


def scrape_uber_data(email=None, password=None):
    """Scrape Uber price estimates for every ordered pair of five locations.

    Logs in to Uber through "Continue with Google", enters each
    (source, destination) pair on the price-estimate form in random order,
    reads the 'Uber Go' / 'Go Non AC' ride cards and appends the rows to
    ``uber_price_data_master.csv``.

    Args:
        email: Google account e-mail. Defaults to the ``UBER_GOOGLE_EMAIL``
            environment variable.
        password: Google account password. Defaults to the
            ``UBER_GOOGLE_PASSWORD`` environment variable.

    Returns:
        A DataFrame with the rows collected in this run (empty when nothing
        was scraped), or ``None`` when credentials are missing or the login
        fails.
    """
    # Imported here so the function does not depend on an import cell that
    # only brings in the selenium names used by the original code.
    from selenium.common.exceptions import TimeoutException

    # Credentials are never stored in the notebook.
    email = email or os.environ.get("UBER_GOOGLE_EMAIL")
    password = password or os.environ.get("UBER_GOOGLE_PASSWORD")
    if not email or not password:
        print("⚠️ Login failed: set UBER_GOOGLE_EMAIL and UBER_GOOGLE_PASSWORD "
              "or pass email= and password=.")
        return None

    print("🚀 Starting Uber scraper...")

    csv_file = "uber_price_data_master.csv"
    ride_label_xpath = "//*[contains(text(), 'Uber Go') or contains(text(), 'Go Non AC')]"
    records = []

    # --- Step 1: Launch undetected Chrome ---
    driver = uc.Chrome()
    try:
        driver.get('https://www.uber.com/global/en/price-estimate/')
        driver.refresh()
        time.sleep(5)

        # --- Step 2: Log in to Uber using Google ---
        try:
            login_button = driver.find_element(By.XPATH, "//a[contains(text(), 'Log in')]")
            login_button.click()
            time.sleep(3)

            google_button = driver.find_element(By.XPATH, "//*[@id='google-login-btn']/div/div[2]/div/div[2]/p")
            google_button.click()
            time.sleep(3)

            print("Clicked 'Continue with Google'...")
            # The Google sign-in opens as the most recently created window.
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(3)

            email_field = driver.find_element(By.ID, 'identifierId')
            email_field.send_keys(email)
            time.sleep(2)
            driver.find_element(By.ID, 'identifierNext').click()
            time.sleep(3)

            password_field = driver.find_element(By.NAME, 'Passwd')
            password_field.send_keys(password)
            time.sleep(2)
            driver.find_element(By.ID, 'passwordNext').click()
            time.sleep(10)

            driver.switch_to.window(driver.window_handles[0])
            print('✅ Logged in successfully! Page title:', driver.title)
        except Exception as e:
            # Best-effort boundary: report and give up; `finally` closes Chrome.
            print("⚠️ Login failed:", e)
            return None

        wait = WebDriverWait(driver, 25)  # increased to handle delays

        # --- Step 3: Define locations ---
        locations = [
            "kempegowda international airport bengaluru",
            "Jayanagar",
            "Whitefield",
            "Indira Nagar",
            "Hebbal"
        ]

        # --- Step 4: Generate all source-destination pairs ---
        all_pairs = [(src, dest) for src in locations for dest in locations if src != dest]
        random.shuffle(all_pairs)
        print(f"🔁 Total routes to scrape: {len(all_pairs)}")

        # --- Step 5: Scrape each pair ---
        for i, (source, destination) in enumerate(all_pairs, start=1):
            print(f"\n🛣️ Route {i}: {source} → {destination}")

            try:
                # Enter Pickup
                pickup_label = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//label[contains(., 'Pickup location')]")
                ))
                pickup_input = pickup_label.find_element(By.XPATH, ".//following::input[1]")
                pickup_input.send_keys(source)
                time.sleep(2)
                pickup_input.send_keys(Keys.RETURN)

                # Enter Dropoff
                drop_label = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//label[contains(., 'Dropoff location')]")
                ))
                drop_input = drop_label.find_element(By.XPATH, ".//following::input[1]")
                drop_input.send_keys(destination)
                time.sleep(3)
                drop_input.send_keys(Keys.RETURN)
                time.sleep(2)

                # Click 'See prices'
                see_prices_btn = wait.until(
                    EC.element_to_be_clickable((By.LINK_TEXT, "See prices"))
                )
                see_prices_btn.click()
                print("Clicked 'See prices'... waiting for results.")

                # --- Smart wait for rides ---
                try:
                    wait.until(EC.presence_of_element_located((By.XPATH, ride_label_xpath)))
                except TimeoutException:
                    # Only a timeout means "no rides shown"; anything else is
                    # reported by the outer handler.
                    print(f"⚠️ Rides not loaded for {source} → {destination}. Skipping...")
                    driver.back()
                    continue

                time.sleep(4)  # let rides stabilize

                # --- Extract rides info ---
                # Each ride card is taken as the third <div> ancestor of the label.
                ride_elements = driver.find_elements(
                    By.XPATH, ride_label_xpath + "/ancestor::div[3]"
                )

                for ride in ride_elements:
                    records.append({
                        "Source": source,
                        "Destination": destination,
                        **_extract_ride_fields(ride.text),
                        "Scraped_At": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    })

                print(f"✅ Scraped {len(ride_elements)} rides for this route.")
                driver.back()
                time.sleep(5)

            except Exception as e:
                # One failing route must not abort the whole run.
                print(f"❌ Error on {source} → {destination}: {e}")
                continue
    finally:
        # --- Step 6: Save data, then close the browser ---
        # Runs on normal completion, on login failure and on interruption, so
        # collected rows are kept and Chrome is never left running.
        try:
            if records:
                _append_records_to_csv(records, csv_file)
        finally:
            driver.quit()

    if not records:
        print("⚠️ No data collected this run.")
    print("✅ Browser closed. Scraping completed.")
    return pd.DataFrame(records)










In [38]:
# Run the scraper; the scraped rows are written to
# uber_price_data_master.csv by the function itself.
Data = scrape_uber_data()


üöÄ Starting Uber scraper...
Clicked 'Continue with Google'...
‚úÖ Logged in successfully! Page title: Explore the Uber Platform | Earn & Ride | Uber
üîÅ Total routes to scrape: 20

üõ£Ô∏è Route 1: Whitefield ‚Üí Jayanagar
Clicked 'See prices'... waiting for results.
‚úÖ Scraped 1 rides for this route.

üõ£Ô∏è Route 2: Indira Nagar ‚Üí Hebbal
Clicked 'See prices'... waiting for results.
‚úÖ Scraped 1 rides for this route.

üõ£Ô∏è Route 3: kempegowda international airport bengaluru ‚Üí Hebbal
Clicked 'See prices'... waiting for results.
‚úÖ Scraped 2 rides for this route.

üõ£Ô∏è Route 4: Whitefield ‚Üí kempegowda international airport bengaluru
Clicked 'See prices'... waiting for results.
‚úÖ Scraped 2 rides for this route.

üõ£Ô∏è Route 5: Whitefield ‚Üí Indira Nagar
Clicked 'See prices'... waiting for results.
‚úÖ Scraped 1 rides for this route.

üõ£Ô∏è Route 6: Indira Nagar ‚Üí kempegowda international airport bengaluru
Clicked 'See prices'... waiting for results.
‚úÖ Scrape

In [32]:
# Close the Chrome session held in the notebook-level `driver` variable.
driver.quit()

## Data cleaning 

In [None]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("uber_price_data_master.csv")

# --- 1. Remove exact duplicate rows (missing values are filled in step 4) ---
df = df.drop_duplicates()
# Drop the 'Trip_Duration' column
df = df.drop('Trip_Duration', axis=1)


# --- 2. Clean 'Price' column ---
# Keep only digits and the decimal point, which removes the currency symbol
# and thousands separators however the symbol is encoded in the CSV. Text that
# leaves nothing numeric (e.g. the "nan" produced by astype(str)) becomes NaN.
df["Price"] = pd.to_numeric(
    df["Price"].astype(str).str.replace(r"[^0-9.]", "", regex=True),
    errors="coerce",
).astype(float)


# --- 3. Car type column cleaning ---
# Clean 'Car_Type' column
df['Car_Type'] = df['Car_Type'].astype(str).str.strip()

# Map the label variations seen in scraped text to the two standard values
df['Car_Type'] = df['Car_Type'].replace({
    'Uber Go4': 'Uber Go',
    'Uber Go 4': 'Uber Go',
    'Go Non AC4': 'Go Non AC',
    'Request Uber Go': 'Uber Go',
    'Request Go Non AC': 'Go Non AC',
})

print("✅ 'Car_Type' column cleaned successfully!")
print(df['Car_Type'].value_counts())


# 4. cleaning the missing values based upon time.
df["Scraped_At"] = pd.to_datetime(df["Scraped_At"], errors="coerce")

# Extract date and time for grouping
df['Date'] = df['Scraped_At'].dt.date
df['Time'] = df['Scraped_At'].dt.time

# Function to fill missing values based on next day same time, or mode for same source/destination/car_type
def fill_missing_with_next_day_or_mode(df, column):
    """Fill the missing entries of ``column`` in place and return ``df``.

    For each row whose ``column`` value is NaN, only rows with the same
    'Source', 'Destination' and 'Car_Type' and a non-missing ``column`` are
    considered:

    1. If one of them has a 'Date' exactly one day later and an identical
       'Time', the first such row's value is copied.
    2. Otherwise the first entry of ``Series.mode()`` over those rows is used.
    3. If no such row exists, the entry is left missing.

    Rows are handled in row order and written back immediately, so a value
    filled for an earlier row can influence the mode used for a later one.
    """
    for label in df.index[df[column].isna()]:
        record = df.loc[label]

        # Rows describing the same route and car type that carry a value.
        comparable = (
            df[column].notna()
            & (df['Car_Type'] == record['Car_Type'])
            & (df['Destination'] == record['Destination'])
            & (df['Source'] == record['Source'])
        )

        following_day = record['Date'] + pd.Timedelta(days=1)
        same_slot = (df['Date'] == following_day) & (df['Time'] == record['Time'])

        candidates = df.loc[comparable & same_slot, column]
        if candidates.empty:
            # No observation one day later at the same time: fall back to
            # the most frequent value for this route and car type.
            candidates = df.loc[comparable, column].mode()
            if candidates.empty:
                continue

        df.at[label, column] = candidates.iloc[0]

    return df

# Apply to 'ETA' and 'Price' columns
df = fill_missing_with_next_day_or_mode(df, 'ETA')
df = fill_missing_with_next_day_or_mode(df, 'Price')

# Drop the helper columns that were only needed for the fill step
df.drop(['Date', 'Time'], axis=1, inplace=True)

print("✅ Missing values in 'ETA' and 'Price' filled successfully!")
print(df[['ETA', 'Price']].isna().sum())

# --- 5. Extract ETA (minutes) and arrival time ---
# Work on a string view so the .str accessors also cope with missing values.
eta_text = df["ETA"].astype(str)

# Arrival clock time: the first H:MM / HH:MM token in the ETA text. Matching
# the time itself avoids depending on how the separator character is encoded.
# NOTE: the column name is misspelled, but the feature-engineering cell reads
# it under this exact name, so it is kept.
df['Destiantion_reach_time'] = eta_text.str.extract(r"(\d{1,2}:\d{2})")[0]

# Pickup ETA in minutes: the first run of digits in the ETA text.
df["ETA_min"] = eta_text.str.extract(r"(\d+)")[0].astype(float)

# --- 6. Convert 'Scraped_At' to datetime ---
df["Scraped_At"] = pd.to_datetime(df["Scraped_At"], errors="coerce")
# Keep the time-of-day part of 'Scraped_At' in its own column
df['Scraped_At_time'] = df['Scraped_At'].dt.time


print("✅ Data cleaned successfully!")
df.head()


‚úÖ 'Car_Type' column cleaned successfully!
Car_Type
Uber Go      92
Go Non AC    89
Name: count, dtype: int64
‚úÖ Missing values in 'ETA' and 'Price' filled successfully!
ETA      0
Price    0
dtype: int64
‚úÖ Data cleaned successfully!


Unnamed: 0,Source,Destination,Car_Type,ETA,Price,Scraped_At,Destiantion_reach_time,ETA_min,Scraped_At_time
0,Indira Nagar,Jayanagar,Uber Go,1 min away‚Ä¢13:33,168.64,2025-10-28 12:48:46,13:33,1.0,12:48:46
1,Jayanagar,Hebbal,Uber Go,2 mins away‚Ä¢13:54,262.5,2025-10-28 12:50:19,13:54,2.0,12:50:19
2,Indira Nagar,kempegowda international airport bengaluru,Uber Go,1 min away‚Ä¢13:55,715.34,2025-10-28 12:51:53,13:55,1.0,12:51:53
3,Hebbal,kempegowda international airport bengaluru,Uber Go,3 mins away‚Ä¢13:27,526.75,2025-10-28 12:53:27,13:27,3.0,12:53:27
4,Indira Nagar,Whitefield,Uber Go,1 min away‚Ä¢13:38,253.64,2025-10-28 12:55:00,13:38,1.0,12:55:00


In [None]:
### FEATURE ENGINEERING


# Ensure 'Scraped_At' is datetime
df["Scraped_At"] = pd.to_datetime(df["Scraped_At"], errors="coerce")

# Create full datetime for arrival time (same date as scraped, with time from ETA)
df['Arrival_Time'] = pd.to_datetime(
    df['Scraped_At'].dt.date.astype(str) + ' ' + df['Destiantion_reach_time'],
    errors="coerce",
)

# The ETA text carries only a clock time. An arrival earlier than the scrape
# moment therefore means the trip ends after midnight, so move it to the next
# day instead of producing a negative duration. Rows with NaT compare False
# and are left unchanged.
arrives_next_day = df['Arrival_Time'] < df['Scraped_At']
df.loc[arrives_next_day, 'Arrival_Time'] = (
    df.loc[arrives_next_day, 'Arrival_Time'] + pd.Timedelta(days=1)
)

# Calculate trip duration in minutes (arrival time minus scrape time)
df['Trip_Duration_time'] = (df['Arrival_Time'] - df['Scraped_At']).dt.total_seconds() / 60

# Optional: Convert to int if needed
#df['Trip_Duration_time'] = df['Trip_Duration_time'].astype(int)



## AUTOMATE FIRST SO THAT YOU GET MORE RECORDS FOR MODELING 