In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import List, Dict

# Send a desktop-Chrome User-Agent with every request so the scraper is less
# likely to be flagged as a bot.
_CHROME_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/120.0.0.0 Safari/537.36'
)
headers = {'User-Agent': _CHROME_USER_AGENT}

# Accumulator for scraped company records; starts out empty.
all_companies_data = list()

print("✓ Headers configured and data storage ready!")


✓ Headers configured and data storage ready!


In [23]:
# Fetch the first page of the company listing and keep its raw HTML in
# `html_content` for the parsing cell that follows.
page_number = 1
url = f'https://www.ambitionbox.com/list-of-companies?page={page_number}'

print(f"Fetching page {page_number}...")
# Without a timeout, requests waits indefinitely if the server never answers;
# 30 seconds is a generous upper bound for a single page.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code != 200:
    print(f"✗ Failed to fetch page. Status: {response.status_code}")
    # Stop here: the next cell parses `html_content`, so carrying on would
    # either raise a NameError or silently reuse HTML left over from an
    # earlier run of this cell.
    raise RuntimeError(f"Could not fetch {url} (HTTP {response.status_code})")

html_content = response.text
print(f"✓ Successfully fetched page {page_number}")
print(f"HTML content length: {len(html_content)} characters")


Fetching page 1...
✓ Successfully fetched page 1
HTML content length: 362135 characters


In [24]:
# Build a parse tree for the listing page fetched above.
soup = BeautifulSoup(html_content, 'lxml')

# Company names are looked up as <h2> headings, ratings as <div> elements,
# each identified by its CSS class.
company_cards = soup.find_all('h2', class_='companyCardWrapper__companyName')
print(f"Found {len(company_cards)} company names")

ratings = soup.find_all('div', class_='rating_text')
print(f"Found {len(ratings)} ratings")

# Pair the n-th name with the n-th rating; names without a rating at the same
# position fall back to 'N/A'.
# NOTE(review): pairing is purely positional, so it assumes every card exposes
# a rating in the same order as the names — confirm against the page markup.
companies_page1 = [
    {
        'company_name': name_tag.get_text(strip=True),
        'rating': ratings[position].text.strip() if position < len(ratings) else 'N/A',
    }
    for position, name_tag in enumerate(company_cards)
]

print(f"\n✓ Extracted {len(companies_page1)} companies from page 1")
print("\nFirst 5 companies:")
preview = companies_page1[:5]
for i, comp in enumerate(preview, start=1):
    print(f"  {i}. {comp['company_name']} - Rating: {comp['rating']}")


Found 20 company names
Found 20 ratings

✓ Extracted 20 companies from page 1

First 5 companies:
  1. TCS - Rating: 3.4
  2. Accenture - Rating: 3.7
  3. Wipro - Rating: 3.6
  4. Cognizant - Rating: 3.7
  5. Capgemini - Rating: 3.7


In [25]:
# Smoke-test the detail-page fetch using the first company from page 1.
test_company = companies_page1[0]['company_name']
print(f"Testing with: {test_company}")

# Build the overview URL from the company name, e.g. "TCS" -> "tcs-overview".
# NOTE(review): only spaces are normalised here; names containing other
# punctuation may not map to a valid slug — confirm against the site.
company_slug = test_company.lower().replace(" ", "-") + "-overview"
detail_url = f"https://www.ambitionbox.com/overview/{company_slug}"

print(f"Fetching: {detail_url}")

time.sleep(2)  # Wait 2 seconds to be polite
# Without a timeout, requests waits indefinitely if the server never answers.
detail_response = requests.get(detail_url, headers=headers, timeout=30)

if detail_response.status_code != 200:
    print(f"✗ Failed. Status: {detail_response.status_code}")
    # Stop here: the next cell parses `detail_html`, which would otherwise be
    # undefined or left over from an earlier run.
    raise RuntimeError(
        f"Could not fetch {detail_url} (HTTP {detail_response.status_code})"
    )

print("✓ Successfully fetched company details")
detail_html = detail_response.text


Testing with: TCS
Fetching: https://www.ambitionbox.com/overview/tcs-overview
✓ Successfully fetched company details


In [26]:
# Turn the detail page into a parse tree and pull out the label/value spans.
detail_soup = BeautifulSoup(detail_html, 'lxml')

# Value spans and label spans are located by their full class strings.
# NOTE(review): these long utility-class strings look auto-generated and may
# change when the site is redeployed — confirm before relying on them.
data_spans = detail_soup.find_all('span', class_='css-1jxf684 text-primary-text text-sm font-pn-600 flex-[6] md:flex-[auto]')
data_labels = detail_soup.find_all('span', class_='css-1jxf684 text-neutral-300 font-pn-600 text-sm tracking-[0.25px] min-w-[40%] flex-[5]')

print(f"Found {len(data_labels)} labels and {len(data_spans)} values")

# Start from the listing-page data and default every detail field to 'N/A'.
company_details = {
    'company_name': test_company,
    'rating': companies_page1[0]['rating'],
}
company_details.update(dict.fromkeys(
    (
        'founded_year',
        'india_employee_count',
        'global_employee_count',
        'india_headquarters',
        'global_headquarters',
        'website',
        'primary_industry',
    ),
    'N/A',
))

# Ordered (label substring, target field) pairs; the first substring found in
# a label decides which field receives the value.
label_to_field = (
    ('Founded', 'founded_year'),
    ('India Employee Count', 'india_employee_count'),
    ('Global Employee Count', 'global_employee_count'),
    ('India Headquarters', 'india_headquarters'),
    ('Website', 'website'),
    ('Primary Industry', 'primary_industry'),
)

# The n-th label is paired with the n-th value span; labels with no value span
# at the same position get 'N/A'.
for position, label_tag in enumerate(data_labels):
    label = label_tag.get_text(strip=True)
    if position < len(data_spans):
        value = data_spans[position].get_text(strip=True)
    else:
        value = 'N/A'

    print(f"{label}: {value}")

    for marker, field_name in label_to_field:
        if marker in label:
            company_details[field_name] = value
            break

print("\n✓ Extracted company details successfully")


Found 3 labels and 3 values
Founded in: 1968 (57 yrs old)
India Employee Count: 1 Lakh+
Global Employee Count: 1 Lakh+

✓ Extracted company details successfully


In [27]:
# Scan every anchor that has an href for links pointing at office pages.
hq_links = detail_soup.find_all('a', href=True)

# Link texts of anchors whose href mentions 'offices', in document order.
office_texts = (
    anchor.get_text(strip=True)
    for anchor in hq_links
    if 'offices' in anchor.get('href', '')
)

# Take the first office text that contains a comma and is not identical to
# the recorded India headquarters.
# NOTE(review): when india_headquarters is still 'N/A' this simply picks the
# first comma-containing office link — confirm that is really the global HQ.
link_text = next(
    (
        text
        for text in office_texts
        if ',' in text and text != company_details['india_headquarters']
    ),
    None,
)
if link_text is not None:
    company_details['global_headquarters'] = link_text
    print(f"Found Global HQ: {link_text}")

print("\n--- Final Company Details ---")
for key, value in company_details.items():
    print(f"{key}: {value}")


Found Global HQ: Mumbai, Maharashtra, India

--- Final Company Details ---
company_name: TCS
rating: 3.4
founded_year: 1968 (57 yrs old)
india_employee_count: 1 Lakh+
global_employee_count: 1 Lakh+
india_headquarters: N/A
global_headquarters: Mumbai, Maharashtra, India
website: N/A
primary_industry: N/A


In [28]:
# Scrape the detail page of each of the first few companies found on page 1
# and collect one record per company in `detailed_companies`.
SAMPLE_SIZE = 5
sample_companies = companies_page1[:SAMPLE_SIZE]
# Derive the progress total from the actual slice so the counter stays correct
# even when page 1 yielded fewer companies than SAMPLE_SIZE.
sample_total = len(sample_companies)

print(f"Getting details for first {SAMPLE_SIZE} companies...")
print("="*60)

detailed_companies = []

# Ordered (label substring, target field) pairs; the first substring found in
# a label decides which field receives the value.
label_fields = (
    ('Founded', 'founded_year'),
    ('India Employee Count', 'india_employee_count'),
    ('Global Employee Count', 'global_employee_count'),
    ('India Headquarters', 'india_headquarters'),
    ('Website', 'website'),
    ('Primary Industry', 'primary_industry'),
)

for idx, company in enumerate(sample_companies, 1):
    print(f"\n[{idx}/{sample_total}] Processing: {company['company_name']}")

    # Create URL: lower-case the name, replace spaces with hyphens.
    company_slug = company['company_name'].lower().replace(" ", "-") + "-overview"
    detail_url = f"https://www.ambitionbox.com/overview/{company_slug}"

    try:
        # Fetch page
        time.sleep(3)  # Wait 3 seconds between requests
        # The timeout stops one unresponsive page from stalling the whole
        # loop; a timeout error is reported by the except branch below.
        detail_response = requests.get(detail_url, headers=headers, timeout=30)

        if detail_response.status_code != 200:
            print(f"  ✗ Failed to fetch. Status: {detail_response.status_code}")
            continue

        # Parse HTML
        detail_soup = BeautifulSoup(detail_response.text, 'lxml')

        # Value spans and label spans are located by their full class strings.
        data_spans = detail_soup.find_all('span', class_='css-1jxf684 text-primary-text text-sm font-pn-600 flex-[6] md:flex-[auto]')
        data_labels = detail_soup.find_all('span', class_='css-1jxf684 text-neutral-300 font-pn-600 text-sm tracking-[0.25px] min-w-[40%] flex-[5]')

        # Start from the listing data; every detail field defaults to 'N/A'.
        details = {
            'company_name': company['company_name'],
            'rating': company['rating'],
            'founded_year': 'N/A',
            'india_employee_count': 'N/A',
            'global_employee_count': 'N/A',
            'india_headquarters': 'N/A',
            'global_headquarters': 'N/A',
            'website': 'N/A',
            'primary_industry': 'N/A'
        }

        # The n-th label is paired with the n-th value span.
        for i, label_elem in enumerate(data_labels):
            label = label_elem.get_text(strip=True)
            value = data_spans[i].get_text(strip=True) if i < len(data_spans) else 'N/A'

            for marker, field_name in label_fields:
                if marker in label:
                    details[field_name] = value
                    break

        # Global headquarters: first 'offices' link whose text contains a
        # comma and differs from the recorded India headquarters.
        hq_links = detail_soup.find_all('a', href=True)
        for link in hq_links:
            if 'offices' in link.get('href', ''):
                link_text = link.get_text(strip=True)
                if link_text != details['india_headquarters'] and ',' in link_text:
                    details['global_headquarters'] = link_text
                    break

        detailed_companies.append(details)
        print("  ✓ Successfully extracted details")

    except Exception as e:
        # Best effort: report the problem and move on to the next company.
        print(f"  ✗ Error: {e}")

print(f"\n{'='*60}")
print(f"✓ Collected details for {len(detailed_companies)} companies")


Getting details for first 5 companies...

[1/5] Processing: TCS
  ✓ Successfully extracted details

[2/5] Processing: Accenture
  ✓ Successfully extracted details

[3/5] Processing: Wipro
  ✓ Successfully extracted details

[4/5] Processing: Cognizant
  ✓ Successfully extracted details

[5/5] Processing: Capgemini
  ✓ Successfully extracted details

✓ Collected details for 5 companies


In [29]:
# Collect name/rating pairs from listing pages 1 to 3 into `all_companies`.
print("Scraping pages 1 to 3...")
print("="*60)

all_companies = []

PAGE_DELAY_SECONDS = 2  # pause between consecutive listing-page requests

for page_num in range(1, 4):  # Pages 1, 2, 3
    # Sleep before every request except the first, so the delay is honoured
    # even when the previous page failed and its iteration ended early.
    if page_num > 1:
        time.sleep(PAGE_DELAY_SECONDS)

    print(f"\nFetching page {page_num}...")

    url = f'https://www.ambitionbox.com/list-of-companies?page={page_num}'
    try:
        # Without a timeout, one unresponsive page would hang the whole loop.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Keep whatever was collected so far and try the next page.
        print(f"  ✗ Failed to fetch page {page_num}: {exc}")
        continue

    if response.status_code != 200:
        print(f"  ✗ Failed to fetch page {page_num}")
        continue

    # Parse page
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract companies: the n-th name heading is paired with the n-th rating
    # element; names without a rating at that position get 'N/A'.
    company_cards = soup.find_all('h2', class_='companyCardWrapper__companyName')
    ratings = soup.find_all('div', class_='rating_text')

    for idx, card in enumerate(company_cards):
        company_name = card.get_text(strip=True)
        company_rating = ratings[idx].text.strip() if idx < len(ratings) else 'N/A'

        all_companies.append({
            'company_name': company_name,
            'rating': company_rating
        })

    print(f"  ✓ Found {len(company_cards)} companies on page {page_num}")

print(f"\n{'='*60}")
print(f"✓ Total companies collected: {len(all_companies)}")



Scraping pages 1 to 3...

Fetching page 1...
  ✓ Found 20 companies on page 1

Fetching page 2...
  ✓ Found 20 companies on page 2

Fetching page 3...
  ✓ Found 20 companies on page 3

✓ Total companies collected: 60


In [32]:
# Fetch the detail page of each collected company and build one record per
# company in `final_detailed_data`. Every request is preceded by a 3-second
# pause, so the run time grows with DETAIL_LIMIT.
#
# DETAIL_LIMIT is the single knob for how many companies are processed: lower
# it (e.g. 10) for a quick trial run, or set it to None to process everything
# in `all_companies`. The slice, the banner and the progress counter are all
# derived from it so they cannot drift apart.
DETAIL_LIMIT = 60
companies_to_process = all_companies[:DETAIL_LIMIT]
total_to_process = len(companies_to_process)

print("Getting detailed information...")
print(f"Processing first {total_to_process} companies (change DETAIL_LIMIT as needed)")
print("="*60)

final_detailed_data = []

# Ordered (label substring, target field) pairs; the first substring found in
# a label decides which field receives the value.
label_fields = (
    ('Founded', 'founded_year'),
    ('India Employee Count', 'india_employee_count'),
    ('Global Employee Count', 'global_employee_count'),
    ('India Headquarters', 'india_headquarters'),
    ('Website', 'website'),
    ('Primary Industry', 'primary_industry'),
)

for idx, company in enumerate(companies_to_process, 1):
    print(f"\n[{idx}/{total_to_process}] {company['company_name']}")

    # Create URL slug: lower-case the name, replace spaces with hyphens.
    company_slug = company['company_name'].lower().replace(" ", "-") + "-overview"
    detail_url = f"https://www.ambitionbox.com/overview/{company_slug}"

    try:
        # Fetch
        time.sleep(3)
        # The timeout stops one unresponsive page from stalling the whole
        # run; a timeout error is reported by the except branch below.
        detail_response = requests.get(detail_url, headers=headers, timeout=30)

        if detail_response.status_code != 200:
            print(f"  ✗ Failed (Status: {detail_response.status_code})")
            continue

        # Parse: value spans and label spans are located by their full class
        # strings.
        detail_soup = BeautifulSoup(detail_response.text, 'lxml')
        data_spans = detail_soup.find_all('span', class_='css-1jxf684 text-primary-text text-sm font-pn-600 flex-[6] md:flex-[auto]')
        data_labels = detail_soup.find_all('span', class_='css-1jxf684 text-neutral-300 font-pn-600 text-sm tracking-[0.25px] min-w-[40%] flex-[5]')

        # Start from the listing data; every detail field defaults to 'N/A'.
        details = {
            'company_name': company['company_name'],
            'rating': company['rating'],
            'founded_year': 'N/A',
            'india_employee_count': 'N/A',
            'global_employee_count': 'N/A',
            'india_headquarters': 'N/A',
            'global_headquarters': 'N/A',
            'website': 'N/A',
            'primary_industry': 'N/A'
        }

        # The n-th label is paired with the n-th value span.
        for i, label_elem in enumerate(data_labels):
            label = label_elem.get_text(strip=True)
            value = data_spans[i].get_text(strip=True) if i < len(data_spans) else 'N/A'

            for marker, field_name in label_fields:
                if marker in label:
                    details[field_name] = value
                    break

        # Global HQ: first 'offices' link whose text contains a comma and
        # differs from the recorded India headquarters.
        hq_links = detail_soup.find_all('a', href=True)
        for link in hq_links:
            if 'offices' in link.get('href', ''):
                link_text = link.get_text(strip=True)
                if link_text != details['india_headquarters'] and ',' in link_text:
                    details['global_headquarters'] = link_text
                    break

        final_detailed_data.append(details)
        print("  ✓ Success")

    except Exception as e:
        # Best effort: report the problem and move on to the next company.
        print(f"  ✗ Error: {e}")

print(f"\n{'='*60}")
print(f"✓ Total detailed records: {len(final_detailed_data)}")


Getting detailed information...
Processing first 10 companies (change this number as needed)

[1/10] TCS
  ✓ Success

[2/10] Accenture
  ✓ Success

[3/10] Wipro


KeyboardInterrupt: 

In [31]:
# Convert the scraped records (a list of dicts) to a pandas DataFrame.
df = pd.DataFrame(final_detailed_data)

print("Data Preview:")
print("="*60)
print(df.head(10))
print("\n" + "="*60)
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")


# Save the scraped data to a CSV file in the current working directory.
filename = 'ambitionbox_companies.csv'
df.to_csv(filename, index=False, encoding='utf-8')

print(f"✓ Data saved to: {filename}")
print(f"✓ Total records saved: {len(df)}")


# Summary statistics
print("="*60)
print("DATA SUMMARY")
print("="*60)

print(f"\nTotal Companies: {len(df)}")
print(f"\nColumn Names:")
for col in df.columns:
    print(f"  - {col}")

print(f"\nMissing Values:")
# The scraper records absent fields as the string 'N/A', which isnull() does
# not treat as missing, so a plain null count is always zero. Count the
# placeholder alongside genuine nulls instead.
missing_mask = df.isna() | df.eq('N/A')
print(missing_mask.sum())

print(f"\nRating Distribution:")
# An empty scrape produces a frame with no columns; avoid a KeyError then.
if 'rating' in df.columns:
    print(df['rating'].value_counts().head())
else:
    print("  (no records scraped)")

print("\n" + "="*60)

Data Preview:
    company_name rating       founded_year india_employee_count  \
0            TCS    3.4  1968 (57 yrs old)              1 Lakh+   
1      Accenture    3.7  1989 (36 yrs old)              1 Lakh+   
2          Wipro    3.6  1945 (80 yrs old)              1 Lakh+   
3      Cognizant    3.7  1994 (31 yrs old)              1 Lakh+   
4      Capgemini    3.7  1967 (58 yrs old)              1 Lakh+   
5      HDFC Bank    3.8  1994 (31 yrs old)              1 Lakh+   
6        Infosys    3.5                N/A                  N/A   
7     ICICI Bank    4.0  1994 (31 yrs old)              1 Lakh+   
8        HCLTech    3.4                N/A                  N/A   
9  Tech Mahindra    3.4  1986 (39 yrs old)              1 Lakh+   

  global_employee_count india_headquarters  \
0               1 Lakh+                N/A   
1               1 Lakh+                N/A   
2               1 Lakh+                N/A   
3               1 Lakh+                N/A   
4               1 