## Beautiful Soup Method

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Get the page
url = "https://en.wikipedia.org/wiki/List_of_companies_of_Cambodia"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Find all tables with class 'wikitable'
tables = soup.find_all("table", {"class": "wikitable"})

# Step 3: Convert each table to a DataFrame and store
all_tables = []
for i, table in enumerate(tables):
    try:
        df = pd.read_html(str(table))[0]
        df["Source Table"] = f"Table {i+1}"  # Optional: track origin
        all_tables.append(df)
    except Exception as e:
        print(f"Skipping table {i+1} due to error: {e}")

# Step 4: Combine all tables into one DataFrame
combined_df = pd.concat(all_tables, ignore_index=True)

# Step 5: Display or save
print(combined_df.head())  # Just show first 5 rows

# Optional: Save to CSV
combined_df.to_csv("all_cambodia_company_tables.csv", index=False)


                Name           Industry            Sector Headquarters  \
                Name           Industry            Sector Headquarters   
0   ABOUTAsia Travel  Consumer services  Travel & tourism    Siem Reap   
1        ACLEDA Bank         Financials             Banks   Phnom Penh   
2     Angkor Airways  Consumer services          Airlines   Phnom Penh   
3     ANZ Royal Bank         Financials             Banks   Phnom Penh   
4  Cambodia Airlines  Consumer services          Airlines   Phnom Penh   

  Founded                                         Notes             Status  \
  Founded                                         Notes Unnamed: 6_level_1   
0    2007  Destination management company, founded 2007                  P   
1    1993                      Domestic commercial bank                  P   
2    2004                         Airline, defunct 2008                  P   
3    2005         Private bank, part of ANZ (Australia)                  P   
4    1997    

  df = pd.read_html(str(table))[0]


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Getting HTML source code using requests

In [5]:
url = "https://en.wikipedia.org/wiki/List_of_companies_of_Cambodia"
request = requests.get(url)
# print(request.text)

## convert HTML source code to Beautiful Soup Object

In [6]:
soup = BeautifulSoup(markup = request.text,features =  "html.parser")
print(soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of companies of Cambodia - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-en

## compare the available methods in Requests and BS4

In [7]:
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'DEFAULT_INTERESTING_STRING_TYPES',
 'EMPTY_ELEMENT_EVENT',
 'END_ELEMENT_EVENT',
 'ROOT_TAG_NAME',
 'START_ELEMENT_EVENT',
 'STRING_ELEMENT_EVENT',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_clone',
 '_decode_markup',
 '_event_stream',
 '_feed',
 '_find_all',
 '_find_one',
 '_format_tag',
 '_indent_string',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_marku

In [8]:
# tables = soup.find_all(name='table') #find all tables
tables = soup.find(name='table', attrs={"class": "wikitable"})
len(tables)

4

In [9]:
rows = tables.find_all(name='tr') #find all rows in the table
len(rows[2:])

36

In [10]:
row1 = rows[2]
print(row1)

<tr style="background:#ffffff;">
<td><a href="/wiki/ABOUTAsia_Travel" title="ABOUTAsia Travel">ABOUTAsia Travel</a>
</td>
<td>Consumer services
</td>
<td>Travel &amp; tourism
</td>
<td><a href="/wiki/Siem_Reap" title="Siem Reap">Siem Reap</a>
</td>
<td>2007
</td>
<td>Destination management company, founded 2007
</td>
<td style="text-align:center;">P
</td>
<td style="text-align:center;">A
</td></tr>


In [11]:
cols = row1.find_all(name='td') #find all columns in the row
len(cols)

8

In [12]:
col1 = cols[0]
print(col1)

<td><a href="/wiki/ABOUTAsia_Travel" title="ABOUTAsia Travel">ABOUTAsia Travel</a>
</td>


In [13]:
col1.text.strip() #get the text in the column and remove any leading or trailing spaces using strip()

'ABOUTAsia Travel'

## All-in-one Loop to Extract table

In [14]:
data = [] #create an empty list to store the data
for tr in table.find_all("tr")[2:]:
    # tds = tr.find_all(name = "td")
    row = [td.text.strip() for td in tr.find_all(name = "td")]
    data.append(row)
print(len(data))
print(data) 

36
[['ABOUTAsia Travel', 'Consumer services', 'Travel & tourism', 'Siem Reap', '2007', 'Destination management company, founded 2007', 'P', 'A'], ['ACLEDA Bank', 'Financials', 'Banks', 'Phnom Penh', '1993', 'Domestic commercial bank', 'P', 'A'], ['Angkor Airways', 'Consumer services', 'Airlines', 'Phnom Penh', '2004', 'Airline, defunct 2008', 'P', 'D'], ['ANZ Royal Bank', 'Financials', 'Banks', 'Phnom Penh', '2005', 'Private bank, part of ANZ (Australia)', 'P', 'A'], ['Cambodia Airlines', 'Consumer services', 'Airlines', 'Phnom Penh', '1997', 'Airline, defunct 2014', 'P', 'D'], ['Cambodia Angkor Air', 'Consumer services', 'Airlines', 'Phnom Penh', '2009', 'Flag carrier', 'S', 'A'], ['Cambodia Asia Bank', 'Financials', 'Banks', 'Phnom Penh', '1993', 'Bank', 'P', 'A'], ['Cambodia Commercial Bank', 'Financials', 'Banks', 'Phnom Penh', '1991', 'Commercial bank, part of Siam Commercial Bank (Thailand)', 'P', 'A'], ['Cambodian National Insurance Company', 'Financials', 'Life insurance', 'Phn

In [15]:
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,ABOUTAsia Travel,Consumer services,Travel & tourism,Siem Reap,2007,"Destination management company, founded 2007",P,A
1,ACLEDA Bank,Financials,Banks,Phnom Penh,1993,Domestic commercial bank,P,A
2,Angkor Airways,Consumer services,Airlines,Phnom Penh,2004,"Airline, defunct 2008",P,D
3,ANZ Royal Bank,Financials,Banks,Phnom Penh,2005,"Private bank, part of ANZ (Australia)",P,A
4,Cambodia Airlines,Consumer services,Airlines,Phnom Penh,1997,"Airline, defunct 2014",P,D
5,Cambodia Angkor Air,Consumer services,Airlines,Phnom Penh,2009,Flag carrier,S,A
6,Cambodia Asia Bank,Financials,Banks,Phnom Penh,1993,Bank,P,A
7,Cambodia Commercial Bank,Financials,Banks,Phnom Penh,1991,"Commercial bank, part of Siam Commercial Bank ...",P,A
8,Cambodian National Insurance Company,Financials,Life insurance,Phnom Penh,1990,Private insurance,P,A
9,Cambrew Brewery,Consumer goods,Brewers,Sihanoukville,1965,Cambodia's largest beer manufacturer,P,A


In [16]:
url = "https://books.toscrape.com/index.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [19]:
lis = soup.find_all(name="li", attrs={"class": "col-xs-6"})
print(lis)

[<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>
</li>, <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalog

In [18]:
headers = [th.text.stip() for th in table.find_all('th')]

AttributeError: 'str' object has no attribute 'stip'

In [3]:
! pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cryptography>=37.0.0 (from scrapy)
  Downloading cryptography-44.0.3-cp39-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting pyOpenSSL>=22.0.0 (from scrapy)
  Downloading pyOpenSSL-25.0.0-py3-none-any.whl.metadata (16 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (fro


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
