## Load libraries and set directories

In [1]:
# Import libraries
import requests, os #, sys
import pandas as pd
from bs4 import BeautifulSoup
print("All packages loaded.")

All packages loaded.


In [2]:
# Set directories
path = "C:/Users/Matthias/Documents/GithubRepos/inventor_migration"
os.chdir(path)
print("Directories specified")

Directories specified


## Select largest countries

In [3]:
# Access Wikipedia:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
wiki_response = requests.get(url)
if wiki_response != []:
    print("Wikipedia page loaded")

Wikipedia page loaded


In [4]:
# find wikitables:
soup = BeautifulSoup(wiki_response.content, "html.parser")
wiki_tables = soup.find("table",{"class":"wikitable sortable"})

# extract countries
links = wiki_tables.find_all("a")
countries = []
for link in links:
    countries.append(link.get("title"))
    
# clean countries and take the 60 first countries
countries = [x for x in countries if x != None]
countries.remove("China")
countries.remove("Hong Kong")
countries = countries[1:61]
countries.sort()
print("Retrieved the", len(countries), "largest economies based on IMF 2020 estimates.")
countries

Retrieved the 60 largest economies based on IMF 2020 estimates.


['Algeria',
 'Argentina',
 'Australia',
 'Austria',
 'Bangladesh',
 'Belgium',
 'Brazil',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Czech Republic',
 'Denmark',
 'Egypt',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Hungary',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Israel',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Kuwait',
 'Malaysia',
 'Mexico',
 'Morocco',
 'Netherlands',
 'New Zealand',
 'Nigeria',
 'Norway',
 'Pakistan',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Qatar',
 'Republic of Ireland',
 'Romania',
 'Russia',
 'Saudi Arabia',
 'Singapore',
 'Slovakia',
 'South Africa',
 'South Korea',
 'Spain',
 'Sweden',
 'Switzerland',
 'Taiwan',
 'Thailand',
 'Turkey',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Vietnam']

## Get the largest companies for these countries

In [5]:
# get firms of country x from wikipedia:
def get_wiki_page(country):
    url = "https://en.wikipedia.org/wiki/List_of_companies_of_"+country
    wiki_response = requests.get(url)
    soup = BeautifulSoup(wiki_response.content, "html.parser")
    return(soup)

In [6]:
# find correct table with "notable firms"
def get_wiki_table(soup):
    captions = soup.find_all("caption")
    if len(captions) != 1:
        for title in captions:
            if title.get_text() == "Notable companies ":
                caption = title
        firm_table = caption.find_parent("table", {"class":"wikitable sortable"})
    else:
        firm_table = captions[0].find_parent("table", {"class":"wikitable sortable"})
    return(firm_table)

In [7]:
# get firms and build dataset
def get_firms(firm_table, country):
    rows = firm_table.find_all("tr")[1:] # get all rows expcept the header row
    firms = []
    for row in rows:
        firm = row.find_all("td")[0].get_text() # get firm name
        firm = firm.replace("\n", "")
        firms.append(firm)
    
    df = {"organization": firms, "country_firm": [country.replace("the", "")] * len(firms)}
    df = pd.DataFrame(df)
    return(df)

Iran does not have caption and Israel is only a list not a table. Further, the UK, Japan, South Korea, Taiwan, Thailand, Turkey and Russia are of a different structure. Do them manually afterwards.

In [8]:
# clean the country list 
countries[11] = "the Czech Republic"
countries[31] = "the Netherlands"
countries[37] = "the Philippines"
countries[41] = "the Republic of Ireland"
countries[56] = "the United Arab Emirates"

diff_countries = ["Israel", "United Kingdom", "Japan", "South Korea", "Taiwan", "Thailand", "Turkey", "Russia", "Iran"]
countries = [x for x in countries if x not in diff_countries]
countries.remove("United States")

In [9]:
# retrieve companies for all countries
df = pd.DataFrame()

for COUNTRY in countries:
    
    # access wikipedia:
    SOUP = get_wiki_page(country = COUNTRY)
    
    # get table of firms:
    FIRM_TABLE = get_wiki_table(soup = SOUP)
    
    # extract firms and build dataset
    FIRM_DF = get_firms(firm_table = FIRM_TABLE, country = COUNTRY)
    
    # add to df:
    df = pd.concat([df, FIRM_DF])

df.head

<bound method NDFrame.head of                     organization country_firm
0                    Air Algérie      Algeria
1            Air Express Algeria      Algeria
2   Algerian Petroleum Institute      Algeria
3                Algérie Ferries      Algeria
4               Antinea Airlines      Algeria
..                           ...          ...
77                        Vinare      Vietnam
78                       Vinatex      Vietnam
79                Vinavico Group      Vietnam
80               VNG Corporation      Vietnam
81                    World Auto      Vietnam

[6392 rows x 2 columns]>

In [372]:
# get firms for the remaining countries:
diff_countries

['Israel',
 'United Kingdom',
 'Japan',
 'South Korea',
 'Taiwan',
 'Thailand',
 'Turkey',
 'Russia',
 'Iran']

In [49]:
# Israel:
soup = get_wiki_page("Israel")
firm_lists = soup.find_all("ul")
companies = []
for list_element in firm_lists[1:24]:
    links = list_element.find_all("a")
    for link in links:
        firm = link.get_text()
        companies.append(firm)
companies = {"organization": companies, "country_firm": ["Israel"] * len(companies)}
companies = pd.DataFrame(companies)
df = pd.concat([df, companies])

In [68]:
# United Kingdom
soup = get_wiki_page("the United Kingdom_A-J")
firm_lists = soup.find_all("ul")
companies = []
for list_element in firm_lists[2:13]:
    links = list_element.find_all("a")
    for link in links:
        firm = link.get_text()
        companies.append(firm)
soup = get_wiki_page("the_United_Kingdom_K-Z")
firm_lists = soup.find_all("ul")
for list_element in firm_lists[2:18]:
    links = list_element.find_all("a")
    for link in links:
        firm = link.get_text()
        companies.append(firm)
companies = {"organization": companies, "country_firm": ["United Kingdom"] * len(companies)}
companies = pd.DataFrame(companies)
df = pd.concat([df, companies])

In [93]:
# Japan
soup = get_wiki_page("Japan")
firm_tables = soup.find_all("table", {"class":"wikitable"})
companies = []
for table in firm_tables[1:]:
    rows = table.find_all("tr")[1:]
    for row in rows:
        firm = row.find("td").get_text()
        firm = firm.replace("\n", "")
        companies.append(firm)
companies = {"organization": companies, "country_firm": ["Japan"] * len(companies)}
companies = pd.DataFrame(companies)
df = pd.concat([df, companies])

In [167]:
# South Korea
SOUP = get_wiki_page("South Korea")
def firm_diff_countries(soup, country):
    firm_tables = soup.find_all("table", {"class":"wikitable sortable"})[1:][0]
    rows = firm_tables.find_all("tr")[1:]
    companies = []
    for row in rows:
        firm = row.find("td").get_text()
        firm = firm. replace("\n", "")
        companies.append(firm)
    companies = {"organization": companies, "country_firm": [country] * len(companies)}
    companies = pd.DataFrame(companies)
    return(companies)
companies = firm_diff_countries(soup = SOUP, country = "South Korea")
df = pd.concat([df, companies])

Unnamed: 0,organization,country_firm
0,Air Algérie,Algeria
1,Air Express Algeria,Algeria
2,Algerian Petroleum Institute,Algeria
3,Algérie Ferries,Algeria
4,Antinea Airlines,Algeria
...,...,...
211,XGI Technology,Taiwan
212,Yageo,Taiwan
213,Yang Ming Marine Transport Corporation,Taiwan
214,Yulon,Taiwan


In [174]:
def firm_diff_countries2(soup, country):
    firm_tables = soup.find("table", {"class":"wikitable sortable"})
    rows = firm_tables.find_all("tr")[1:]
    companies = []
    for row in rows:
        firm = row.find("td").get_text()
        firm = firm. replace("\n", "")
        companies.append(firm)
    companies = {"organization": companies, "country_firm": [country] * len(companies)}
    companies = pd.DataFrame(companies)
    return(companies)

# Taiwan
companies = firm_diff_countries2(soup = get_wiki_page("Taiwan"), country = "Taiwan")
df = pd.concat([df, companies])


In [169]:
# Thailand
companies = firm_diff_countries(soup = get_wiki_page("Thailand"), country = "Thailand")
df = pd.concat([df, companies])
df

Unnamed: 0,organization,country_firm
0,Air Algérie,Algeria
1,Air Express Algeria,Algeria
2,Algerian Petroleum Institute,Algeria
3,Algérie Ferries,Algeria
4,Antinea Airlines,Algeria
...,...,...
154,True Corporation,Thailand
155,UBIS,Thailand
156,Wellcom,Thailand
157,Workpoint Entertainment,Thailand


In [171]:
# Turkey
companies = firm_diff_countries(soup = get_wiki_page("Turkey"), country = "Turkey")
df = pd.concat([df, companies])
df

Unnamed: 0,organization,country_firm
0,Air Algérie,Algeria
1,Air Express Algeria,Algeria
2,Algerian Petroleum Institute,Algeria
3,Algérie Ferries,Algeria
4,Antinea Airlines,Algeria
...,...,...
110,Yapi Merkezi,Turkey
111,Yapi ve Kredi Bankasi,Turkey
112,Yüksel Tohumculuk,Turkey
113,Ziraat Bankasi,Turkey


In [201]:
soup = get_wiki_page("Russia")
firm_tables = soup.find_all("table", {"class":"wikitable"})
companies = []
for table in firm_tables[1:]:
    rows = table.find_all("tr")[1:]
    for row in rows:
        firm = row.find("td").get_text()
        firm = firm.replace("\n", "")
        companies.append(firm)
companies = {"organization": companies, "country_firm": ["Russia"] * len(companies)}
companies = pd.DataFrame(companies)
df = pd.concat([df, companies])
df

Unnamed: 0,organization,country_firm
0,Air Algérie,Algeria
1,Air Express Algeria,Algeria
2,Algerian Petroleum Institute,Algeria
3,Algérie Ferries,Algeria
4,Antinea Airlines,Algeria
...,...,...
181,X5 Retail Group,Russia
182,Yandex,Russia
183,Yuganskneftegaz,Russia
184,Zerich Capital Management,Russia


In [219]:
# Iran
soup = get_wiki_page("Iran")
rows = soup.find_all("table")[0].find_all("tr")[1:]
del rows[161]
companies = []
for row in rows:
    firm = row.find_all("td")[1].get_text().replace("*", "")
    companies.append(firm)
companies = {"organization": companies, "country_firm": ["Iran"] * len(companies)}
companies = pd.DataFrame(companies)
df = pd.concat([df, companies])
df

Unnamed: 0,organization,country_firm
0,Air Algérie,Algeria
1,Air Express Algeria,Algeria
2,Algerian Petroleum Institute,Algeria
3,Algérie Ferries,Algeria
4,Antinea Airlines,Algeria
...,...,...
254,IDRO Group,Iran
255,Iran Road Maintenance & Transportation Organiz...,Iran
256,Pars Wagon,Iran
257,"Iran Cultural Heritage, Handicrafts and Touris...",Iran


## Save the company list 

In [223]:
df.to_csv(path+"/Data/patent_data/non_US_firms.csv", index = False)
print("Dataset saved. Extracted", len(df), "non-US companies from Wikipedia.")

Dataset saved. Extracted 10143 non-US companies from Wikipedia.


#### Ressources

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all

https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722

**Setting up virtual environments for jupyter**
https://stackoverflow.com/questions/58068818/how-to-use-jupyter-notebooks-in-a-conda-environment
