In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
# Load the company table and collect the names of the companies whose
# headquarter country is China.
data = pd.read_csv('RR_company.csv', encoding='latin-1')
is_china = data['headquarter_country'] == 'China'
cn = data.loc[is_china, 'name'].tolist()

Some names are followed by their former names in parentheses, which can make the web search fail, so everything from the first opening parenthesis onward is removed.

In [None]:
# Cut every company name at its first '(' so that a parenthesised former name
# is not typed into the search box later on.
for i, name in enumerate(cn):
  # A missing name is read by pandas as float NaN; skip anything that is not
  # a string instead of failing on the `in` test.
  if isinstance(name, str) and '(' in name:
    # rstrip() drops the blank that preceded the '(' so the cleaned name
    # does not end with stray whitespace.
    cn[i] = name[:name.find('(')].rstrip()

print(cn)

### Start scraping

In [None]:
### uncomment the commands below if selenium is not installed
# !pip install selenium
# !apt-get update 
# !apt install chromium-chromedriver

from selenium import webdriver

# Chrome flags shared by every browser started in this notebook.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Start a single browser instead of two: each webdriver.Chrome() call launches
# its own Chrome process, and neither instance was used or quit afterwards.
# `options=` replaces the deprecated `chrome_options=` keyword and matches the
# way the scraping loops below create their drivers.
driver = webdriver.Chrome('chromedriver', options=chrome_options)
# Keep the second name available as an alias of the same browser.
wd = driver

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |▍                               | 10kB 24.2MB/s eta 0:00:01[K     |▊                               | 20kB 23.9MB/s eta 0:00:01[K     |█                               | 30kB 12.1MB/s eta 0:00:01[K     |█▌                              | 40kB 9.7MB/s eta 0:00:01[K     |█▉                              | 51kB 8.0MB/s eta 0:00:01[K     |██▏                             | 61kB 7.6MB/s eta 0:00:01[K     |██▌                             | 71kB 8.5MB/s eta 0:00:01[K     |███                             | 81kB 9.3MB/s eta 0:00:01[K     |███▎                            | 92kB 8.1MB/s eta 0:00:01[K     |███▋                            | 102kB 7.6MB/s eta 0:00:01[K     |████                            | 112kB 7.6MB/s eta 0:00:01[K     |████▍                           | 122kB 7.6M

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Alternative Chrome option set.
# NOTE(review): no driver in the visible notebook is created with `options`;
# every webdriver.Chrome() call passes `chrome_options` instead.
options = Options()
for flag in ("start-maximized", "disable-infobars", "--disable-extensions"):
    options.add_argument(flag)

Because this notebook runs on Google Colab and the scraping is time-consuming, the list of names is split into chunks and the result of each chunk is saved separately. That way a runtime limit or another interruption only costs the chunk in progress.

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has length ``n`` except possibly the last one, which holds
    the remaining items. An empty ``lst`` produces no slices.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)
# Materialise the batches of 100 names so they can be indexed and sliced later.
name_chunks = [batch for batch in chunks(cn, 100)]

In [None]:
# Look up a code for every company name through the Yahoo Finance search box,
# one chunk at a time, and save each finished chunk to 'chunk_<cnt>.xlsx'.
# The loop starts at the 10th chunk (index 9) and numbers its output files
# from 10; keep `cnt` and the slice start in sync when resuming elsewhere.
cnt = 10
for chunk in name_chunks[9:]:
  dict_code = {}
  for name in chunk:
    # A fresh browser per company, always closed in `finally`.
    driver = webdriver.Chrome('chromedriver',options=chrome_options)
    try:
      driver.get('https://finance.yahoo.com/')

      # The search-box wait is inside the try block as well: a timeout here
      # used to abort the whole run and drop the partially filled chunk.
      WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//input[@name='yfin-usr-qry']"))).send_keys(name)
      yahoo_fin_auto_suggestions = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@id="header-search-form"]/div[2]/div[1]/div/ul[2]/li')))
      # Only the first 9 characters of the first suggestion are kept.
      # NOTE(review): this fits codes such as '600518.SS'; other suggestion
      # texts end up as fragments (e.g. 'Edited Tr' in the saved output).
      code = yahoo_fin_auto_suggestions[0].text[:9]
      dict_code[name] = code
      print(code)
    except Exception as exc:
      # `Exception` rather than a bare except, so that interrupting the cell
      # still stops it instead of being recorded as a failed lookup.
      dict_code[name] = 'Code Not Found'
      print('Lookup failed for {}: {}'.format(name, type(exc).__name__))
    finally:
      driver.quit()
  # One row per company: the name is the index, 'code' the only column.
  df = pd.DataFrame(dict_code, index=[0]).T
  df.columns = ['code']
  df.to_excel('chunk_{}.xlsx'.format(cnt))
  cnt += 1


### Clean scraped data

In [None]:
# Read the 16 per-chunk result files and stack them into one table.
# The frames are collected in a list and concatenated once: DataFrame.append
# is deprecated (removed in pandas 2.0) and copies the whole frame per call.
frames = []
for i in range(1, 17):
  tmp = pd.read_excel('chunk_{}.xlsx'.format(i))
  tmp.columns = ['company_name','code']
  frames.append(tmp)
df = pd.concat(frames, ignore_index=True)
df

Unnamed: 0,company_name,code
0,Kangmei Pharmaceutical Co Ltd,600518.SS
1,Jiangsu Xiuqiang Glasswork Co Ltd,300160.SZ
2,Henan Dayou Energy Co Ltd,600403.SS
3,Joyvio Agriculture Development Co Ltd,300268.SZ
4,AAC Technologies Holdings Inc,Edited Tr
...,...,...
1560,Beijing Honggao Creative Architectural Design ...,002504.SZ
1561,Jiangling Motors Corp Ltd,Code Not Found
1562,Leshi Internet Information & Technology Corp,Code Not Found
1563,AutoNavi Holdings Ltd,Code Not Found


### Scrape from Google Finance

In [None]:
import time # WebDriverWait is actually a much better alternative but this does the job

In [None]:
# Second pass over the saved Yahoo results: codes that already end in 'SZ' or
# 'SS' are kept, every other company is searched on Google Finance. Results
# are written per chunk to 'new_chunk_<i>.xlsx'.
for i in range(1,17):
  tmp_df = pd.read_excel('chunk_{}.xlsx'.format(i))
  tmp_df.columns = ['company_name','code']
  dict_code = {}
  for company, code in zip(tmp_df['company_name'], tmp_df['code']):
    # str() guards against empty cells, which read_excel returns as float NaN.
    if str(code).endswith(('SZ', 'SS')):
      dict_code[company] = code
      continue

    # A browser is opened only for companies that need a lookup, and it is the
    # browser opened here (not a stale one from an earlier row) that is closed.
    driver = None
    try:
      driver = webdriver.Chrome('chromedriver',options=chrome_options)
      driver.get('https://google.com/finance')
      search = driver.find_element(By.XPATH, '//*[@id="search-bar"]')
      search.send_keys(company)
      search.send_keys(Keys.RETURN)
      # Fixed pause to let the page navigate after submitting the search.
      time.sleep(1.5)
      # The code is whatever follows the first 37 characters of the URL.
      # NOTE(review): presumably 37 is the length of the result-page URL
      # prefix; confirm against the current Google Finance URL format.
      new_code = driver.current_url[37:]
      dict_code[company] = new_code
      print(new_code)
    except Exception as exc:
      # `Exception` rather than a bare except so the cell can be interrupted.
      dict_code[company] = 'Not Found'
      print('Lookup failed for {}: {}'.format(company, type(exc).__name__))
    finally:
      if driver is not None:
        driver.quit()
  # One row per company: the name is the index, 'code' the only column.
  df = pd.DataFrame(dict_code, index=[0]).T
  df.columns = ['code']
  df.to_excel('new_chunk_{}.xlsx'.format(i))
  
  

In [None]:
# Stack the 16 'new_chunk_<i>.xlsx' files into one table with a fresh 0..n-1
# index. A single pd.concat replaces the repeated DataFrame.append calls,
# which are deprecated (removed in pandas 2.0) and copy the frame each time.
frames = [pd.read_excel('new_chunk_{}.xlsx'.format(i)) for i in range(1, 17)]
df = pd.concat(frames, ignore_index=True)

In [None]:
# Persist the combined table (the row index is written as the first column).
df.to_excel(excel_writer='full_rr_company.xlsx')

Companies that still have no code (not found on either Yahoo Finance or Google Finance) need more effort. I tried scraping MarketScreener for them as well, but about 200 companies still could not be matched with a code, and I decided that this is good enough for now.<br>
Below is the code that was used for scraping MarketScreener.

In [None]:
# Reload the combined scraping result from disk.
scraped_comp = pd.read_excel(io='full_rr_company.xlsx')
# scraped_comp = scraped_comp[['company','code']].set_index('company')

In [None]:
# dict_code = scraped_comp.to_dict()['code']
# dict_code

In [None]:
# NOTE(review): disabled cell, kept for reference. It counts the entries of
# `dict_code` whose code is not a string. The last line refers to `scrape`,
# which is not defined anywhere in the visible notebook, so it would raise a
# NameError if the cell were re-enabled as-is.
# cnt=0
# for comp, code in dict_code.items():
#   if type(code)!=str :
#     cnt+=1
# print('There are still {} companies with no code'.format(cnt))
# scrape.head()

There are still 270 companies with no code


In [None]:
# NOTE(review): disabled MarketScreener lookup, kept for reference. Points to
# fix before re-enabling:
#   * Shanghai listings get the suffix '.SH' here, while the earlier cells
#     work with '.SS' (e.g. '600518.SS'), so the codes would not be consistent.
#   * The browser opened for each company is never closed (no driver.quit()).
#   * The bare `except: continue` skips every failure without reporting it.
# cnt = 0
# for comp, code in dict_code.items():
#   cnt += 1
#   print(cnt)
#   if type(code)!=str:
#     try:
#       print(comp)
#       driver = webdriver.Chrome('chromedriver',options=chrome_options)
#       driver.get('https://www.marketscreener.com/')  
#       search = driver.find_element_by_xpath('//*[@id="autocomplete"]')
#       search.send_keys(comp)
#       search.send_keys(Keys.RETURN)
#       result = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,'//*[@id="ALNI0"]/tbody/tr[2]/td[1]')))
#       new_code = result.text
#       exc = driver.find_element_by_xpath('//*[@id="ALNI0"]/tbody/tr[2]/td[7]').text
#       if exc.lower()[:5] == 'shang':
#         new_code = new_code+'.SH'
#       elif exc.lower()[:5] == 'shenz':
#         new_code = new_code+'.SZ'
#       else:
#         new_code = new_code +'.'+ exc
#       dict_code[comp] = new_code
#       print(new_code)
#     except:
#       continue

### Add the scraped code as a new column to the original RepRisk data

In [None]:
# Attach the scraped codes to the rows of the companies headquartered in China
# and save the result.
# The codes are matched to the rows purely by position, so `scraped_comp` must
# hold exactly one row per Chinese company, in the same order as in `data`.
# .assign() builds a new frame instead of writing into a filtered slice of
# `data`, which is what triggered pandas' SettingWithCopyWarning.
rr_company_cn = (
    data.loc[data['headquarter_country'] == 'China']
    .assign(code=scraped_comp['code'].tolist())
)
rr_company_cn.to_csv('RepRisk_Chinese_Companies_info.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Get intersection

In [None]:
# Column of scraped codes, used for the overlap count below.
code = scraped_comp.loc[:, 'code']

In [None]:
# Load the SynTao table and pull its `code` column out as a plain list.
syntao = pd.read_excel(io='SynTao_clean.xlsx')
st_code = syntao['code'].tolist()
def intersection(lst1, lst2): 
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is preserved and repeated items are kept. Lookups
    go through a set, so the cost is linear in the total number of items
    rather than ``len(lst1) * len(lst2)``.
    """
    candidates = list(lst2)
    try:
        lookup = set(candidates)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable items (e.g. lists) cannot be placed in or tested against
        # a set; fall back to the plain sequence scan.
        return [value for value in lst1 if value in candidates]
# Report how many scraped codes also appear in the SynTao code list.
n_shared = len(intersection(code, st_code))
print('There are {} companies that both RepRisk and Syntao include that can be used for further comparison'.format(n_shared))

There are 384 companies that both RepRisk and Syntao include that can be used for further comparison
