In [2]:
import requests
import pdfplumber
from openpyxl import Workbook
import pandas as pd
import cpca
from chinese_province_city_area_mapper import drawers
from IPython.display import IFrame

In [3]:
# Download the CSRC 2019 Q4 industry-classification PDF
pdf_2019q4 = 'http://www.csrc.gov.cn/pub/newsite/scb/ssgshyfljg/202001/W020200110325952653089.pdf'
# The context manager releases the streamed connection when done;
# the timeout prevents the cell from hanging forever on a dead server.
with requests.get(pdf_2019q4, stream=True, timeout=30) as r:
    # Fail fast on HTTP errors instead of saving an error page as a ".pdf"
    r.raise_for_status()
    with open("CSRC_2019q4.pdf", "wb") as pdf:
        # Iterating the response object directly yields tiny 128-byte chunks;
        # request a larger chunk size explicitly.
        for content in r.iter_content(chunk_size=8192):
            pdf.write(content)

In [4]:
# Extract every table row from the PDF and write it to an Excel worksheet
wb = Workbook()
ws = wb.active
with pdfplumber.open("CSRC_2019q4.pdf") as pdf:
    # Flatten pages -> tables -> rows, preserving the document order
    all_rows = (
        row
        for page in pdf.pages
        for table in page.extract_tables()
        for row in table
    )
    for row in all_rows:
        ws.append(row)
wb.save("CSRC_2019q4.xlsx")

In [None]:
# 使用 Stata 数据整理
# * ssc install nrow, replace
# * ssc install carryforward, replace

# import excel using "CSRC_2019q4.xlsx", clear
# duplicates drop
# nrow
# carryforward _all,replace
# gen 行业门类与大类 = ustrregexs(0) + 行业大类代码 if ustrregexm(门类名称及代码,"[A-Z]") == 1
# rename (上市公司代码 上市公司简称 行业门类与大类 门类名称及代码) (Stkcd Stknme Nnindnme Nnindcd)
# drop 行业大类代码 行业大类名称
# lab var Stkcd "上市公司代码"
# lab var Stknme "上市公司简称"
# lab var Nnindcd "行业代码"
# lab var Nnindnme "行业名称"
# save "CSRC_2019q4_ok.dta", replace

# import excel using "IPO_Cobasic.xlsx", firstrow clear
# duplicates drop Stkcd, force 
# lab var Listdt "上市时间"
# lab var Estbdt "成立时间"
# lab var Regadd "注册地址"
# save "IPO_Cobasic.dta", replace

# use "CSRC_2019q4_ok.dta", clear
# merge 1:1 Stkcd using "IPO_Cobasic.dta", keep(3) nogen
# save "CSRC_2019q4_ok.dta", replace




In [13]:
# Read the cleaned data (the .dta file saved by the Stata steps) into a DataFrame
df = pd.read_stata("CSRC_2019q4_ok.dta")

In [14]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,Nnindnme,Stkcd,Stknme,Nnindcd,Listdt,Estbdt,Regadd,Agri_m
0,金融业(J),1,平安银行,J66,1991-04-03,1987-12-22,广东省深圳市深南中路178号深圳发展银行大厦,0.0
1,房地产业(K),2,万科A,K70,1991-01-29,1984-05-30,广东省深圳市罗湖区翠竹北水贝二路27号,0.0
2,制造业(C),4,国农科技,C27,1991-01-14,1986-05-05,广东省深圳市南山区蛇口公园路4号青少年活动中心B座三楼,0.0
3,水利、环境和公共\n设施管理业(N),5,世纪星源,N77,1990-12-10,1987-07-23,广东省深圳市人民南路发展中心大厦13楼,0.0
4,房地产业(K),6,深振业A,K70,1992-04-27,1989-05-25,广东省深圳市宝安南路振业大厦29-32层,0.0


In [15]:
# Split each registered address into province / city / district / street address
location = cpca.transform(df["Regadd"])

In [16]:
# Preview the first rows of the parsed addresses
location.head()

Unnamed: 0,省,市,区,地址,adcode
0,广东省,深圳市,,发展银行大厦,440300
1,广东省,深圳市,罗湖区,翠竹北水贝二路27号,440303
2,广东省,深圳市,南山区,蛇口公园路4号青少年活动中心B座三楼,440305
3,广东省,深圳市,,人民南路发展中心大厦13楼,440300
4,广东省,深圳市,,宝安南路振业大厦29-32层,440300


In [17]:
# Plot the parsed locations
# NOTE(review): presumably writes an HTML map to "location.html" — confirm against the drawers API
drawers.draw_locations(location, "location.html")

In [47]:
# Embed the generated HTML page in the notebook output
IFrame(src='location.html', width=1000, height=800)

In [21]:
# Agricultural-product processing firms: keep the rows flagged with Agri_m == 1
agri = df.loc[df["Agri_m"] == 1]
agri.head()

Unnamed: 0,Nnindnme,Stkcd,Stknme,Nnindcd,Listdt,Estbdt,Regadd,Agri_m
37,制造业(C),48,康达尔,C13,1994-11-01,1994-09-21,广东省深圳市福田区滨河大道下步庙东侧,1.0
96,制造业(C),488,晨鸣纸业,C22,2000-11-20,1993-05-05,山东省寿光市圣城路595号,1.0
102,制造业(C),505,京粮控股,C13,1992-12-21,1992-01-11,海南省海口经昆北路2号龙珠城1栋,1.0
105,制造业(C),509,华塑控股,C29,1993-05-07,1990-03-01,四川省南充市涪江路117号,1.0
120,制造业(C),529,广弘控股,C13,1993-11-18,1992-07-05,广东省鹤山市沙坪镇人民西路40号,1.0


In [23]:
# Split the registered addresses of the agricultural-processing firms
agri_location = cpca.transform(agri["Regadd"])

In [24]:
# Plot the parsed locations of the agricultural-processing firms
# NOTE(review): presumably writes an HTML map to "agri_location.html" — confirm against the drawers API
drawers.draw_locations(agri_location, "agri_location.html")

In [25]:
# Embed the generated HTML page in the notebook output
IFrame(src='agri_location.html', width=1000, height=800)

In [29]:
# Query the AMap (Gaode) geocoding API to obtain longitude/latitude for an address
import getpass
import json
import os

import requests

# Fixed part of the API URL (everything before the "?")
url = 'https://restapi.amap.com/v3/geocode/geo'
# Never hard-code the API key in the notebook: read it from the AMAP_KEY
# environment variable, or prompt for it when the variable is not set.
# NOTE(review): the key that was previously committed here should be revoked.
amap_key = os.environ.get('AMAP_KEY') or getpass.getpass('AMap Web-service API key: ')
# Query-string parameters sent with the request
params = {'key': amap_key,
          'address': '张家港市锦丰镇三兴沿江公路',
          'city': '张家港市'}

# The timeout prevents the cell from hanging on an unresponsive server;
# raise_for_status surfaces HTTP-level failures immediately.
res = requests.get(url, params=params, timeout=10)
res.raise_for_status()
res.text

'{"status":"1","info":"OK","infocode":"10000","count":"1","geocodes":[{"formatted_address":"江苏省苏州市张家港市沿江公路","country":"中国","province":"江苏省","citycode":"0512","city":"苏州市","district":"张家港市","township":[],"neighborhood":{"name":[],"type":[]},"building":{"name":[],"type":[]},"adcode":"320582","street":[],"number":[],"location":"120.667326,31.980671","level":"道路"}]}'

In [30]:
# The response body is JSON text; parse it into a Python dict
jd = json.loads(res.text)

In [31]:
# Display the parsed response
jd

{'status': '1',
 'info': 'OK',
 'infocode': '10000',
 'count': '1',
 'geocodes': [{'formatted_address': '江苏省苏州市张家港市沿江公路',
   'country': '中国',
   'province': '江苏省',
   'citycode': '0512',
   'city': '苏州市',
   'district': '张家港市',
   'township': [],
   'neighborhood': {'name': [], 'type': []},
   'building': {'name': [], 'type': []},
   'adcode': '320582',
   'street': [],
   'number': [],
   'location': '120.667326,31.980671',
   'level': '道路'}]}

In [32]:
# Longitude/latitude of the first geocoding match
geocodes = jd.get('geocodes') or []
# A failed request (status != '1') or an address with no match would otherwise
# surface as an opaque KeyError/IndexError; report the API's own message instead.
if jd.get('status') != '1' or not geocodes:
    raise ValueError(f"AMap geocoding returned no result: info={jd.get('info')!r}")
coords = geocodes[0]['location']

In [33]:
# Display the extracted coordinate string
coords

'120.667326,31.980671'