In [1]:
# List the currently mounted dataset directory. Changes made under this
# directory are reverted automatically when the environment is reset.
!ls /home/aistudio/data

In [2]:
# List the personal work directory. Changes made under this directory are kept
# even after a reset; remove unnecessary files promptly so the environment
# does not load slowly.
!ls /home/aistudio/work

In [3]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, you need to use the persistence path as the following:
!mkdir /home/aistudio/external-libraries
!pip install BeautifulSoup4 -t /home/aistudio/external-libraries

Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting BeautifulSoup4
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 9.9MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.1 soupsieve-2.0.1


In [4]:
# Install the two HTML parser backends this notebook passes to BeautifulSoup
# via `features=` ("lxml" and "html5lib") into the persistent library path.
!pip install lxml -t /home/aistudio/external-libraries
!pip install html5lib -t /home/aistudio/external-libraries

Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting lxml
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/ba/39/0b5d76e64681243db516491bc449eff847d2708b465b60465b31ca13522e/lxml-4.5.1-cp37-cp37m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 53.5MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.1
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting html5lib
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl (112kB)
[K     |████████████████████████████████| 112kB 9.6MB/s eta 0:00:01
[?25hCollecting webencodings (from html5lib)
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl
Collecting six>=1.9 (from h

In [5]:
# Run this cell every time the environment (kernel) starts so that packages
# installed into the persistent directory can be imported.
import sys

# Guarded so that re-running the cell does not pile up duplicate entries.
if '/home/aistudio/external-libraries' not in sys.path:
    sys.path.append('/home/aistudio/external-libraries')

* AtCoder 是日本的编程竞赛平台，因为时差原因，比赛时间安排比 Codeforces 更加友好
* 北京化工大学 ACM 队员们也在 AtCoder 上参加了很多比赛
* AtCoder 并没有提供 API 数据接口，因此需要解析网页来爬取数据

### 用户 AtCoder 参赛记录页面截图
### 网址：[https://atcoder.jp/users/Trebleb/history](https://atcoder.jp/users/Trebleb/history)
![](https://ai-studio-static-online.cdn.bcebos.com/4aaa6c2c098347889059120383786208b6e7833f221d4f02bfad95d9d29a01ff)


In [9]:
# pip install BeautifulSoup4

# 在开始导入相关库
from bs4 import BeautifulSoup
import requests
import json, time

# 自定义函数显示html网页文本
def getUrlText(url, timeout=10, retry_delay=3, max_retries=None):
    """Download ``url`` with an HTTP GET and return the response body as text.

    A failed attempt is reported on stdout and retried after ``retry_delay``
    seconds. The HTTP status code is not inspected, so the body of an error
    page (e.g. a 404) is returned like any other response.

    Args:
        url: Address to fetch.
        timeout: Seconds ``requests`` may wait on the server before the
            attempt counts as failed. Without a timeout a stalled connection
            would block this function forever.
        retry_delay: Seconds to sleep between attempts.
        max_retries: Number of retries allowed after the first failure.
            ``None`` (the default) retries indefinitely, which is how this
            function has always behaved.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        Exception: The error of the last attempt, re-raised once
            ``max_retries`` is exhausted. Never raised when ``max_retries``
            is ``None``.
    """
    failures = 0
    while True:
        try:
            return requests.get(url, timeout=timeout).text
        # `Exception`, not a bare `except:`, so that KeyboardInterrupt and
        # SystemExit still break out of the retry loop.
        except Exception as err:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise

            if isinstance(err, requests.exceptions.ConnectionError):
                reason = 'ConnectionError'
            elif isinstance(err, requests.exceptions.ChunkedEncodingError):
                reason = 'ChunkedEncodingError'
            else:
                reason = 'Unfortunately -- An Unknown Error Happened (%s)' % type(err).__name__
            print('%s -- please wait %s seconds' % (reason, retry_delay))
            time.sleep(retry_delay)

# atcoder 
def _firstMatchText(cell, selector):
    """Return the text of the first element under ``cell`` matching ``selector``, or the cell's own text if nothing matches."""
    matches = cell.select(selector)
    return matches[0].text if matches else cell.text


def _parseACHistoryRow(tr):
    """Convert one ``<tr>`` of the history table into a record dict, or ``None`` if it is not a data row."""
    tds = tr.select('td')
    # The fields read below live in cells 0, 1, 2, 4 and 5. Rows without them
    # (e.g. a header row built from <th> cells) are not contest results.
    if len(tds) < 6:
        return None

    # Only the first child node of the diff cell is used. It is converted to a
    # plain str because a NavigableString keeps a reference to the whole
    # parse tree.
    diff_nodes = tds[5].contents
    first_node = diff_nodes[0] if diff_nodes else ''
    diff = str(first_node) if isinstance(first_node, str) else first_node.get_text()

    return {
        'date': _firstMatchText(tds[0], 'time'),
        'contest': _firstMatchText(tds[1], 'a'),
        'rank': _firstMatchText(tds[2], 'a'),
        # The rating is read from a <span> when the cell has one, otherwise
        # from the cell text itself.
        'newRating': _firstMatchText(tds[4], 'span'),
        'diff': diff,
    }


def getACUserData(acID):
    """Scrape the AtCoder contest history of the user ``acID``.

    Fetches ``https://atcoder.jp/users/<acID>/history`` and reads the rows of
    the element whose id is ``history``.

    Args:
        acID: AtCoder user name; inserted into the URL as-is.

    Returns:
        A list with one dict per contest row, in page order, with the string
        keys ``'date'``, ``'contest'``, ``'rank'``, ``'newRating'`` and
        ``'diff'``. All values are plain strings. The list is empty when the
        page has no ``#history`` element.
    """
    url = "https://atcoder.jp/users/" + acID + "/history"
    html = getUrlText(url)
    # html5lib is the more forgiving parser; features="lxml" is the faster
    # alternative.
    soup = BeautifulSoup(html, features="html5lib")

    tables = soup.select('#history')
    if not tables:
        return []

    data_list = []
    for tr in tables[0].select('tr'):
        record = _parseACHistoryRow(tr)
        if record is not None:
            data_list.append(record)
    return data_list

# Demo: scrape one user's contest history and print the resulting records.
if __name__ == "__main__":
    acID = "Trebleb"  # another account id to try: "a2018040538"
    dataList = getACUserData(acID)  # list of per-contest dicts
    print(dataList)

[{'date': '2019-11-24 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 146', 'rank': '3233', 'newRating': '24', 'diff': '-'}, {'date': '2019-12-22 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 148', 'rank': '3851', 'newRating': '64', 'diff': '+40'}, {'date': '2020-01-10 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 150', 'rank': '2534', 'newRating': '-', 'diff': '-'}, {'date': '2020-01-18 23:00:00+0900', 'contest': 'Keyence Programming Contest 2020', 'rank': '1503', 'newRating': '226', 'diff': '+162'}, {'date': '2020-01-19 22:30:00+0900', 'contest': 'AtCoder Beginner Contest 152', 'rank': '3611', 'newRating': '258', 'diff': '+32'}, {'date': '2020-01-26 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 153', 'rank': '1824', 'newRating': '418', 'diff': '+160'}, {'date': '2020-02-16 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 155', 'rank': '4369', 'newRating': '411', 'diff': '-7'}, {'date': '2020-02-22 22:40:00+0900', 'contest': 'AtCoder Beginner Contest 156'