# 1. Thư viện và đường dẫn:

**1.1. Import thư viện:**

In [1]:
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
from requests_html import HTMLSession

from datetime import date, timedelta
import datetime
from datetime import timezone
import time

import pandas as pd

from pathlib import Path
import os
import json
import threading

**1.2. Tạo đường dẫn nếu chưa tồn tại:**

In [2]:
def createPathIfNotExist(path):
    isExist = os.path.exists(path)
    if not isExist:
        os.makedirs(path)

**1.3. Đường dẫn chứa file notebook và thư mục data:**

In [3]:
notebookPath = os.getcwd()
dataPath = f'{notebookPath}/data'

createPathIfNotExist(notebookPath)
createPathIfNotExist(dataPath)

# 2. Các biến hỗ trợ:

**2.1. Định dạng ngày giờ:**

In [4]:
datetime_format = '%Y-%m-%d %H:%M:%S'

**2.2. Header cơ bản cho các lần request API:**

In [5]:
headers = {
    'Accepts': 'application/json',
    'X-CMC_PRO_API_KEY': '66879404-05fb-433b-b1b8-c6b2620f733c' # Đăng ký tài khoản để được cấp API KEY
}

**2.3. HTTP response status codes:**

- [HTTP response status codes by mozilla.org](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status)

In [6]:
http_status_code = {
    100 : 'Continue\nThis interim response indicates that the client should continue the request or ignore the response if the request is already finished.',
    101 : 'Switching Protocols\nThis code is sent in response to an Upgrade request header from the client and indicates the protocol the server is switching to.',
    102 : 'Processing (WebDAV)\nThis code indicates that the server has received and is processing the request, but no response is available yet.',
    103 : 'Early Hints\nThis status code is primarily intended to be used with the Link header, letting the user agent start preloading resources while the server prepares a response.',
    200 : 'OK\nThe request succeeded. The result meaning of "success" depends on the HTTP method:\n- GET: The resource has been fetched and transmitted in the message body.\n- HEAD: The representation headers are included in the response without any message body.\n- PUT or POST: The resource describing the result of the action is transmitted in the message body.\n- TRACE: The message body contains the request message as received by the server.',
    201 : 'Created\nThe request succeeded, and a new resource was created as a result. This is typically the response sent after POST requests, or some PUT requests.',
    202 : 'Accepted\nThe request has been received but not yet acted upon. It is noncommittal, since there is no way in HTTP to later send an asynchronous response indicating the outcome of the request. It is intended for cases where another process or server handles the request, or for batch processing.',
    203 : 'Non-Authoritative Information\nThis response code means the returned metadata is not exactly the same as is available from the origin server, but is collected from a local or a third-party copy. This is mostly used for mirrors or backups of another resource. Except for that specific case, the 200 OK response is preferred to this status.',
    204 : 'No Content\nThere is no content to send for this request, but the headers may be useful. The user agent may update its cached headers for this resource with the new ones.',
    205 : 'Reset Content\nTells the user agent to reset the document which sent this request.',
    206 : 'Partial Content\nThis response code is used when the Range header is sent from the client to request only part of a resource.',
    207 : 'Multi-Status (WebDAV)\nConveys information about multiple resources, for situations where multiple status codes might be appropriate.',
    208 : 'Already Reported (WebDAV)\nUsed inside a <dav:propstat> response element to avoid repeatedly enumerating the internal members of multiple bindings to the same collection.',
    226 : 'IM Used (HTTP Delta encoding)\nThe server has fulfilled a GET request for the resource, and the response is a representation of the result of one or more instance-manipulations applied to the current instance.',
    300 : 'Multiple Choices\nThe request has more than one possible response. The user agent or user should choose one of them. (There is no standardized way of choosing one of the responses, but HTML links to the possibilities are recommended so the user can pick.)',
    301 : 'Moved Permanently\nThe URL of the requested resource has been changed permanently. The new URL is given in the response.',
    302 : 'Found\nThis response code means that the URI of requested resource has been changed temporarily. Further changes in the URI might be made in the future. Therefore, this same URI should be used by the client in future requests.',
    303 : 'See Other\nThe server sent this response to direct the client to get the requested resource at another URI with a GET request.',
    304 : 'Not Modified\nThis is used for caching purposes. It tells the client that the response has not been modified, so the client can continue to use the same cached version of the response.',
    305 : 'Use Proxy Deprecated\nDefined in a previous version of the HTTP specification to indicate that a requested response must be accessed by a proxy. It has been deprecated due to security concerns regarding in-band configuration of a proxy.',
    306 : 'unused\nThis response code is no longer used; it is just reserved. It was used in a previous version of the HTTP/1.1 specification.',
    307 : 'Temporary Redirect\nThe server sends this response to direct the client to get the requested resource at another URI with same method that was used in the prior request. This has the same semantics as the 302 Found HTTP response code, with the exception that the user agent must not change the HTTP method used: if a POST was used in the first request, a POST must be used in the second request.',
    308 : 'Permanent Redirect\nThis means that the resource is now permanently located at another URI, specified by the Location: HTTP Response header. This has the same semantics as the 301 Moved Permanently HTTP response code, with the exception that the user agent must not change the HTTP method used: if a POST was used in the first request, a POST must be used in the second request.',
    400 : 'Bad Request\nThe server cannot or will not process the request due to something that is perceived to be a client error (e.g., malformed request syntax, invalid request message framing, or deceptive request routing).',
    401 : 'Unauthorized\nAlthough the HTTP standard specifies "unauthorized", semantically this response means "unauthenticated". That is, the client must authenticate itself to get the requested response.',
    402 : 'Payment Required Experimental\nThis response code is reserved for future use. The initial aim for creating this code was using it for digital payment systems, however this status code is used very rarely and no standard convention exists.',
    403 : "Forbidden\nThe client does not have access rights to the content; that is, it is unauthorized, so the server is refusing to give the requested resource. Unlike 401 Unauthorized, the client's identity is known to the server.",
    404 : 'Not Found\nThe server cannot find the requested resource. In the browser, this means the URL is not recognized. In an API, this can also mean that the endpoint is valid but the resource itself does not exist. Servers may also send this response instead of 403 Forbidden to hide the existence of a resource from an unauthorized client. This response code is probably the most well known due to its frequent occurrence on the web.',
    405 : 'Method Not Allowed\nThe request method is known by the server but is not supported by the target resource. For example, an API may not allow calling DELETE to remove a resource.',
    406 : "Not Acceptable\nThis response is sent when the web server, after performing server-driven content negotiation, doesn't find any content that conforms to the criteria given by the user agent.",
    407 : 'Proxy Authentication Required\nThis is similar to 401 Unauthorized but authentication is needed to be done by a proxy.',
    408 : 'Request Timeout\nThis response is sent on an idle connection by some servers, even without any previous request by the client. It means that the server would like to shut down this unused connection. This response is used much more since some browsers, like Chrome, Firefox 27+, or IE9, use HTTP pre-connection mechanisms to speed up surfing. Also note that some servers merely shut down the connection without sending this message.',
    409 : 'Conflict\nThis response is sent when a request conflicts with the current state of the server.',
    410 : 'Gone\nThis response is sent when the requested content has been permanently deleted from server, with no forwarding address. Clients are expected to remove their caches and links to the resource. The HTTP specification intends this status code to be used for "limited-time, promotional services". APIs should not feel compelled to indicate resources that have been deleted with this status code.',
    411 : 'Length Required\nServer rejected the request because the Content-Length header field is not defined and the server requires it.',
    412 : 'Precondition Failed\nThe client has indicated preconditions in its headers which the server does not meet.',
    413 : 'Payload Too Large\nRequest entity is larger than limits defined by server. The server might close the connection or return an Retry-After header field.',
    414 : 'URI Too Long\nThe URI requested by the client is longer than the server is willing to interpret.',
    415 : 'Unsupported Media Type\nThe media format of the requested data is not supported by the server, so the server is rejecting the request.',
    416 : "Range Not Satisfiable\nThe range specified by the Range header field in the request cannot be fulfilled. It's possible that the range is outside the size of the target URI's data.",
    417 : 'Expectation Failed\nThis response code means the expectation indicated by the Expect request header field cannot be met by the server.',
    418 : "I'm a teapot\nThe server refuses the attempt to brew coffee with a teapot.",
    421 : 'Misdirected Request\nThe request was directed at a server that is not able to produce a response. This can be sent by a server that is not configured to produce responses for the combination of scheme and authority that are included in the request URI.',
    422 : 'Unprocessable Entity (WebDAV)\nThe request was well-formed but was unable to be followed due to semantic errors.',
    423 : 'Locked (WebDAV)\nThe resource that is being accessed is locked.',
    424 : 'Failed Dependency (WebDAV)\nThe request failed due to failure of a previous request.',
    425 : 'Too Early Experimental\nIndicates that the server is unwilling to risk processing a request that might be replayed.',
    426 : 'Upgrade Required\nThe server refuses to perform the request using the current protocol but might be willing to do so after the client upgrades to a different protocol. The server sends an Upgrade header in a 426 response to indicate the required protocol(s).',
    428 : "Precondition Required\nThe origin server requires the request to be conditional. This response is intended to prevent the 'lost update' problem, where a client GETs a resource's state, modifies it and PUTs it back to the server, when meanwhile a third party has modified the state on the server, leading to a conflict.",
    429 : 'Too Many Requests\nThe user has sent too many requests in a given amount of time ("rate limiting").',
    431 : 'Request Header Fields Too Large\nThe server is unwilling to process the request because its header fields are too large. The request may be resubmitted after reducing the size of the request header fields.',
    451 : 'Unavailable For Legal Reasons\nThe user agent requested a resource that cannot legally be provided, such as a web page censored by a government.',
    500 : 'Internal Server Error\nThe server has encountered a situation it does not know how to handle.',
    501 : 'Not Implemented\nThe request method is not supported by the server and cannot be handled. The only methods that servers are required to support (and therefore that must not return this code) are GET and HEAD.',
    502 : 'Bad Gateway\nThis error response means that the server, while working as a gateway to get a response needed to handle the request, got an invalid response.',
    503 : 'Service Unavailable\nThe server is not ready to handle the request. Common causes are a server that is down for maintenance or that is overloaded. Note that together with this response, a user-friendly page explaining the problem should be sent. This response should be used for temporary conditions and the Retry-After HTTP header should, if possible, contain the estimated time before the recovery of the service. The webmaster must also take care about the caching-related headers that are sent along with this response, as these temporary condition responses should usually not be cached.',
    504 : 'Gateway Timeout\nThis error response is given when the server is acting as a gateway and cannot get a response in time.',
    505 : 'HTTP Version Not Supported\nThe HTTP version used in the request is not supported by the server.',
    506 : 'Variant Also Negotiates\nThe server has an internal configuration error: the chosen variant resource is configured to engage in transparent content negotiation itself, and is therefore not a proper end point in the negotiation process.',
    507 : 'Insufficient Storage (WebDAV)\nThe method could not be performed on the resource because the server is unable to store the representation needed to successfully complete the request.',
    508 : 'Loop Detected (WebDAV)\nThe server detected an infinite loop while processing the request.',
    510 : 'Not Extended\nFurther extensions to the request are required for the server to fulfill it.',
    511 : 'Network Authentication Required\nIndicates that the client needs to authenticate to gain network access.'
}

# 3. Các hàm hỗ trợ:

**3.1. Ghi từ điển vào file json:**

In [7]:
def writeDictToJSONFile(filePath, dict_data):
    with open(filePath, "w") as f:
        f.write(json.dumps(dict_data, indent=4))

**3.2. Append data to json file:**

In [8]:
def appendToJSONFile(filePath, data, appendType='append', appendKeyChain=[]):
    success = True
    support_types = ['append', 'extend']
    if appendType not in support_types:
        success = False
        print(f'appendToJSONFile: appendType is not in {support_types}')
    else:
        try:
            with open(filePath,'r+') as file:
                # Đọc dữ liệu hiện có trong file
                file_data = json.load(file)
                
                # Dynamically Evaluate Expressions in Python
                expression = ""
                for key in appendKeyChain:
                    expression = f'{expression}[{key}]'
                expression = f'file_data{expression}.{appendType}(data)'
                eval(expression)
                print('New data length: ', len(file_data))
                        
                # Ghi dữ liệu mới vào file
                file.seek(0)
                json.dump(file_data, file, indent = 4)
            success = True
        except:
            success = False
            print(f"appendToJSONFile: can't write to file")

    return success

**3.3. In từ điển thành các dòng 'key' : 'value':**

In [9]:
def printDict(dict_data):
    for key in dict_data.keys():
        print(key, ' : ', dict_data[key], '\n')

**3.4. Hàm gọi API:**

In [10]:
def getData(str_rootURL, dict_headers, dict_parameters):
    session = Session()
    session.headers.update(dict_headers)
    
    status_code = None
    data = None
    
    try:
        response = session.get(str_rootURL, params=dict_parameters)
        data = json.loads(response.text)
        status_code = response.status_code
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        status_code = response.status_code
        print(e)

    return {
        'status_code' : status_code,
        'json_data' : data
    }

**3.5. Chuyển ngày giờ dạng chuỗi thành UTC timestamp:**

In [11]:
def datetimeStringToUTCTimestamp(str_datetime, datetime_format):
    timestamp = datetime.datetime.strptime(str_datetime, datetime_format)
    timestamp = int(timestamp.replace(tzinfo=timezone.utc).timestamp())
    return timestamp

**3.6. Cộng nDayToAdd ngày (chấp nhận mọi số thực) vào một ngày giờ dạng chuỗi cho trước:**

In [12]:
def addDayToDateString(str_date, nDayToAdd, datetime_format):
    nDay = timedelta(days=nDayToAdd)
    result = datetime.datetime.strptime(str_date, datetime_format) + nDay
    result = datetime.datetime.strftime(result, datetime_format)
    return result

**3.7. Phân lô một danh sách thành các lô có nElemPerBatch phần tử:**

In [13]:
def getBatchList(list_dataToBatch, nElemPerBatch):
    batchList = []
    
    filled = 0
    nElem = len(list_dataToBatch)
    
    if nElem % nElemPerBatch != 0:
        batchList.append(list_dataToBatch[0 : nElem % nElemPerBatch])
        filled = nElem % nElemPerBatch

    for i in range(0, nElem - filled, nElemPerBatch):
        batchList.append(list_dataToBatch[
            filled + i : 
            filled + i + nElemPerBatch
        ])
    return batchList

# 4. Thu thập dữ liệu:

**4.1. CoinMarketCap ID Map:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyMap)
- Returns a mapping of all cryptocurrencies to unique CoinMarketCap ids.

4.1.1. API query parameters:

- Phần này chỉ để ghi chú vào file, không truy cập trong code.

In [14]:
coinmarketcap_id_map_query_parameters = {
    'listing_status' : 'string\nDefault: "active"\nOnly active cryptocurrencies are returned by default. Pass inactive to get a list of cryptocurrencies that are no longer active. Pass untracked to get a list of cryptocurrencies that are listed but do not yet meet methodology requirements to have tracked markets available. You may pass one or more comma-separated values.',
    'start' : 'integer >= 1\nDefault: 1\nOptionally offset the start (1-based index) of the paginated list of items to return.',
    'limit' : 'integer [ 1 .. 5000 ]\nOptionally specify the number of results to return. Use this parameter and the "start" parameter to determine your own pagination size.',
    'sort' : 'string\nDefault: "id"\n"cmc_rank" "id"\nWhat field to sort the list of cryptocurrencies by.',
    'symbol' : 'string\nOptionally pass a comma-separated list of cryptocurrency symbols to return CoinMarketCap IDs for. If this option is passed, other options will be ignored.',
    'aux' : 'string\nDefault: "platform,first_historical_data,last_historical_data,is_active"\nOptionally specify a comma-separated list of supplemental data fields to return. Pass platform,first_historical_data,last_historical_data,is_active,status to include all auxiliary fields.'
}

filename = 'coinmarketcap-id-map-query-parameters.json'
filePath = f"{dataPath}/{filename}"
writeDictToJSONFile(filePath, coinmarketcap_id_map_query_parameters)

**4.1. CoinMarketCap ID Map:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyMap)
- Returns a mapping of all cryptocurrencies to unique CoinMarketCap ids.

4.1.1. API query parameters:

4.1.2. Data Schema:
- Phần này chỉ để ghi chú vào file, không truy cập trong code.

In [15]:
coinmarketcap_id_map_schema = {
    'id' : 'integer\nThe unique cryptocurrency ID for this cryptocurrency.',
    'name' : 'string\nThe name of this cryptocurrency.',
    'symbol' : 'string\nThe ticker symbol for this cryptocurrency, always in all caps.',
    'slug' : 'string\nThe web URL friendly shorthand version of this cryptocurrency name.',
    'is_active' : 'integer [ 0 .. 1 ]\n1 if this cryptocurrency has at least 1 active market currently being tracked by the platform, otherwise 0. A value of 1 is analogous with listing_status=active.',
    'status' : 'string\n"active" "inactive" "untracked"\nThe listing status of the cryptocurrency. This field is only returned if requested through the aux request parameter.',
    'first_historical_data' : 'string <date>\nTimestamp (ISO 8601) of the date this cryptocurrency was first available on the platform.',
    'last_historical_data' : "string <date>\nTimestamp (ISO 8601) of the last time this cryptocurrency's market data was updated.",
    'platform' : [
        'Metadata about the parent cryptocurrency platform this cryptocurrency belongs to if it is a token, otherwise null.',
        {
            'id' : 'integer\nThe unique CoinMarketCap ID for the parent platform cryptocurrency.',
            'name' : 'string\nThe name of the parent platform cryptocurrency.',
            'symbol' : 'string\nThe ticker symbol for the parent platform cryptocurrency.',
            'slug' : 'string\nThe web URL friendly shorthand version of the parent platform cryptocurrency name.',
            'token_address' : 'string\nThe token address on the parent platform cryptocurrency.'
        }
    ]
}

filename = 'coinmarketcap-id-map-schema.json'
filePath = f"{dataPath}/{filename}"
writeDictToJSONFile(filePath, coinmarketcap_id_map_schema)

**4.1. CoinMarketCap ID Map:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyMap)
- Returns a mapping of all cryptocurrencies to unique CoinMarketCap ids.

4.1.1. API query parameters:

4.1.2. Data Schema:

4.1.3. Lấy dữ liệu từ API:

In [16]:

url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/map'

parameters = {
    'listing_status' : 'active', # Only active cryptocurrencies are returned
    'sort' : 'id', # What field to sort the list of cryptocurrencies by
}

nCoin = 1
filename = 'coinmarketcap-id-map.json'

response = getData(url, headers, parameters)
if response['status_code'] == 200:
    filePath = f"{dataPath}/{filename}"
    writeDictToJSONFile(
        filePath,
        response['json_data']['data']
    )

    nCoin = len(response['json_data']['data'])
    print('The number of coins:', nCoin)
    print(filePath)
else:
    status_code = response['status_code']
    message = http_status_code[status_code] if status_code in http_status_code.keys() else 'Unknown'
    print(f"{status_code} : {message}")

df = pd.read_json(f'{dataPath}/coinmarketcap-id-map.json')
print(df.shape)
df.head()

The number of coins: 8979
/Users/4rr311/Documents/VectorA/KHTN/Nam3/NMKHDL/Project01/Project-1/crawl-data/data/coinmarketcap-id-map.json
(8979, 10)


Unnamed: 0,id,name,symbol,slug,rank,displayTV,is_active,first_historical_data,last_historical_data,platform
0,1,Bitcoin,BTC,bitcoin,1,1,1,2013-04-28T18:47:21.000Z,2022-12-14T06:39:00.000Z,
1,2,Litecoin,LTC,litecoin,13,1,1,2013-04-28T18:47:22.000Z,2022-12-14T06:39:00.000Z,
2,3,Namecoin,NMC,namecoin,611,1,1,2013-04-28T18:47:22.000Z,2022-12-14T06:39:00.000Z,
3,4,Terracoin,TRC,terracoin,1665,1,1,2013-04-28T18:47:22.000Z,2022-12-14T06:39:00.000Z,
4,5,Peercoin,PPC,peercoin,697,1,1,2013-04-28T18:47:23.000Z,2022-12-14T06:39:00.000Z,


**4.2. Listings Latest:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyListingsLatest)
- Returns a paginated list of all active cryptocurrencies with latest market data.

4.2.1. API query parameters:
- Phần này chỉ để ghi chú vào file, không truy cập trong code.

In [17]:
listings_latest_query_parameters = {
    'start' : 'integer >= 1\nDefault: 1\nOptionally offset the start (1-based index) of the paginated list of items to return.',
    'limit' : 'integer [ 1 .. 5000 ]\nDefault: 100\nOptionally specify the number of results to return. Use this parameter and the "start" parameter to determine your own pagination size.',
    'price_min' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of minimum USD price to filter results by.',
    'price_max' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of maximum USD price to filter results by.',
    'market_cap_min' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of minimum market cap to filter results by.',
    'market_cap_max' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of maximum market cap to filter results by.',
    'volume_24h_min' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of minimum 24 hour USD volume to filter results by.',
    'volume_24h_max' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of maximum 24 hour USD volume to filter results by.',
    'circulating_supply_min' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of minimum circulating supply to filter results by.',
    'circulating_supply_max' : 'number [ 0 .. 100000000000000000 ]\nOptionally specify a threshold of maximum circulating supply to filter results by.',
    'percent_change_24h_min' : 'number >= -100\nOptionally specify a threshold of minimum 24 hour percent change to filter results by.',
    'percent_change_24h_max' : 'number >= -100\nOptionally specify a threshold of maximum 24 hour percent change to filter results by.',
    'convert' : 'string\nOptionally calculate market quotes in up to 120 currencies at once by passing a comma-separated list of cryptocurrency or fiat currency symbols. Each additional convert option beyond the first requires an additional call credit. A list of supported fiat options can be found here. Each conversion is returned in its own "quote" object.',
    'convert_id' : 'string\nOptionally calculate market quotes by CoinMarketCap ID instead of symbol. This option is identical to convert outside of ID format. Ex: convert_id=1,2781 would replace convert=BTC,USD in your query. This parameter cannot be used when convert is used.',
    'sort' : 'string\n"Default: market_cap"\n"name" "symbol" "date_added" "market_cap" "market_cap_strict" "price" "circulating_supply" "total_supply" "max_supply" "num_market_pairs" "volume_24h" "percent_change_1h" "percent_change_24h" "percent_change_7d" "market_cap_by_total_supply_strict" "volume_7d" "volume_30d"\nWhat field to sort the list of cryptocurrencies by.',
    'sort_dir' : 'string\nValid values: "asc" "desc"\nThe direction in which to order cryptocurrencies against the specified sort.',
    'cryptocurrency_type' : 'string\nDefault: "all"\n"all" "coins" "tokens"\nThe type of cryptocurrency to include.',
    'tag' : 'string\nDefault: "all"\n"all" "defi" "filesharing"\nThe tag of cryptocurrency to include.',
    'aux' : 'string\n"num_market_pairs,cmc_rank,date_added,tags,platform,max_supply,circulating_supply,total_supply"\nOptionally specify a comma-separated list of supplemental data fields to return. Pass num_market_pairs,cmc_rank,date_added,tags,platform,max_supply,circulating_supply,total_supply,market_cap_by_total_supply,volume_24h_reported,volume_7d,volume_7d_reported,volume_30d,volume_30d_reported,is_market_cap_included_in_calc to include all auxiliary fields.'
}

filename = 'listings-latest-query-parameters.json'
filePath = f"{dataPath}/{filename}"
writeDictToJSONFile(filePath, listings_latest_query_parameters)

**4.2. Listings Latest:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyListingsLatest)
- Returns a paginated list of all active cryptocurrencies with latest market data.

4.2.1. API query parameters:

4.2.2. Data Schema:
- Phần này chỉ để ghi chú vào file, không truy cập trong code.

In [18]:
listings_latest_schema = {
    'id' : 'integer\nThe unique CoinMarketCap ID for this cryptocurrency.',
    'name' : 'string\nThe name of this cryptocurrency.',
    'symbol' : 'string\nThe ticker symbol for this cryptocurrency.',
    'slug' : 'string\nThe web URL friendly shorthand version of this cryptocurrency name.',
    'cmc_rank' : "integer\nThe cryptocurrency's CoinMarketCap rank by market cap.",
    'num_market_pairs' : 'integer\nThe number of active trading pairs available for this cryptocurrency across supported exchanges.',
    'circulating_supply' : 'number\nThe approximate number of coins circulating for this cryptocurrency.',
    'total_supply' : 'number\nThe approximate total amount of coins in existence right now (minus any coins that have been verifiably burned).',
    'market_cap_by_total_supply' : 'number\nThe market cap by total supply. This field is only returned if requested through the aux request parameter.',
    'max_supply' : 'number\nThe expected maximum limit of coins ever to be available for this cryptocurrency.',
    'last_updated' : "string <date>\nTimestamp (ISO 8601) of the last time this cryptocurrency's market data was updated.",
    'date_added' : 'string <date>\nTimestamp (ISO 8601) of when this cryptocurrency was added to CoinMarketCap.',
    'tags' : 'Array of tags associated with this cryptocurrency. Currently only a mineable tag will be returned if the cryptocurrency is mineable. Additional tags will be returned in the future.',
    'self_reported_circulating_supply' : 'number\nThe self reported number of coins circulating for this cryptocurrency.',
    'self_reported_market_cap' : 'number\nThe self reported market cap for this cryptocurrency.',
    'tvl_ratio' : 'number\nPercentage of Total Value Locked',
    'platform' : [
        'Metadata about the parent cryptocurrency platform this cryptocurrency belongs to if it is a token, otherwise null.',
        {
            'id' : 'integer\nThe unique CoinMarketCap ID for the parent platform cryptocurrency.',
            'name' : 'string\nThe name of the parent platform cryptocurrency.',
            'symbol' : 'string\nThe ticker symbol for the parent platform cryptocurrency.',
            'slug' : 'string\nThe web URL friendly shorthand version of the parent platform cryptocurrency name.',
            'token_address' : 'string\nThe token address on the parent platform cryptocurrency.',
        }
    ],
    'quote' : [
        'A map of market quotes in different currency conversions. The default map included is USD.',
        {
            '$key' : [
                'A market quote in the currency conversion option.',
                {
                    'price' : 'number\nPrice in the specified currency for this historical.',
                    'volume_24h' : 'number\nRolling 24 hour adjusted volume in the specified currency.',
                    'volume_change_24h' : 'number\n24 hour change in the specified currencies volume.',
                    'volume_24h_reported' : 'number\nRolling 24 hour reported volume in the specified currency. This field is only returned if requested through the aux request parameter.',
                    'volume_7d' : 'number\nRolling 7 day adjusted volume in the specified currency. This field is only returned if requested through the aux request parameter.',
                    'volume_7d_reported' : 'number\nRolling 7 day reported volume in the specified currency. This field is only returned if requested through the aux request parameter.',
                    'volume_30d' : 'number\nRolling 30 day adjusted volume in the specified currency. This field is only returned if requested through the aux request parameter.',
                    'volume_30d_reported' : 'number\nRolling 30 day reported volume in the specified currency. This field is only returned if requested through the aux request parameter.',
                    'market_cap' : 'number\nMarket cap in the specified currency.',
                    'market_cap_dominance' : 'number\nMarket cap dominance in the specified currency.',
                    'fully_diluted_market_cap' : 'number\nFully diluted market cap in the specified currency.',
                    'tvl' : 'number\nTotal Value Locked',
                    'percent_change_1h' : 'number\n1 hour change in the specified currency.',
                    'percent_change_24h' : 'number\n24 hour change in the specified currency.',
                    'percent_change_7d' : 'number\n7 day change in the specified currency.',
                    'last_updated' : "string <date>\nTimestamp (ISO 8601) of when the conversion currency's current value was referenced.",
                }
            ]
        }
    ]
}

filename = 'listings-latest-schema.json'
filePath = f"{dataPath}/{filename}"
writeDictToJSONFile(filePath, listings_latest_schema)

**4.2. Listings Latest:**

- [Documentation](https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyListingsLatest)
- Returns a paginated list of all active cryptocurrencies with latest market data.

4.2.1. API query parameters:

4.2.2. Data Schema:

4.2.3. Lấy dữ liệu từ API:

In [19]:
url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'

nCoin = 0
with open(f'{dataPath}/coinmarketcap-id-map.json') as f:
    nCoin = len(json.load(f))

limit = 5000
start_limit_map = []

if nCoin % limit != 0:
    start_limit_map.append({
        'start' : 1,
        'limit' : nCoin % limit
    })
    nCoin = nCoin - nCoin % limit

for i in range(len(start_limit_map) + 1, nCoin + len(start_limit_map), limit):
    start_limit_map.append({
        'start' : i,
        'limit' : limit
    })

filename = 'listings-latest.json'
filePath = f"{dataPath}/{filename}"

with open(filePath, 'w') as file:
    json.dump(json.loads('[]'), file, indent=4)

for pair in start_limit_map:
    parameters = {
        'start' : pair['start'],
        'limit' : pair['limit']
    }

    response = getData(url, headers, parameters)

    if response['status_code'] == 200:
        appendToJSONFile(
            filePath,
            response['json_data']['data'],
            appendType='extend'
        )

        print('The number of coins appended:', len(response['json_data']['data']))
        print(filePath)
    else:
        status_code = response['status_code']
        message = http_status_code[status_code] if status_code in http_status_code.keys() else 'Unknown'
        print(f"{status_code} : {message}")

df = pd.read_json(f'{dataPath}/listings-latest.json')
print(df.shape)
df.head()

New data length:  3979
The number of coins appended: 3979
/Users/4rr311/Documents/VectorA/KHTN/Nam3/NMKHDL/Project01/Project-1/crawl-data/data/listings-latest.json
New data length:  8979
The number of coins appended: 5000
/Users/4rr311/Documents/VectorA/KHTN/Nam3/NMKHDL/Project01/Project-1/crawl-data/data/listings-latest.json
(8979, 17)


Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,tags,max_supply,circulating_supply,total_supply,platform,cmc_rank,self_reported_circulating_supply,self_reported_market_cap,tvl_ratio,last_updated,quote
0,1,Bitcoin,BTC,bitcoin,9897,2013-04-28T00:00:00.000Z,"[mineable, pow, sha-256, store-of-value, state...",21000000.0,19233410.0,19233410.0,,1,,,,2022-12-14T06:45:00.000Z,"{'USD': {'price': 17779.233487116337, 'volume_..."
1,1027,Ethereum,ETH,ethereum,6288,2015-08-07T00:00:00.000Z,"[pos, smart-contracts, ethereum-ecosystem, coi...",,122373900.0,122373900.0,,2,,,,2022-12-14T06:45:00.000Z,"{'USD': {'price': 1320.9851695450443, 'volume_..."
2,825,Tether,USDT,tether,44720,2015-02-25T00:00:00.000Z,"[payments, stablecoin, asset-backed-stablecoin...",,65792060000.0,73141770000.0,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ET...",3,,,,2022-12-14T06:45:00.000Z,"{'USD': {'price': 0.9999928850042591, 'volume_..."
3,3408,USD Coin,USDC,usd-coin,8985,2018-10-08T00:00:00.000Z,"[medium-of-exchange, stablecoin, asset-backed-...",,45038720000.0,45038720000.0,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ET...",4,,,,2022-12-14T06:45:00.000Z,"{'USD': {'price': 1.000067297253004, 'volume_2..."
4,1839,BNB,BNB,bnb,1153,2017-07-25T00:00:00.000Z,"[marketplace, centralized-exchange, payments, ...",200000000.0,159967900.0,159980000.0,,5,,,,2022-12-14T06:45:00.000Z,"{'USD': {'price': 275.22462870775934, 'volume_..."


**4.3. Coin Historical Data:**
- Dữ liệu này không có API chính thức cho tài khoản CoinMarketCap free.
- Xem thể hiện của dữ liệu khi đưa lên web [tại đây](https://coinmarketcap.com/currencies/bitcoin/historical-data/).
- API được nhận thấy và lấy từ việc phân tích các gói tin mà trang web gửi trong quá trình load trang web.

4.3.1. Root URL có dạng: https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical

4.3.2. Query Parameters:
- id        : [coin-id]
- convertId : 2781 (Fiat Money. Theo tài liệu API, [tiền định danh](https://coinmarketcap.com/api/documentation/v1/#tag/fiat) mã 2781 là đồng USD)
- timeStart : [UTC timestamp]
- timeEnd   : [UTC timestamp]

4.3.3. Ví dụ một cách gọi API: https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1661817600&timeEnd=1664409600

4.3.4. Các hàm hỗ trợ:
- 4.3.4.1. Hàm lấy danh sách toàn bộ active coin id:

In [20]:
def getCoinIDList():
    idList = []
    
    id_map_path = f'{dataPath}/coinmarketcap-id-map.json'
    id_map = []

    with open(id_map_path, 'r') as f:
        id_map = json.load(f)

    for i in id_map:
        idList.append(i['id'])
    
    return idList

- 4.3.4.2. Hàm lấy danh sách URLs 12 tháng trong 1 năm của một coin id:

In [21]:
def getHistoricalURLs(coinID, year):
    url_list = []
    rootURL = 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
    for i in range(1, 13):
        str_timeStart = f'{year}-{0 if i < 10 else ""}{i}-01 00:00:00'
        str_timeEnd = f'{year}-{0 if i + 1 < 10 else ""}{i + 1}-01 00:00:00'
        
        if i == 12:
            str_timeEnd = f'{year + 1}-01-01 00:00:00'
        str_timeEnd = addDayToDateString(str_timeEnd, -1, datetime_format)

        timeStart = datetimeStringToUTCTimestamp(str_timeStart, datetime_format)
        timeEnd = datetimeStringToUTCTimestamp(str_timeEnd, datetime_format)
        
        url = f'{rootURL}?id={coinID}&convertId={2781}&timeStart={timeStart}&timeEnd={timeEnd}'
        url_list.append(
            {
                'url' : url,
                'str_startTime' : str_timeStart,
                'str_endTime' : str_timeEnd
            }
        )
    return url_list

- 4.3.4.3. Hàm lấy lịch sử từ một url và ghi vào file ở đường dẫn coinIDPath:

In [22]:
def getHistory(urlInfo, coinIDPath):
    filename = f"{urlInfo['str_startTime'].split(' ')[0]}.json"
    filepath = f'{coinIDPath}/{filename}'

    # print(f"Getting   {urlInfo['url']}")
    try:
        session = HTMLSession()
        r = session.get(urlInfo['url'])
        with open(filepath, 'w') as file:
            json.dump(r.json()['data'], file, indent=4)
    except:
        # print(f"Can't get {urlInfo['url']}")
        pass

- 4.3.4.4. Hàm lấy danh sách các coin id bị thiếu dữ liệu do 'rate limiting' lúc crawl trong một năm:

In [23]:
def getUncompletedCoinID(year):
    historicalDataPath = f'{dataPath}/historical-data/{year}'
    p = Path(historicalDataPath)
    folders = [f for f in p.iterdir() if f.is_dir()]
    uncompletedCoinID = set({})
    for folder in folders:
        files = folder.iterdir()
        for file in files:
            fileSize = os.path.getsize(file)
            if fileSize == 0:
            # if fileSize <= 5:
                filepath = str(file)
                # print(filepath)
                coin_id_index = filepath.index('coin-id-')
                filepath = filepath[coin_id_index:-(len('yyyy-mm-dd.json') + 1)]
                uncompletedCoinID.add(filepath.split('-')[2])
    return list(uncompletedCoinID)

- 4.3.4.5. Hàm cập nhật file danh sách các coin id bị thiếu dữ liệu do 'rate limiting' lúc crawl trong một năm:

In [24]:
def writeUncompletedCoinID(year):
    historicalDataPath = f'{dataPath}/historical-data/{year}'
    filename = 'uncompleted-coin-ids.json'
    with open(f'{historicalDataPath}/{filename}', 'w') as file:
        json.dump(getUncompletedCoinID(year), file)

- 4.3.4.6. Hàm lấy lịch sử các id trong idList của một năm và ghi vào đường dẫn tương ứng:

In [25]:
# session = HTMLSession()
def getCoinHistoricalData(idList, year, rateLimiting=False, sleepTime=0, idPerPeriod=160):
    historicalDataPath = f'{dataPath}/historical-data/{year}'
    countID = 0
    for coinID in idList:
        coinIDPath = f'{historicalDataPath}/coin-id-{coinID}'
        createPathIfNotExist(coinIDPath)

        nURLPerBatch = 7
        url_list = getHistoricalURLs(coinID, year)
        urlBatchList = getBatchList(url_list, nURLPerBatch)
        

        # Tránh rate limiting do request quá nhiều
        if (countID > idPerPeriod and rateLimiting):
            uncompletedCoinIDs = getUncompletedCoinID(year)
            writeUncompletedCoinID(year)
            print('Số coin id bị thiếu dữ liệu:', len(uncompletedCoinIDs))
            time.sleep(sleepTime)
            countID = 0

        countMonth = 0
        countID = countID + 1
        
        for urlBatch in urlBatchList:
            threadBatch = []
            for urlInfo in urlBatch:
                countMonth = countMonth + 1
                print(countID, countMonth, urlInfo['url'])

                threadBatch.insert(
                    0, 
                    threading.Thread(target=getHistory, args=(urlInfo, coinIDPath, ))
                )

                threadBatch[0].start()
                
            for i in range(0, len(threadBatch)):
                threadBatch[i].join()
            # time.sleep(60)

**4.3. Coin Historical Data:**

4.3.4. Các hàm hỗ trợ:

4.3.5. Lấy Coin Historical Data:

Các cell được comment lại có thể không cần chạy do thời gian chạy khá lâu nhưng dữ liệu lại không phục vụ cho quá trình phân tích:

- 4.3.5.1. Lấy dữ liệu toàn bộ coin id trong năm 2021:

In [26]:
# year = 2021
# sleepTime = 115 # Thời gian sleep giữa các đợt crawl
# idPerPeriod = 165

# idList = getCoinIDList()
# getCoinHistoricalData(
#     idList, year,
#     rateLimiting=True, sleepTime=sleepTime,
#     idPerPeriod=idPerPeriod
# )

Thực hiện bước bên dưới cho đến khi không còn coin id nào bị thiếu dữ liệu, hoặc đến khi chỉ còn ít hơn 10 coin id bị thiếu dữ liệu:
- 4.3.5.2. Lấy dữ liệu còn thiếu của các coin id trong năm 2021:

In [27]:
# year = 2021
# uncompletedCoinIDs = getUncompletedCoinID(year)
# writeUncompletedCoinID(year)
# print('Số coin id bị thiếu dữ liệu:', len(uncompletedCoinIDs))

# sleepTime = 115 # Thời gian sleep giữa các đợt crawl
# idPerPeriod = 165
# getCoinHistoricalData(
#     uncompletedCoinIDs, year, 
#     rateLimiting=True, sleepTime=sleepTime,
#     idPerPeriod=idPerPeriod
# )

- 4.3.5.3. Lấy dữ liệu của BitCoin các năm 2013 đến 2021:

In [28]:
sleepTime = 115 # Thời gian sleep giữa các đợt crawl
idPerPeriod = 165

for year in list(range(2013, 2021 + 1)):
    getCoinHistoricalData(
        [1], year, 
        rateLimiting=True, sleepTime=sleepTime,
        idPerPeriod=idPerPeriod
    )

1 1 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1356998400&timeEnd=1359590400
1 2 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1359676800&timeEnd=1362009600
1 3 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1362096000&timeEnd=1364688000
1 4 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1364774400&timeEnd=1367280000
1 5 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1367366400&timeEnd=1369958400
1 6 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1370044800&timeEnd=1372550400
1 7 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1372636800&timeEnd=1375228800
1 8 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1

- 4.3.5.4. Lấy thêm dữ liệu của BitCoin năm 2022:

In [29]:
sleepTime = 115 # Thời gian sleep giữa các đợt crawl
idPerPeriod = 165

for year in [2022]:
    getCoinHistoricalData(
        [1], year, 
        rateLimiting=True, sleepTime=sleepTime,
        idPerPeriod=idPerPeriod
    )

1 1 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1640995200&timeEnd=1643587200
1 2 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1643673600&timeEnd=1646006400
1 3 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1646092800&timeEnd=1648684800
1 4 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1648771200&timeEnd=1651276800
1 5 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1651363200&timeEnd=1653955200
1 6 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1654041600&timeEnd=1656547200
1 7 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1&convertId=2781&timeStart=1656633600&timeEnd=1659225600
1 8 https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?id=1