In [25]:
import requests
import pandas as pd
from dataclasses import dataclass
# Root of the REST API; every resource endpoint is the root plus '<resource>/'.
main_url = 'https://vnprovinces.pythonanywhere.com/api/'
province_url, district_url, ward_url = (
    f'{main_url}{resource}/' for resource in ('provinces', 'districts', 'wards')
)

@dataclass
class Province:
    """Record type for one province.

    The field names match the columns of the province table this notebook
    builds and writes to 'province.csv'.
    """
    # Identifier of the province (the API's 'id' field).
    province_id: int
    # Province name (the API's 'name' field).
    province_name: str
    # Province name from the API's 'name_en' field.
    province_name_en: str
    # Readable type label; the province table stores 'province' or 'district' here.
    type: str

@dataclass
class District:
    """Record type for one district.

    The field names match the columns of the district table this notebook
    builds and writes to 'district.csv'.
    """
    # Identifier of the district (the API's 'id' field).
    district_id: int
    # Identifier of the province the district belongs to.
    province_id: int
    # District name; the district table fills this from the API's 'full_name'.
    district_name: str
    # District name from the API's 'name_en' field.
    district_name_en: str
    # Readable type label; the district table stores 'city', 'town', 'urban' or 'rural'.
    type: str

@dataclass
class Ward:
    """Record type for one ward.

    The field names match the columns of the ward table this notebook
    builds and writes to 'ward.csv'.
    """
    # Identifier of the ward (the API's 'id' field).
    ward_id: int
    # Identifier of the district the ward belongs to.
    district_id: int
    # Identifier of the province the ward belongs to.
    province_id: int
    # Ward name; the ward table fills this from the API's 'full_name'.
    ward_name: str
    # Ward name from the API's 'name_en' field.
    ward_name_en: str
    # Readable type label; the ward table stores 'ward', 'commune' or 'town'.
    type: str

## Crawl province

In [18]:
# Fetch the provinces in one request; 'limit' is set high enough that the
# notebook does not page through this endpoint.
params = {
    'limit': 100
}
response = requests.get(province_url, params=params, timeout=30)
# Surface HTTP errors here instead of as a confusing KeyError on 'results'.
response.raise_for_status()
request = response.json()
province_json = request['results']

# Keep only the columns we export and give them province-specific names.
# rename() returns a new frame, so the cell can be re-run without relying on
# in-place mutation of an earlier result.
province_df = pd.DataFrame(province_json)
province_df = province_df[['id', 'name', 'name_en', 'type']].rename(
    columns={'id': 'province_id', 'name': 'province_name', 'name_en': 'province_name_en'}
)
def change_province_type(type):
    """Translate a province type code into a readable label.

    Args:
        type: type code taken from the API record.

    Returns:
        'province' when the code equals 'P', otherwise 'district'.
    """
    # NOTE(review): every non-'P' code is labelled 'district'; confirm this is
    # the intended label for the other province-level unit types.
    if type == 'P':
        return 'province'
    return 'district'
# Replace each type code with its readable label, then display the table.
province_df['type'] = province_df['type'].map(change_province_type)
province_df

Unnamed: 0,province_id,province_name,province_name_en,type
0,89,An Giang,An Giang,province
1,77,Bà Rịa - Vũng Tàu,Ba Ria - Vung Tau,province
2,24,Bắc Giang,Bac Giang,province
3,6,Bắc Kạn,Bac Kan,province
4,95,Bạc Liêu,Bac Lieu,province
...,...,...,...,...
58,66,Đắk Lắk,Dak Lak,province
59,67,Đắk Nông,Dak Nong,province
60,11,Điện Biên,Dien Bien,province
61,75,Đồng Nai,Dong Nai,province


In [49]:
# Write the province table to 'province.csv' in the working directory;
# index=False leaves the DataFrame's row index out of the file.
province_df.to_csv('province.csv', index=False)

## Crawl district

In [42]:
def pre_processing_district(district):
    """Normalise one district record from the API, in place.

    Args:
        district: dict with at least the keys 'name', 'full_name', 'type'
            and 'province' (a dict that has an 'id' key).

    Returns:
        The same dict object, modified as follows:
        * 'name' is replaced by 'full_name' when the short name is a single
          character or consists only of digits (e.g. "1", "10");
        * 'province_id' is added, copied from district['province']['id'];
        * 'type' is replaced by a readable label: 'C' -> 'city',
          'T' -> 'town', 'UD' -> 'urban', anything else -> 'rural'.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    short_name = district['name']
    # A bare number is not a usable name on its own. Checking isdigit() as
    # well as the length covers multi-digit names such as "10" or "12".
    if len(short_name) == 1 or short_name.isdigit():
        district['name'] = district['full_name']

    # Flatten the nested province reference into a plain foreign-key column.
    district['province_id'] = district['province']['id']

    # Any code not listed here falls back to 'rural'.
    type_labels = {'C': 'city', 'T': 'town', 'UD': 'urban'}
    district['type'] = type_labels.get(district['type'], 'rural')
    return district
# Page through the districts endpoint, keeping one DataFrame per page.
district_dfs = []
index = 1  # page number sent as the 'page' query parameter
while True:
    params = {
        'limit': 100,
        'page': index
    }
    request = requests.get(district_url, params=params, timeout=30)
    # Surface HTTP errors here instead of as a confusing KeyError below.
    request.raise_for_status()
    payload = request.json()  # decode the body once and reuse it
    district_json = list(map(pre_processing_district, payload['results']))
    district_dfs.append(pd.DataFrame(district_json))
    index += 1
    # A 'next' value of None is treated as "this was the last page". Reading
    # it from the payload avoids a global named `next`, which would shadow
    # the builtin for the rest of the kernel session.
    if payload['next'] is None:
        break

# ignore_index gives the combined frame one continuous row index instead of
# repeating each page's 0..99 labels.
district_df = pd.concat(district_dfs, ignore_index=True)
# Keep only the exported columns and give them district-specific names;
# rename() returns a new frame rather than mutating in place.
district_df = district_df[['id', 'province_id', 'full_name', 'name_en', 'type']].rename(
    columns={'id': 'district_id', 'full_name': 'district_name', 'name_en': 'district_name_en'}
)
district_df

In [50]:
# Write the district table to 'district.csv' in the working directory;
# index=False leaves the DataFrame's row index out of the file.
district_df.to_csv('district.csv', index=False)

## Crawl ward

In [47]:
def pre_processing_ward(ward):
    """Normalise one ward record from the API, in place.

    Args:
        ward: dict with at least the keys 'name', 'full_name', 'type',
            'district' and 'province' (the last two are dicts that have an
            'id' key).

    Returns:
        The same dict object, modified as follows:
        * 'name' is replaced by 'full_name' when the short name is a single
          character or consists only of digits (e.g. "1", "10");
        * 'district_id' and 'province_id' are added, copied from the nested
          'district' and 'province' dicts;
        * 'type' is replaced by a readable label: 'W' -> 'ward',
          'C' -> 'commune', anything else -> 'town'.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    short_name = ward['name']
    # A bare number is not a usable name on its own. Checking isdigit() as
    # well as the length covers multi-digit names such as "10" or "12".
    if len(short_name) == 1 or short_name.isdigit():
        ward['name'] = ward['full_name']

    # Flatten the nested parent references into plain foreign-key columns.
    ward['district_id'] = ward['district']['id']
    ward['province_id'] = ward['province']['id']

    # Any code not listed here falls back to 'town'.
    type_labels = {'W': 'ward', 'C': 'commune'}
    ward['type'] = type_labels.get(ward['type'], 'town')
    return ward
# Page through the wards endpoint, keeping one DataFrame per page.
ward_dfs = []
index = 1  # page number sent as the 'page' query parameter
while True:
    params = {
        'limit': 100,
        'page': index
    }
    request = requests.get(ward_url, params=params, timeout=30)
    # Surface HTTP errors here instead of as a confusing KeyError below.
    request.raise_for_status()
    payload = request.json()  # decode the body once and reuse it
    ward_json = list(map(pre_processing_ward, payload['results']))
    ward_dfs.append(pd.DataFrame(ward_json))
    index += 1
    # A 'next' value of None is treated as "this was the last page". Reading
    # it from the payload avoids a global named `next`, which would shadow
    # the builtin for the rest of the kernel session.
    if payload['next'] is None:
        break

In [51]:
# Combine the per-page frames; ignore_index gives the result one continuous
# row index instead of repeating each page's labels.
ward_df = pd.concat(ward_dfs, ignore_index=True)
# Keep only the exported columns and give them ward-specific names;
# rename() returns a new frame rather than mutating in place.
ward_df = ward_df[['id', 'district_id', 'province_id', 'full_name', 'name_en', 'type']].rename(
    columns={'id': 'ward_id', 'full_name': 'ward_name', 'name_en': 'ward_name_en'}
)
# Write the ward table to 'ward.csv'; index=False leaves the row index out.
ward_df.to_csv('ward.csv', index=False)
