In [13]:
import pandas as pd
from datetime import datetime as dt
import requests
import unicodedata


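# Disabled download step: when uncommented, it fetches the source CSV and
# saves it as COVID-19.csv in the working directory.
# # URL of the CSV
# csv_url = "https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv"

# try:
#     # Send an HTTP GET request to the URL
#     r = requests.get(csv_url)
#     # Save the response body as COVID-19.csv
#     with open("COVID-19.csv", "wb") as f:
#         f.write(r.content)
# except requests.exceptions.RequestException as err:
#     print(err)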

In [14]:
# Mapping from the CSV's Japanese headers to the English column names used
# after renaming.
changed_column_name = {
    # "通し": "id",  (disabled, so this column is neither loaded nor renamed)
    "年代": "age",
    "性別": "gender",
    "確定日": "confirmed_date",
    "発症日": "onset_date",
    "受診都道府県": "examination_prefecture",
    "居住都道府県": "current_prefecture",
    "X": "lon",
    "Y": "lat",
}

# Headers to load from the CSV: exactly the keys of the mapping above, in the
# same order, so the two can never drift apart.
column_names = list(changed_column_name)

In [15]:
# Read only the headers listed in column_names from the CSV.
df = pd.read_csv("COVID-19.csv", usecols=column_names)
# Swap the Japanese headers for the English names in changed_column_name.
rename_df = df.rename(changed_column_name, axis="columns")
rename_df

Unnamed: 0,age,gender,confirmed_date,onset_date,examination_prefecture,current_prefecture,lon,lat
0,30,男性,1/15/2020,1/3/2020,神奈川県,神奈川県,139.642347,35.447504
1,40,男性,1/24/2020,1/14/2020,東京都,中華人民共和国,116.409685,39.903832
2,30,女性,1/25/2020,1/21/2020,東京都,中華人民共和国,116.409685,39.903832
3,40,男性,1/26/2020,1/23/2020,愛知県,中華人民共和国,116.409685,39.903832
4,40,男性,1/28/2020,1/22/2020,愛知県,中華人民共和国,116.409685,39.903832
...,...,...,...,...,...,...,...,...
133169,30,男性,11/30/2020,,長野県,長野県,138.412868,36.744750
133170,70,女性,11/30/2020,11/29/2020,長野県,長野県,138.440643,36.922904
133171,70,女性,11/30/2020,11/25/2020,長野県,長野県,137.851094,36.503017
133172,50,男性,11/30/2020,11/30/2020,長野県,長野県,137.851094,36.503017


In [16]:
from pytz import timezone
from datetime import datetime

In [17]:
# Peek at the first confirmed date to see the raw text format.
rename_df.loc[0, "confirmed_date"]

'1/15/2020'

In [18]:
# localtz.localize(datetime.fromisoformat(rename_df["onset_date"][0].isoformat()))

In [19]:
import unicodedata
def add_time_zone(s: str, timezone=timezone('Asia/Tokyo')):
    """Parse a ``month/day/year`` date string into a timezone-aware datetime.

    Args:
        s: Date text in ``%m/%d/%Y`` form, e.g. ``"1/15/2020"``. Surrounding
            whitespace is ignored.
        timezone: Object whose ``localize`` method attaches a zone to a naive
            datetime. The default is built once, when the function is
            defined, as ``timezone('Asia/Tokyo')``. Inside the function this
            parameter shadows the imported ``timezone`` callable.

    Returns:
        The result of ``timezone.localize`` for midnight of the parsed date,
        or ``""`` when ``s`` is blank or a missing value (None / NaN).

    Raises:
        ValueError: If non-blank text does not match ``%m/%d/%Y``.
    """
    # A missing cell arrives as None or NaN when the caller has not filled it.
    # NaN is the only value here that is unequal to itself, so `s != s`
    # detects it without an extra import.
    if s is None or s != s:
        return ""
    s = s.strip()
    if not s:
        return ""
    # strptime already yields a naive datetime, so it can be localized as is.
    return timezone.localize(datetime.strptime(s, "%m/%d/%Y"))

def normalize_text(s: str):
    """Return ``s`` with surrounding whitespace removed and NFKC-normalized.

    The whitespace is stripped first; the trimmed text is then passed through
    ``unicodedata.normalize`` with the ``NFKC`` form.
    """
    trimmed = s.strip()
    return unicodedata.normalize("NFKC", trimmed)

In [25]:
# Rebuild from the raw frame so that re-running this cell always starts from
# the same input, and blank out every missing value up front. Filling here
# makes a separate fill of onset_date unnecessary.
rename_df = df.rename(columns=changed_column_name).fillna("")

# Trim and NFKC-normalize every text column. A column is treated as text when
# it has object dtype and its first value is a str.
text_columns = [
    column
    for column in rename_df.columns
    if rename_df[column].dtype == object
    and isinstance(rename_df[column].iloc[0], str)
]
for column in text_columns:
    rename_df[column] = rename_df[column].map(normalize_text)

# Convert both date columns to timezone-aware datetimes; blank cells stay "".
for column in ("confirmed_date", "onset_date"):
    rename_df[column] = rename_df[column].map(add_time_zone)

# Literal substring replacement, not a regular expression: "|" is matched as
# an ordinary character.
# NOTE(review): presumably the source data contains the mis-entered value
# "女|生" - confirm against the raw gender values.
rename_df["gender"] = rename_df["gender"].str.replace("女|生", "女性", regex=False)

# Turn age into an integer column: stringify, strip spaces, map the
# non-numeric labels to numeric text, then cast. 999 stands in for unknown or
# blank ages.
# NOTE(review): "0-10" becomes 10, which merges it with rows whose age is
# already "10" - confirm this is intended.
rename_df["age"] = (
    rename_df["age"]
    .astype(str)
    .str.strip(" ")
    .replace({"不明": "999", "": "999", "0-10": "10"})
    .astype(int)
)

# Display the cleaned frame; it is written to CSV in the next cell.
rename_df

Unnamed: 0,age,gender,confirmed_date,onset_date,examination_prefecture,current_prefecture,lon,lat
0,30,男性,2020-01-15 00:00:00+09:00,2020-01-03 00:00:00+09:00,神奈川県,神奈川県,139.642347,35.447504
1,40,男性,2020-01-24 00:00:00+09:00,2020-01-14 00:00:00+09:00,東京都,中華人民共和国,116.409685,39.903832
2,30,女性,2020-01-25 00:00:00+09:00,2020-01-21 00:00:00+09:00,東京都,中華人民共和国,116.409685,39.903832
3,40,男性,2020-01-26 00:00:00+09:00,2020-01-23 00:00:00+09:00,愛知県,中華人民共和国,116.409685,39.903832
4,40,男性,2020-01-28 00:00:00+09:00,2020-01-22 00:00:00+09:00,愛知県,中華人民共和国,116.409685,39.903832
...,...,...,...,...,...,...,...,...
133169,30,男性,2020-11-30 00:00:00+09:00,,長野県,長野県,138.412868,36.744750
133170,70,女性,2020-11-30 00:00:00+09:00,2020-11-29 00:00:00+09:00,長野県,長野県,138.440643,36.922904
133171,70,女性,2020-11-30 00:00:00+09:00,2020-11-25 00:00:00+09:00,長野県,長野県,137.851094,36.503017
133172,50,男性,2020-11-30 00:00:00+09:00,2020-11-30 00:00:00+09:00,長野県,長野県,137.851094,36.503017


In [26]:
# Export the cleaned frame, leaving the index out of the file.
rename_df.to_csv(path_or_buf="custom_COVID-19.csv", index=False)

In [27]:
# Distinct age values left in the age column.
set(rename_df["age"].tolist())

{10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 999}

In [23]:
# for column in rename_df.columns:
#     if rename_df[column].dtype == object and isinstance(rename_df.iloc[0][column], str):
#         print(column)