In [226]:
import csv
import datetime
import pandas as pd

In [227]:
DATASETS_PATH = "../data/newly_confirmed_cases_daily.csv"

# Prefecture names in ascending prefecture-code order: a prefecture's code is
# its 1-based position in this list.
PREFECTURES = [
    "北海道",
    "青森県",
    "岩手県",
    "宮城県",
    "秋田県",
    "山形県",
    "福島県",
    "茨城県",
    "栃木県",
    "群馬県",
    "埼玉県",
    "千葉県",
    "東京都",
    "神奈川県",
    "新潟県",
    "富山県",
    "石川県",
    "福井県",
    "山梨県",
    "長野県",
    "岐阜県",
    "静岡県",
    "愛知県",
    "三重県",
    "滋賀県",
    "京都府",
    "大阪府",
    "兵庫県",
    "奈良県",
    "和歌山県",
    "鳥取県",
    "島根県",
    "岡山県",
    "広島県",
    "山口県",
    "徳島県",
    "香川県",
    "愛媛県",
    "高知県",
    "福岡県",
    "佐賀県",
    "長崎県",
    "熊本県",
    "大分県",
    "宮崎県",
    "鹿児島県",
    "沖縄県",
]

# Prefecture name -> integer prefecture code (1 through 47), derived from the
# list order above so names and codes cannot drift apart.
PREFECTURES_CODE = {name: code for code, name in enumerate(PREFECTURES, start=1)}

# データセットを扱いやすくする

現在のデータセットだと少し扱いづらいので、少し加工します。

- 都道府県コードで紐づけることができるようにする
- 「日付ごとに1行・都道府県ごとに1列」の形式から、「都道府県と日付の組み合わせごとに1行(感染者数は1つの列にまとめる)」の形式に変更する

## イメージ

| Prefecture_name | Prefecture_code | Date | Infections |
| --------------- | --------------- | ---- | ---------- |

In [228]:
# Load the raw daily-case CSV from DATASETS_PATH into a DataFrame.
datasets = pd.read_csv(DATASETS_PATH)

In [229]:
# Preview the first rows of the raw table (one row per date, one column per region).
datasets.head()

Unnamed: 0,Date,ALL,Hokkaido,Aomori,Iwate,Miyagi,Akita,Yamagata,Fukushima,Ibaraki,...,Ehime,Kochi,Fukuoka,Saga,Nagasaki,Kumamoto,Oita,Miyazaki,Kagoshima,Okinawa
0,2020/1/16,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [230]:
# Spot-check a single prefecture column to see its values and dtype.
datasets["Kanagawa"]

0         1
1         0
2         0
3         0
4         0
       ... 
1204    591
1205    542
1206    433
1207    886
1208    567
Name: Kanagawa, Length: 1209, dtype: int64

In [231]:
# The "ALL" column is not needed for the per-prefecture table, so drop it.
# `drop` returns a new frame; `datasets` itself is left untouched.
df = datasets.drop(columns=["ALL"])
df.head()

Unnamed: 0,Date,Hokkaido,Aomori,Iwate,Miyagi,Akita,Yamagata,Fukushima,Ibaraki,Tochigi,...,Ehime,Kochi,Fukuoka,Saga,Nagasaki,Kumamoto,Oita,Miyazaki,Kagoshima,Okinawa
0,2020/1/16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
# The prefecture columns are already ordered by prefecture code, so each column
# name is replaced by its 1-based position, stored as a string ("1", "2", ...).
prefecture_columns = df.columns[1:]  # everything after the first column

# The positional mapping is only valid when there is exactly one column per
# prefecture, so fail loudly if the input layout ever changes.
assert len(prefecture_columns) == len(PREFECTURES_CODE), (
    f"expected {len(PREFECTURES_CODE)} prefecture columns, "
    f"got {len(prefecture_columns)}"
)

columns = {name: str(code) for code, name in enumerate(prefecture_columns, start=1)}

# Rebind rather than using inplace=True so no existing frame is mutated in place.
df = df.rename(columns=columns)
df.head()

Unnamed: 0,Date,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,2020/1/16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Pandasで扱うと少し重いのでcsvとして出力

In [233]:
EXPORT_PATH = '../data/edited_newly_confirmed_cases_daily.csv'
COLUMNS = ["Prefecture_name", "Prefecture_code", "Date", "Infections"]

# Write the table in long format: one row per (prefecture, date) pair, with all
# dates for one prefecture written before moving on to the next prefecture.
with open(EXPORT_PATH, 'w', encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(COLUMNS)
    for code in df.columns[1:]:
        # Column labels are 1-based codes stored as strings; PREFECTURES is 0-based.
        prefecture_name = PREFECTURES[int(code) - 1]
        writer.writerows(
            [prefecture_name, code, date, infections]
            for date, infections in zip(df["Date"], df[code])
        )

### pandasで読み込んで型を確認する

In [234]:
# Read the exported long-format CSV back in and inspect the inferred column dtypes.
edited_df = pd.read_csv(EXPORT_PATH)
edited_df.dtypes

Prefecture_name    object
Prefecture_code     int64
Date               object
Infections          int64
dtype: object

### 日付順にソートして出力

In [236]:
edited_df = pd.read_csv(EXPORT_PATH)

# "Date" is read as text without zero padding (e.g. "2020/1/16"), so sorting the
# raw strings is lexicographic, not chronological: "2020/10/1" would sort before
# "2020/2/1". Sort on a parsed datetime key instead, then drop the helper column
# so the exported Date values keep their original text format.
edited_df = (
    edited_df
    .assign(_date_key=pd.to_datetime(edited_df["Date"], format="%Y/%m/%d"))
    .sort_values(["_date_key", "Prefecture_code"])
    .drop(columns="_date_key")
)

edited_df.to_csv("../data/japan_covid_19_cases_daily.csv", index=False)

# Preview goes last so the cell actually renders it.
edited_df.head(50)