In [2]:
import csv
import datetime
import pandas as pd

In [3]:
# Location of the raw daily-cases CSV (wide layout: one column per prefecture).
DATASETS_PATH = "../data/newly_confirmed_cases_daily.csv"

# Japanese prefecture names listed in prefecture-code order; the code of each
# name is its 1-based position in this tuple.
_PREFECTURE_NAMES = (
    '北海道', '青森県', '岩手県', '宮城県', '秋田県',        # 1-5
    '山形県', '福島県', '茨城県', '栃木県', '群馬県',        # 6-10
    '埼玉県', '千葉県', '東京都', '神奈川県', '新潟県',      # 11-15
    '富山県', '石川県', '福井県', '山梨県', '長野県',        # 16-20
    '岐阜県', '静岡県', '愛知県', '三重県', '滋賀県',        # 21-25
    '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県',      # 26-30
    '鳥取県', '島根県', '岡山県', '広島県', '山口県',        # 31-35
    '徳島県', '香川県', '愛媛県', '高知県', '福岡県',        # 36-40
    '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県',        # 41-45
    '鹿児島県', '沖縄県',                                    # 46-47
)

# Mapping of integer prefecture code (1..47) -> Japanese prefecture name.
PREFECTURES_CODE = dict(enumerate(_PREFECTURE_NAMES, start=1))

# All prefecture codes, in ascending order.
PREFECTURES = list(PREFECTURES_CODE)

# データセットを扱いやすくする

現在のデータセットだと少し扱いづらいので、少し加工します。

- 都道府県コードで他のデータと紐づけられるようにする
- 都道府県ごとに列が並ぶ横持ちの形式から、1行が「都道府県 × 日付」の感染者数を表す縦持ちの形式に変更する

## イメージ

| id | Prefecture_name | Prefecture_code | Infections | Date |
| -- | --------------- | --------------- | ---------- | ---- |

In [4]:
# Load the raw daily-cases CSV into a DataFrame; later cells derive their
# working copies from `datasets` and leave it unmodified.
datasets = pd.read_csv(DATASETS_PATH)

In [5]:
# Preview the first rows: a Date column, an ALL column, then one column per prefecture.
datasets.head()

Unnamed: 0,Date,ALL,Hokkaido,Aomori,Iwate,Miyagi,Akita,Yamagata,Fukushima,Ibaraki,...,Ehime,Kochi,Fukuoka,Saga,Nagasaki,Kumamoto,Oita,Miyazaki,Kagoshima,Okinawa
0,2020/1/16,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Spot-check a single prefecture column (one integer count per date row).
datasets["Kanagawa"]

0         1
1         0
2         0
3         0
4         0
       ... 
1204    591
1205    542
1206    433
1207    886
1208    567
Name: Kanagawa, Length: 1209, dtype: int64

In [7]:
# The "ALL" column is not needed for the per-prefecture table, so build a
# working copy without it (the original `datasets` frame is left untouched).
df = datasets.drop(columns=["ALL"])
df.head()

Unnamed: 0,Date,Hokkaido,Aomori,Iwate,Miyagi,Akita,Yamagata,Fukushima,Ibaraki,Tochigi,...,Ehime,Kochi,Fukuoka,Saga,Nagasaki,Kumamoto,Oita,Miyazaki,Kagoshima,Okinawa
0,2020/1/16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# The prefecture columns already appear in prefecture-code order, so each one
# is renamed to its 1-based position (as a string). The leading "Date" column
# is skipped and keeps its name.
columns = {
    column: str(code)
    for code, column in enumerate(df.columns[1:], start=1)
}

df.rename(columns=columns, inplace=True)
df.head()

Unnamed: 0,Date,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,2020/1/16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020/1/17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020/1/18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020/1/19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020/1/20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Pandasで扱うと少し重いのでcsvとして出力

In [9]:
# Destination of the reshaped (long-format) CSV and its header row.
EXPORT_PATH = '../data/edited_newly_confirmed_cases_daily.csv'
COLUMNS = ["Prefecture_name", "Prefecture_code", "Date", "Infections"]

# Stream the wide frame out as one row per (prefecture, date) using the csv
# module directly, instead of reshaping inside pandas.
with open(EXPORT_PATH, 'w', encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(COLUMNS)
    # Every column after "Date" is a prefecture code stored as a string.
    for column in df.columns[1:]:
        writer.writerows(
            [PREFECTURES_CODE[int(column)], column, date, infections]
            for date, infections in zip(df["Date"], df[column])
        )

### pandasで読み込んで型を確認する

In [10]:
# Read the exported long-format CSV back and inspect the inferred column types.
# Note that "Date" comes back as object (plain strings), not as a datetime dtype.
edited_df = pd.read_csv(EXPORT_PATH)
edited_df.dtypes

Prefecture_name    object
Prefecture_code     int64
Date               object
Infections          int64
dtype: object

### 日付順にソートして出力

In [13]:
# Reload the long-format CSV and write it out ordered by date, then by
# prefecture code.
edited_df = pd.read_csv(EXPORT_PATH)

# "Date" holds non-zero-padded strings such as "2020/1/16". Sorting them as
# text is lexicographic, which would place "2020/10/1" before "2020/2/1".
# Sort on parsed timestamps instead, and drop the helper column afterwards so
# the Date column (and therefore the exported file) keeps its original strings.
edited_df = (
    edited_df
    .assign(_sort_date=pd.to_datetime(edited_df["Date"], format="%Y/%m/%d"))
    .sort_values(["_sort_date", "Prefecture_code"])
    .drop(columns="_sort_date")
)

edited_df.to_csv("../data/japan_covid_19_cases_daily.csv", index=False)
edited_df.head(50)

Unnamed: 0,Prefecture_name,Prefecture_code,Date,Infections
0,北海道,1,2020/1/16,0
1209,青森県,2,2020/1/16,0
2418,岩手県,3,2020/1/16,0
3627,宮城県,4,2020/1/16,0
4836,秋田県,5,2020/1/16,0
6045,山形県,6,2020/1/16,0
7254,福島県,7,2020/1/16,0
8463,茨城県,8,2020/1/16,0
9672,栃木県,9,2020/1/16,0
10881,群馬県,10,2020/1/16,0


: 