In [38]:
import pandas as pd
import numpy as np

In [39]:
# Load the raw dataset from the project-relative CSV file.
raw_csv_path = 'data/origindata.csv'
df = pd.read_csv(raw_csv_path)

In [41]:
# Convert the 'date' column to datetime (values that cannot be parsed become
# NaT), then order the rows by location and, within a location, by date.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df = df.assign(date=parsed_dates).sort_values(by=['location', 'date'])

In [42]:
# Keep only the columns needed for this task.
cols_needed = [
    'location',
    'date',
    'new_cases',
    'total_cases',
    'new_deaths',
    'icu_patients',
    'total_tests',
    'positive_rate',
    'population',
    'stringency_index',
]

df = df.loc[:, cols_needed]

In [43]:
# Drop the locations that have too many missing values in 'new_cases'.
missing_thresh = 0.2  # a location is removed when more than 20% of its values are missing

# Share of missing 'new_cases' values per location (mean of the NaN mask).
location_missing = df['new_cases'].isna().groupby(df['location']).mean()
valid_locations = location_missing.loc[location_missing.le(missing_thresh)].index
df = df.loc[df['location'].isin(valid_locations)]

In [None]:
# Fill missing values (NaN) independently for every location: forward-fill
# first, then back-fill whatever is still missing at the start of a series.
# The grouped ffill/bfill methods are used instead of
# groupby('location').apply(lambda g: g.ffill().bfill()) because apply()
# operating on the grouping column is deprecated since pandas 2.2 and is
# slated to exclude that column, which would drop 'location' from the result.

# Keep each location's rows contiguous (a stable sort preserves their current
# relative order) and renumber the index, matching what the apply()-based
# version produced after reset_index(drop=True).
df = df.sort_values('location', kind='mergesort').reset_index(drop=True)

fill_cols = [col for col in df.columns if col != 'location']
forward_filled = df.groupby('location')[fill_cols].ffill()
df[fill_cols] = forward_filled.groupby(df['location']).bfill()

# Replace negative values (if any) with 0 in the count columns; NaN values
# are left untouched. Vectorised clip() replaces the per-element Python lambda.
non_negative_cols = ['new_cases', 'total_cases', 'new_deaths', 'icu_patients', 'total_tests']
df[non_negative_cols] = df[non_negative_cols].clip(lower=0)

In [45]:
# Normalise the numerical columns (scaling is done separately per country).

def min_max_scale(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``. When the minimum equals the maximum
        the result is ``series * 0`` instead, which avoids dividing by zero.
    """
    lowest = series.min()
    highest = series.max()
    if lowest != highest:
        return (series - lowest) / (highest - lowest)
    # Constant series: return zeros (alternatively the original series could
    # be returned if no scaling is wanted when all values are equal).
    return series * 0

def scale_by_location(group):
    """Return a copy of ``group`` with its numeric columns min-max scaled.

    Every numeric column except 'population' is passed through
    ``min_max_scale``; all other columns are copied unchanged. The input
    frame itself is not modified.
    """
    scaled = group.copy()
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    # 'population' is deliberately left in its original units.
    for name in numeric_cols.difference(['population']):
        scaled[name] = min_max_scale(group[name])
    return scaled

# Scale every location's rows independently. The columns are selected
# explicitly after groupby so that 'location' is still passed to (and returned
# by) scale_by_location: relying on DataFrameGroupBy.apply to operate on the
# grouping column implicitly is deprecated since pandas 2.2.
df = (
    df.groupby('location')[list(df.columns)]
    .apply(scale_by_location)
    .reset_index(drop=True)
)

In [46]:
# Display the preprocessed frame as the cell output.
df

Unnamed: 0,location,date,new_cases,total_cases,new_deaths,icu_patients,total_tests,positive_rate,population,stringency_index
0,Afghanistan,2020-01-05,0.0,0.0,0.0,,0.0,0.941176,41128772,0.000000
1,Afghanistan,2020-01-06,0.0,0.0,0.0,,0.0,0.941176,41128772,0.000000
2,Afghanistan,2020-01-07,0.0,0.0,0.0,,0.0,0.941176,41128772,0.000000
3,Afghanistan,2020-01-08,0.0,0.0,0.0,,0.0,0.941176,41128772,0.000000
4,Afghanistan,2020-01-09,0.0,0.0,0.0,,0.0,0.941176,41128772,0.000000
...,...,...,...,...,...,...,...,...,...,...
395553,Zimbabwe,2024-07-31,0.0,1.0,0.0,,1.0,0.090909,16320539,0.610505
395554,Zimbabwe,2024-08-01,0.0,1.0,0.0,,1.0,0.090909,16320539,0.610505
395555,Zimbabwe,2024-08-02,0.0,1.0,0.0,,1.0,0.090909,16320539,0.610505
395556,Zimbabwe,2024-08-03,0.0,1.0,0.0,,1.0,0.090909,16320539,0.610505
