In [86]:
import pandas as pd
from pathlib import Path
from src import EDA

In [87]:
# Directory (relative to the current working directory) that the CSV files
# read and written by this notebook are resolved against.
DATA_ROOT = Path('data')

In [88]:
# Input file, resolved against DATA_ROOT.
target_file_name = 'STORM_RAW_0.1.csv'

# Column names applied positionally to the loaded frame; the list length must
# match the number of columns in the CSV.
cols = ['DisNo.','DisasterType', 'DisasterSubtype', 'EventName',
       'MainLandfallLocation', 'Flood', 'Slide', 'OFDAResponse', 'Appeal',
       'Declaration', 'LandfallMagnitude(kph)', "LandfallPressure(mb)",
       'Year', 'Month', 'StartDay', 'EndYear', 'EndMonth',
       'EndDay', 'TotalDeaths', 'NoInjured', 'TotalDamage(000US$)',
       'TotalDamageAdjusted(000US$)', 'CPI', 'Completed']

df = pd.read_csv(DATA_ROOT / target_file_name)
df.columns = cols
# Drop the columns that are not used further, round CPI to two decimals and
# encode the three Yes/No columns as 1/0 (any other value becomes NaN).
df = df.drop(
    columns=["DisNo.", "StartDay", "EndYear", "EndMonth", "EndDay"]
).assign(
    CPI=lambda frame: frame['CPI'].round(2),
    OFDAResponse=lambda frame: frame['OFDAResponse'].map({"Yes":1, "No":0}),
    Appeal=lambda frame: frame['Appeal'].map({"Yes":1, "No":0}),
    Declaration=lambda frame: frame['Declaration'].map({"Yes":1, "No":0}),
)

df.head(5)

Unnamed: 0,DisasterType,DisasterSubtype,EventName,MainLandfallLocation,Flood,Slide,OFDAResponse,Appeal,Declaration,LandfallMagnitude(kph),LandfallPressure(mb),Year,Month,TotalDeaths,NoInjured,TotalDamage(000US$),TotalDamageAdjusted(000US$),CPI,Completed
0,Storm,Tropical cyclone,,8,0,0,0,0,0,,,1953,9,1000.0,,,,9.16,False
1,Storm,Tropical cyclone,Iris,5,0,0,1,0,0,120.0,996.0,1964,9,7000.0,,50000.0,471770.0,10.6,False
2,Storm,Tropical cyclone,Wanda,5,0,0,0,0,0,139.0,980.0,1971,5,23.0,,,,13.84,False
3,Storm,Tropical cyclone,,3,0,0,0,0,0,,,1971,10,89.0,,,,13.84,False
4,Storm,Tropical cyclone,Sarah,6,0,0,0,0,0,102.0,985.0,1973,11,100.0,,,,15.17,False


In [89]:
def getID(record:pd.Series) -> str:
    """Build the base ID ``<DISASTERTYPE>_<EVENTNAME>_<YEAR>`` for one record.

    The disaster type and event name are upper-cased and the year is
    converted to an integer. The result is not guaranteed to be unique across
    records. A missing event name (NaN) is formatted as ``NAN``.

    Args:
        record: A row providing 'DisasterType', 'EventName' and 'Year'.

    Returns:
        The assembled ID string.
    """
    parts = (
        f"{record['DisasterType']}_".upper(),
        f"{record['EventName']}_".upper(),
        f"{int(record['Year'])}",
    )
    return "".join(parts)

def generateUniqueID(df:pd.DataFrame):
    """Return one ID per row of ``df``, disambiguating repeated base IDs.

    Rows are visited in order. The first row producing a given base ID (as
    returned by ``getID``) keeps it unchanged; the k-th row producing the same
    base ID (k >= 2) gets ``<base>_<k>``.

    Args:
        df: Frame whose rows are passed to ``getID``.

    Returns:
        A list of ID strings, in row order.
    """
    seen = {}  # base ID -> number of times it has been produced so far
    unique_ids = []

    for _, row in df.iterrows():
        base_id = getID(row)
        occurrence = seen.get(base_id, 0) + 1
        seen[base_id] = occurrence
        # Only repeats carry a numeric suffix; the first occurrence stays bare.
        unique_ids.append(base_id if occurrence == 1 else f"{base_id}_{occurrence}")

    return unique_ids

def isUniqueID(id_list:list):
    """Check whether every ID in ``id_list`` occurs exactly once.

    Args:
        id_list: The IDs to check.

    Returns:
        ``True`` when no ID is repeated. Otherwise the tuple
        ``(duplicates, False)``, where ``duplicates`` lists each repeated ID
        once, in order of first appearance.
    """
    from collections import Counter

    # Count the IDs that were passed in, not any module-level variable.
    counts = Counter(id_list)
    duplicates = [item for item, freq in counts.items() if freq > 1]
    if duplicates:
        return (duplicates, False)
    return True

# Build one ID per row and verify the IDs are unique before attaching them.
# NOTE(review): `id` shadows the built-in `id()`; the name is kept so that any
# other code reading this notebook-level variable keeps working.
id = generateUniqueID(df)
uniqueness = isUniqueID(id)
# Fail loudly instead of silently discarding the result of the check.
assert uniqueness is True, f"Duplicate IDs generated: {uniqueness}"

df['ID'] = id
# Move 'ID' to the front by name rather than by position, so re-running this
# cell (when 'ID' already exists and is already first) neither duplicates
# 'ID' nor drops the last column.
cols = ['ID'] + [col for col in df.columns if col != 'ID']
df = df[cols]
df.head()

Unnamed: 0,ID,DisasterType,DisasterSubtype,EventName,MainLandfallLocation,Flood,Slide,OFDAResponse,Appeal,Declaration,LandfallMagnitude(kph),LandfallPressure(mb),Year,Month,TotalDeaths,NoInjured,TotalDamage(000US$),TotalDamageAdjusted(000US$),CPI,Completed
0,STORM_NAN_1953,Storm,Tropical cyclone,,8,0,0,0,0,0,,,1953,9,1000.0,,,,9.16,False
1,STORM_IRIS_1964,Storm,Tropical cyclone,Iris,5,0,0,1,0,0,120.0,996.0,1964,9,7000.0,,50000.0,471770.0,10.6,False
2,STORM_WANDA_1971,Storm,Tropical cyclone,Wanda,5,0,0,0,0,0,139.0,980.0,1971,5,23.0,,,,13.84,False
3,STORM_NAN_1971,Storm,Tropical cyclone,,3,0,0,0,0,0,,,1971,10,89.0,,,,13.84,False
4,STORM_SARAH_1973,Storm,Tropical cyclone,Sarah,6,0,0,0,0,0,102.0,985.0,1973,11,100.0,,,,15.17,False


In [90]:
# Write the frame with IDs to the next version of the dataset. The `index`
# option is left at its default.
df.to_csv(path_or_buf=DATA_ROOT / "STORM_RAW_0.2.csv")