In [6]:
import os

import numpy as np
import pandas as pd
from countryinfo import CountryInfo
from geopy.geocoders import Nominatim

In [13]:
# Load data from file
flood = pd.read_excel('FloodArchive.xlsx', engine='openpyxl')
flood = np.array(flood)
area_list = pd.read_csv('area.csv')
area_list = np.array(area_list)

# NOTE(review): "geoapiExercises" looks like a copied tutorial value; confirm it
# satisfies the geocoding service's requirement for an application-specific user agent.
geolocator = Nominatim(user_agent="geoapiExercises")

# The header must be ONE row (a nested list). Records appended later are
# 14-element rows, so a flat list of 14 strings followed by rows cannot be
# turned into the (n, 14) table that the later cells save and reload.
flood_cleaned = [['ID','GlideNumber','Country','OtherCountry','Long','Lat','Area','Began','Ended','Validation','Dead','Displaced','MainCause','Severity']]
unclassified = []
deleted = []

# Separators that are rewritten to '/' in the cause column.
# Ordered longest-first so ', ' is not pre-empted by ','; ' or ' is padded with
# spaces so it no longer matches inside words such as "Torrential" or "storm".
delimiters = [', and ', ',and ', ' and ', ', ', ',', ' or ']

In [3]:
# Clean spaces and other unidentified characters in the data.
# Rows yielded by iterating the 2-D array are views, so assignments below
# update `flood` in place.
for row in flood:
    # Column 1: drop non-breaking spaces and commas, then trim spaces.
    if isinstance(row[1], str):
        row[1] = row[1].replace(u'\xa0', '').replace(',', '').strip(" ")

    # Columns 2 and 3: drop non-breaking spaces and trim. Column 2 is now
    # guarded like the others so a missing (non-string) value is skipped
    # instead of raising AttributeError.
    for col in (2, 3):
        if isinstance(row[col], str):
            row[col] = row[col].replace(u'\xa0', '').strip(" ")

    # Column 12: rewrite every known separator to '/'.
    if isinstance(row[12], str):
        cause = row[12]
        for delimiter in delimiters:
            if delimiter in cause:
                cause = cause.replace(delimiter, '/').replace(u'\xa0', '').strip(" ")
        # Always normalise, so values without any separator are cleaned too.
        row[12] = cause.replace(u'\xa0', '').strip(" ")

In [4]:
# Get the country name for a pair of latitude/longitude coordinates
def get_country(coord):
    """Reverse-geocode ``coord`` and return the country name in English.

    Parameters
    ----------
    coord : passed unchanged to ``geolocator.reverse``.

    Returns
    -------
    The ``'country'`` entry of the result's address, ``''`` when the address
    has no such entry, or ``None`` when the geocoder returns no location.

    NOTE(review): every call goes through ``geolocator.reverse``; there is no
    retry, caching or throttling here.
    """
    place = geolocator.reverse(coord, exactly_one=True, language='en')
    if place is None:
        return None
    return place.raw['address'].get('country', '')

In [5]:
# Data cleaning process: re-derive each record's country from its coordinates
# and keep only records whose flooded area fits inside that country's area.

# Index area_list rows by country name once, instead of scanning the column and
# rebuilding a Python list for every record. setdefault keeps the FIRST row for
# a repeated name, matching what list.index() returned.
area_row_by_country = {}
for row_index, country_name in enumerate(area_list[:, 0]):
    area_row_by_country.setdefault(country_name, row_index)

for i in range(flood.shape[0]):
    record = flood[i]  # a view: writing record[2] updates flood as well
    area = record[6]
    year = record[7].year
    coordinate = str(record[5])+", "+str(record[4])
    actual_country = get_country(coordinate)
    record[2] = actual_country

    # Records whose geocoded country has no row in area_list cannot be checked.
    index = area_row_by_country.get(actual_country)
    if index is None:
        unclassified.append(record)
        continue

    # Column offset into area_list for this year.
    # assumes column 25 holds the 1985 value with one column per later year -- TODO confirm
    devia = year - 1985 + 25
    actual_area = area_list[index, devia]

    # NOTE: a NaN on either side makes this comparison False, so such records
    # also end up in `deleted`.
    if area <= actual_area:
        flood_cleaned.append(record)
    else:
        print(f"ID {record[0]} has quality issue with area beyond actual value")
        deleted.append(record)

ID 80 has quality issue with area beyond actual value
ID 106 has quality issue with area beyond actual value
ID 146 has quality issue with area beyond actual value
ID 196 has quality issue with area beyond actual value
ID 227 has quality issue with area beyond actual value
ID 490 has quality issue with area beyond actual value
ID 508 has quality issue with area beyond actual value
ID 558 has quality issue with area beyond actual value
ID 611 has quality issue with area beyond actual value
ID 706 has quality issue with area beyond actual value
ID 747 has quality issue with area beyond actual value
ID 757 has quality issue with area beyond actual value
ID 812 has quality issue with area beyond actual value
ID 889 has quality issue with area beyond actual value
ID 918 has quality issue with area beyond actual value
ID 933 has quality issue with area beyond actual value
ID 975 has quality issue with area beyond actual value
ID 978 has quality issue with area beyond actual value
ID 993 has 

In [6]:
# Convert the accumulated list into a NumPy array and display its shape.
flood_cleaned = np.array(flood_cleaned)
flood_cleaned.shape

(4977, 14)

In [7]:
# Write the cleaned table as comma-separated text. The encoding is set
# explicitly: np.savetxt defaults to latin1, whereas pd.read_csv (used to
# reload this file) defaults to UTF-8, so non-ASCII names would not round-trip.
# NOTE: fields are not quoted, so a value containing ',' would shift columns.
np.savetxt('flood_cleaned.csv', flood_cleaned, delimiter=',', fmt='%s', encoding='utf-8')

In [8]:
# Write the records that could not be matched to a country area.
# UTF-8 is requested explicitly because np.savetxt defaults to latin1, which
# cannot encode every character that may appear in text fields.
np.savetxt('unclassified.csv', unclassified, delimiter=',', fmt='%s', encoding='utf-8')

In [105]:
# Load the raw factor tables and convert them to plain arrays.
forest_raw = pd.read_csv('forest area.csv')
co2_raw = pd.read_csv('co2 emission.csv')
forest_list = np.array(forest_raw)
co2_list = np.array(co2_raw)

# Reload the cleaned flood table and collect the distinct country names in it.
flood_cleaned = pd.read_csv('flood_cleaned.csv')
country_list = set(flood_cleaned['Country'].tolist())

In [106]:
# Keep only the factor rows whose first column is a country present in the
# cleaned flood data.
forest_cleaned = [row for row in forest_list if row[0] in country_list]
co2_cleaned = [row for row in co2_list if row[0] in country_list]

In [107]:
# Write the filtered factor tables. No header line is written.
# UTF-8 is requested explicitly: np.savetxt defaults to latin1, while
# pd.read_csv (used to reload these files) defaults to UTF-8.
np.savetxt('co2_cleaned.csv', co2_cleaned, delimiter=',', fmt='%s', encoding='utf-8')
np.savetxt('forest_cleaned.csv', forest_cleaned, delimiter=',', fmt='%s', encoding='utf-8')

In [23]:
# The *_cleaned.csv factor files are written with np.savetxt and have no header
# line, so header=None is required; with the default, read_csv would consume
# the first country's row as column names and silently drop it from the data.
forest_list = np.array(pd.read_csv('forest_cleaned.csv', header=None))
co2_list = np.array(pd.read_csv('co2_cleaned.csv', header=None))

In [24]:
# Reload the cleaned flood table as an array; read_csv takes the first line of
# the file as the header, so it is not part of the resulting rows.
flood_cleaned = np.array(pd.read_csv('flood_cleaned.csv'))

In [34]:
def get_factor_list(country):
    """Build and save the 1990-2018 forest, CO2 and flood-count series for one country.

    Reads the notebook-level ``forest_list``, ``co2_list`` and ``flood_cleaned``
    arrays. In the two factor tables column 0 is matched against ``country``
    and the remaining columns are taken as one value per year. In
    ``flood_cleaned`` column 2 is matched against ``country`` and the first
    four characters of column 7 are parsed as the year.

    Three files are written under ``factor/`` (created if missing):
    ``<country>_forest.csv``, ``<country>_co2.csv`` and ``<country>_amount.csv``.

    Parameters
    ----------
    country : str
        Country name exactly as it appears in the three tables.

    Returns
    -------
    (forest, co2, amount)
        ``forest`` and ``co2`` are arrays with a year column and a value
        column; ``amount`` is a list of ``[year, number_of_floods]`` pairs.

    Raises
    ------
    ValueError
        If ``country`` has no row in ``forest_list`` or ``co2_list``, or a
        matching row does not hold exactly one value per year.
    """
    year_range = np.arange(1990, 2019)
    n_years = len(year_range)
    years = year_range.reshape(n_years, 1)

    def _yearly_series(table, label):
        # Find the country's row and pair its values with the year column.
        values = None
        for item in table:
            if item[0] == country:
                # If the name occurs more than once, the last row wins.
                values = np.array(item[1:]).reshape(n_years, 1)
        if values is None:
            # Previously this fell through to column_stack and failed with an
            # unrelated-looking shape-mismatch ValueError.
            raise ValueError(f"no {label} data found for country {country!r}")
        return np.column_stack((years, values))

    forest = _yearly_series(forest_list, 'forest')
    co2 = _yearly_series(co2_list, 'co2')

    # Count this country's floods per year in a single pass.
    counts = {int(year): 0 for year in year_range}
    for item in flood_cleaned:
        if item[2] == country:
            flood_year = float(item[7][0:4])
            if flood_year in counts:  # years outside the range are ignored
                counts[flood_year] += 1
    amount = [[year, counts[int(year)]] for year in year_range]

    # np.savetxt does not create missing directories.
    os.makedirs('factor', exist_ok=True)
    np.savetxt('factor/'+country+'_forest.csv', forest, delimiter=',', fmt = '%s')
    np.savetxt('factor/'+country+'_co2.csv', co2, delimiter=',', fmt = '%s')
    np.savetxt('factor/'+country+'_amount.csv', amount, delimiter=',', fmt = '%s')
    return forest, co2, amount

In [35]:
# Countries for which per-country factor files are generated.
countries = ['China', 'United States', 'India', 'Indonesia', 'Philippines']

In [36]:
# Write the forest, CO2 and flood-count CSVs for every selected country;
# the returned arrays are not kept.
for country in countries:
    get_factor_list(country)