In [1]:
# make imports
import pandas as pd

import numpy as np

from tsfresh.utilities.dataframe_functions import roll_time_series

In [2]:
# Source crosstabs for the feature time series, grouped by data source.
# NOTE(review): no `parse_dates` is passed, so `dayfirst` should have no effect here and the
# date columns are left as read — confirm.

# cholera epidemiology
cholera_case_crosstab = pd.read_csv(
    r'C:\Users\Rohil\Documents\iGEM\yemen\cholera_epi_data\yemen_cholera_case_data_differenced.csv',
    dayfirst=True,
)
cholera_death_crosstab = pd.read_csv(
    r'C:\Users\Rohil\Documents\iGEM\yemen\cholera_epi_data\yemen_cholera_death_data_differenced.csv',
    dayfirst=True,
)

# conflict
conflict_crosstab = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/fatality_crosstab.csv')

# daily rainfall (mean / min / max)
mean_rainfall_crosstab = pd.read_csv(
    r'C:\Users\Rohil\Documents\iGEM\yemen\rainfall\yemen_daily_mean_rainfall_crosstab.csv',
    dayfirst=True,
)
min_rainfall_crosstab = pd.read_csv(
    r'C:\Users\Rohil\Documents\iGEM\yemen\rainfall\yemen_daily_min_rainfall_crosstab.csv',
    dayfirst=True,
)
max_rainfall_crosstab = pd.read_csv(
    r'C:\Users\Rohil\Documents\iGEM\yemen\rainfall\yemen_daily_max_rainfall_crosstab.csv',
    dayfirst=True,
)

In [3]:
# Reference table with the population of each governorate.
gov_pop_area_data = pd.read_excel('/Users/Rohil/Documents/iGEM/yemen/gov_area_pop_data.xlsx')

# Drop the governorate-wide Hadramawt entry (YE-HD): data exists only for the
# district of Al Mukalla, not for the governorate as a whole.
is_covered = gov_pop_area_data['iso'] != 'YE-HD'
gov_pop_area_data = gov_pop_area_data[is_covered]

In [4]:
# Overwrite the min-rainfall crosstab's `date` with the values from the mean-rainfall crosstab.
# NOTE(review): presumably both rainfall files cover the same days in the same row order and
# the min file's own dates are unusable or formatted differently — confirm.
min_rainfall_crosstab.date = mean_rainfall_crosstab.date

In [5]:
# Scale values up by 10k and normalize by governorate population,
# i.e. express each value per 10,000 people in the governorate.
# Work on a copy: a plain assignment would alias the raw crosstab, so the loop
# would overwrite the raw data and re-running this cell would normalize twice.
norm_cholera_case_crosstab = cholera_case_crosstab.copy()
for gov in gov_pop_area_data[['iso', 'population']].itertuples(index=False):
    norm_cholera_case_crosstab[gov.iso] = (norm_cholera_case_crosstab[gov.iso] * 10000) / gov.population

In [6]:
# Cholera deaths per 10,000 people in each governorate.
# Work on a copy: a plain assignment would alias the raw crosstab, so the loop
# would overwrite the raw data and re-running this cell would normalize twice.
norm_cholera_death_crosstab = cholera_death_crosstab.copy()
for gov in gov_pop_area_data[['iso', 'population']].itertuples(index=False):
    norm_cholera_death_crosstab[gov.iso] = (norm_cholera_death_crosstab[gov.iso] * 10000) / gov.population

In [7]:
# Conflict crosstab values per 10,000 people in each governorate.
# Work on a copy: a plain assignment would alias the raw crosstab, so the loop
# would overwrite the raw data and re-running this cell would normalize twice.
norm_conflict_crosstab = conflict_crosstab.copy()
for gov in gov_pop_area_data[['iso', 'population']].itertuples(index=False):
    norm_conflict_crosstab[gov.iso] = (norm_conflict_crosstab[gov.iso] * 10000) / gov.population

In [8]:
# Flatten each wide crosstab (one column per governorate) into long form:
# one row per (governorate, date) with a single value column.

def _flatten_crosstab(crosstab, value_name):
    """Return `crosstab` in long form with columns ['gov_iso', 'date', value_name]."""
    long_form = crosstab.set_index('date').unstack().reset_index()
    long_form.columns = ['gov_iso', 'date', value_name]
    return long_form


cases_unstacked = _flatten_crosstab(norm_cholera_case_crosstab, 'new_cases')
deaths_unstacked = _flatten_crosstab(norm_cholera_death_crosstab, 'new_deaths')
fatalities_unstacked = _flatten_crosstab(norm_conflict_crosstab, 'fatalities')
mean_rainfall_unstacked = _flatten_crosstab(mean_rainfall_crosstab, 'mean_rainfall')
min_rainfall_unstacked = _flatten_crosstab(min_rainfall_crosstab, 'min_rainfall')
max_rainfall_unstacked = _flatten_crosstab(max_rainfall_crosstab, 'max_rainfall')

In [9]:
# Parse the `date` strings into datetimes.
# The cholera and rainfall tables use day-month-2-digit-year ...
for frame in (cases_unstacked,
              deaths_unstacked,
              mean_rainfall_unstacked,
              min_rainfall_unstacked,
              max_rainfall_unstacked):
    frame.date = pd.to_datetime(frame.date, format="%d-%m-%y")

# ... while the conflict table uses ISO year-month-day.
fatalities_unstacked.date = pd.to_datetime(fatalities_unstacked.date, format="%Y-%m-%d")

In [10]:
# Join every feature onto one row per (date, governorate).
# Cases and deaths are inner-joined; the remaining tables are left-joined onto that.
merge_keys = ['date', 'gov_iso']
feature_timeseries = cases_unstacked.merge(deaths_unstacked, on=merge_keys)
for extra_features in (fatalities_unstacked,
                       mean_rainfall_unstacked,
                       min_rainfall_unstacked,
                       max_rainfall_unstacked):
    feature_timeseries = feature_timeseries.merge(extra_features, on=merge_keys, how='left')

In [11]:
# Governorate ISO code -> ISO codes treated as its neighbors.
# NOTE(review): 'YE-SA' lists 'YE-SN' but 'YE-SN' does not list 'YE-SA'; every other
# pair in this table is reciprocal — confirm whether that asymmetry is intended.
neighboring_gov_dict = {
    'YE-SA': ['YE-SN'],
    'YE-AB': ['YE-LA', 'YE-SH', 'YE-BA'],
    'YE-AD': ['YE-LA'],
    'YE-DA': ['YE-LA', 'YE-TA', 'YE-IB', 'YE-BA'],
    'YE-BA': ['YE-DH', 'YE-IB', 'YE-DA', 'YE-AB', 'YE-SH', 'YE-MA', 'YE-SN', 'YE-LA'],
    'YE-HU': ['YE-HJ', 'YE-MW', 'YE-SN', 'YE-RA', 'YE-DH', 'YE-TA', 'YE-IB'],
    'YE-JA': ['YE-MA', 'YE-SN', 'YE-AM', 'YE-SD'],
    'YE-MR': ['YE-HD-AL'],
    'YE-MW': ['YE-HU', 'YE-HJ', 'YE-AM', 'YE-SN'],
    'YE-AM': ['YE-HJ', 'YE-SD', 'YE-JA', 'YE-SN', 'YE-MW'],
    'YE-DH': ['YE-IB', 'YE-RA', 'YE-SN', 'YE-BA', 'YE-HU'],
    'YE-HD-AL': ['YE-SH', 'YE-MR'],
    'YE-HJ': ['YE-MW', 'YE-HU', 'YE-AM', 'YE-SD'],
    'YE-IB': ['YE-TA', 'YE-HU', 'YE-DH', 'YE-BA', 'YE-DA'],
    'YE-LA': ['YE-AD', 'YE-TA', 'YE-DA', 'YE-BA', 'YE-AB'],
    'YE-MA': ['YE-BA', 'YE-SN', 'YE-JA', 'YE-SH'],
    'YE-RA': ['YE-DH', 'YE-HU', 'YE-SN'],
    'YE-SD': ['YE-HJ', 'YE-AM', 'YE-JA'],
    'YE-SN': ['YE-BA', 'YE-DH', 'YE-RA', 'YE-MW', 'YE-AM', 'YE-JA', 'YE-MA', 'YE-HU'],
    'YE-SH': ['YE-AB', 'YE-BA', 'YE-MA', 'YE-HD-AL'],
    'YE-TA': ['YE-LA', 'YE-DA', 'YE-IB', 'YE-HU'],
}

In [12]:
#neighbor_feature_timeseries = pd.DataFrame(columns = feature_timeseries.columns)

In [13]:
def get_neighboring_vars(row, var, dataset, neighbors=None):
    """Average `var` over the neighbors of `row`'s governorate on `row`'s date.

    Parameters
    ----------
    row : object exposing `date` and `gov_iso` attributes
        The observation to compute the neighbor aggregate for, e.g. a row
        passed in by `DataFrame.apply(..., axis=1)`.
    var : str
        Name of the column in `dataset` to average.
    dataset : pandas.DataFrame
        Table with `date` and `gov_iso` columns plus the `var` column.
    neighbors : mapping, optional
        Governorate code -> list of neighboring governorate codes. When omitted,
        the notebook-level `neighboring_gov_dict` is used.

    Returns
    -------
    float
        Mean of `var` over the rows of `dataset` that share `row.date` and whose
        `gov_iso` is a neighbor of `row.gov_iso`; NaN when no such row exists.

    Raises
    ------
    KeyError
        If `row.gov_iso` has no entry in the neighbor mapping.
    """
    if neighbors is None:
        # fall back to the adjacency table defined at notebook level
        neighbors = neighboring_gov_dict

    same_day = dataset.date == row.date
    is_neighbor = dataset.gov_iso.isin(neighbors[row.gov_iso])

    return dataset.loc[same_day & is_neighbor, var].mean()

In [14]:
# For every base feature, add a `neighbor_<feature>` column holding the value
# returned by get_neighboring_vars for that row.
base_features = ['new_cases', 'new_deaths', 'fatalities', 'mean_rainfall', 'min_rainfall', 'max_rainfall']
for var in base_features:
    neighbor_values = feature_timeseries.apply(
        get_neighboring_vars,
        axis=1,
        args=(var, feature_timeseries),
    )
    feature_timeseries['neighbor_' + var] = neighbor_values

In [15]:
#neighbor_feature_timeseries = feature_timeseries.apply(get_neighboring_vars, args = (feature_timeseries,), axis=1)

In [16]:
# display the merged table, now including the neighbor_* columns
feature_timeseries

Unnamed: 0,gov_iso,date,new_cases,new_deaths,fatalities,mean_rainfall,min_rainfall,max_rainfall,neighbor_new_cases,neighbor_new_deaths,neighbor_fatalities,neighbor_mean_rainfall,neighbor_min_rainfall,neighbor_max_rainfall
0,YE-AB,2017-05-23,0.901606,0.015179,0.000000,0.000000,0.0,0.000000,0.943220,0.002315,0.000000,0.064464,0.0,0.742962
1,YE-AB,2017-05-24,0.901606,0.015179,0.000000,0.000000,0.0,0.000000,0.943220,0.002315,0.000000,0.228871,0.0,1.325468
2,YE-AB,2017-05-25,0.901606,0.015179,0.000000,0.000000,0.0,0.000000,0.943220,0.002315,0.000039,0.032354,0.0,0.550024
3,YE-AB,2017-05-26,0.901606,0.015179,0.000000,0.000000,0.0,0.000000,0.943220,0.002315,0.000000,0.322169,0.0,3.101775
4,YE-AB,2017-05-27,0.901606,0.015179,0.000000,1.486777,0.0,7.380904,0.943220,0.002315,0.000785,2.122804,0.0,23.119947
5,YE-AB,2017-05-28,1.123213,0.005060,0.000000,0.016977,0.0,0.458392,0.974399,0.005188,0.000000,0.246722,0.0,1.767995
6,YE-AB,2017-05-29,1.123213,0.005060,0.000000,5.410453,0.0,54.309643,0.974399,0.005188,0.000239,1.580253,0.0,13.095566
7,YE-AB,2017-05-30,1.123213,0.005060,0.000000,13.253213,0.0,59.607925,0.974399,0.005188,0.000000,0.840843,0.0,3.737630
8,YE-AB,2017-05-31,1.010892,0.000000,0.000000,0.000000,0.0,0.000000,0.595513,0.001596,0.000157,0.000000,0.0,0.000000
9,YE-AB,2017-06-01,1.010892,0.000000,0.000000,0.000000,0.0,0.000000,0.595513,0.001596,0.000860,0.000000,0.0,0.000000


In [17]:
# flat_neighbor_feature_timeseries = pd.DataFrame(columns=feature_timeseries.columns)
# for row in neighbor_feature_timeseries.iteritems():
#     flat_neighbor_feature_timeseries = flat_neighbor_feature_timeseries.append(row[1])

In [18]:
# names of the value columns, i.e. every column except the governorate and date keys
feature_timeseries_cols = feature_timeseries.columns.drop(['gov_iso', 'date'])

In [19]:
# Integer day offset of each row relative to the date in the table's first row.
# NOTE(review): this assumes the first row carries the earliest date — confirm.
reference_date = feature_timeseries.date.iloc[0]
elapsed = feature_timeseries.date - reference_date
feature_timeseries['days_from'] = elapsed.dt.days

In [20]:
# preview the first rows to check the new days_from column
feature_timeseries.head()

Unnamed: 0,gov_iso,date,new_cases,new_deaths,fatalities,mean_rainfall,min_rainfall,max_rainfall,neighbor_new_cases,neighbor_new_deaths,neighbor_fatalities,neighbor_mean_rainfall,neighbor_min_rainfall,neighbor_max_rainfall,days_from
0,YE-AB,2017-05-23,0.901606,0.015179,0.0,0.0,0.0,0.0,0.94322,0.002315,0.0,0.064464,0.0,0.742962,0
1,YE-AB,2017-05-24,0.901606,0.015179,0.0,0.0,0.0,0.0,0.94322,0.002315,0.0,0.228871,0.0,1.325468,1
2,YE-AB,2017-05-25,0.901606,0.015179,0.0,0.0,0.0,0.0,0.94322,0.002315,3.9e-05,0.032354,0.0,0.550024,2
3,YE-AB,2017-05-26,0.901606,0.015179,0.0,0.0,0.0,0.0,0.94322,0.002315,0.0,0.322169,0.0,3.101775,3
4,YE-AB,2017-05-27,0.901606,0.015179,0.0,1.486777,0.0,7.380904,0.94322,0.002315,0.000785,2.122804,0.0,23.119947,4


In [29]:
# Save the flat (un-rolled) feature table; the row index is not written.
unrolled_csv_path = '/Users/Rohil/Documents/iGEM/yemen/feature_engineering/exhaustive_feature_extraction_prep/unrolled_timeseries.csv'
feature_timeseries.to_csv(unrolled_csv_path, index=False)

In [21]:
def roll_timeseries_by_gov(timeseries, weeks_back):
    """Roll each governorate's series with tsfresh and stack the results.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Long-form table with a `gov_iso` column and a `days_from` sort column.
    weeks_back : int
        Maximum look-back of each window in weeks; passed to
        `roll_time_series` as `max_timeshift = weeks_back * 7`.

    Returns
    -------
    pandas.DataFrame
        The rolled rows of every governorate concatenated in order of first
        appearance, with an extra `id_col` column of tuples. When `timeseries`
        has no governorates, an empty frame with the columns of `timeseries`.

    Notes
    -----
    NOTE(review): `id_col` is built from whatever `roll_time_series` leaves in
    `gov_iso`, which depends on the installed tsfresh version — confirm.
    """
    days_back = weeks_back * 7

    # Collect the per-governorate results and concatenate once at the end.
    # Growing a frame with DataFrame.append inside the loop copies it on every
    # iteration, and DataFrame.append no longer exists in pandas >= 2.0.
    rolled_frames = []

    for gov in timeseries.gov_iso.unique():
        gov_timeseries = timeseries[timeseries.gov_iso == gov]
        rolled_gov_timeseries = roll_time_series(
            gov_timeseries,
            column_id='gov_iso',
            column_sort='days_from',
            column_kind=None,
            rolling_direction=1,
            max_timeshift=days_back,
        )

        # window id: the rolled `gov_iso` value joined with the governorate code,
        # then split on '_' into a tuple
        rolled_gov_timeseries['id_col'] = rolled_gov_timeseries.gov_iso.map(str) + '_' + gov
        rolled_gov_timeseries.id_col = rolled_gov_timeseries.id_col.str.split('_').map(tuple)

        rolled_frames.append(rolled_gov_timeseries)

    if not rolled_frames:
        # use the argument's own columns rather than the notebook-level frame
        return pd.DataFrame(columns=timeseries.columns)

    return pd.concat(rolled_frames)

In [22]:
# Rolled versions of the feature table for look-backs of 1, 2, 4 and 6 weeks.
week_1_rolled_timeseries = roll_timeseries_by_gov(timeseries=feature_timeseries, weeks_back=1)
week_2_rolled_timeseries = roll_timeseries_by_gov(timeseries=feature_timeseries, weeks_back=2)
week_4_rolled_timeseries = roll_timeseries_by_gov(timeseries=feature_timeseries, weeks_back=4)
week_6_rolled_timeseries = roll_timeseries_by_gov(timeseries=feature_timeseries, weeks_back=6)

  differences = df.groupby(grouper)[column_sort].apply(
  grouped_data = df.groupby(grouper)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [23]:
# (rows, columns) of the 1-week rolled table
week_1_rolled_timeseries.shape

(45108, 16)

In [24]:
# (rows, columns) of the 2-week rolled table
week_2_rolled_timeseries.shape

(83475, 16)

In [25]:
# (rows, columns) of the 4-week rolled table
week_4_rolled_timeseries.shape

(157122, 16)

In [26]:
# (rows, columns) of the 6-week rolled table
week_6_rolled_timeseries.shape

(226653, 16)

In [28]:
# Write each rolled table to its own CSV; the row index is not written.
rolled_outputs = [
    (week_1_rolled_timeseries,
     '/Users/Rohil/Documents/iGEM/yemen/feature_engineering/exhaustive_feature_extraction_prep/week_1_rolled_timeseries.csv'),
    (week_2_rolled_timeseries,
     '/Users/Rohil/Documents/iGEM/yemen/feature_engineering/exhaustive_feature_extraction_prep/week_2_rolled_timeseries.csv'),
    (week_4_rolled_timeseries,
     '/Users/Rohil/Documents/iGEM/yemen/feature_engineering/exhaustive_feature_extraction_prep/week_4_rolled_timeseries.csv'),
    (week_6_rolled_timeseries,
     '/Users/Rohil/Documents/iGEM/yemen/feature_engineering/exhaustive_feature_extraction_prep/week_6_rolled_timeseries.csv'),
]
for rolled_frame, csv_path in rolled_outputs:
    rolled_frame.to_csv(csv_path, index=False)