In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import random
from datetime import timedelta  

In [4]:
# Load the multi-country conflict-event export (countries and date range as per the filename).
dpath = {}
dpath['home'] = './data/conflicts'
dpath['subfolder'] = 'input'
dpath['filename'] = '2001-01-01-2019-10-01-Djibouti-Ethiopia-Kenya-Somalia.csv'
dfc = pd.read_csv(os.path.join(dpath['home'],dpath['subfolder'],dpath['filename']))

# Row selection:
#   * keep events whose id starts with 'SOM' (rows with a missing id are treated as
#     non-matching instead of raising on the slice),
#   * drop the 'Peaceful protest' sub-event type.
# Both conditions are combined into one mask and the result is copied, so the column
# assignments below operate on an independent frame rather than on a slice of the raw data.
is_somalia = dfc['event_id_cnty'].str.startswith('SOM', na=False)
is_peaceful_protest = dfc['sub_event_type'] == 'Peaceful protest'
dfc = dfc.loc[is_somalia & ~is_peaceful_protest].copy()

# Typesetting, then keep only the four columns used downstream and shorten the
# coordinate column names to 'lat' / 'lon'.
dfc = (
    dfc.assign(
        ts=pd.to_datetime(dfc['event_date']),
        latitude=dfc['latitude'].astype(float),
        longitude=dfc['longitude'].astype(float),
        fatalities=dfc['fatalities'].astype(int),
    )[['ts','latitude','longitude','fatalities']]
    .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)

# Write to csv (the write itself is disabled; only the target path is prepared)
dpath = {}
dpath['home'] = './data/conflicts'
dpath['subfolder'] = 'input'
dpath['filename'] = '2001-01-01-2019-10-01-Somalia-Events-Only.csv'
# dfc.to_csv(os.path.join(dpath['home'],dpath['subfolder'],dpath['filename']))

Choices that I made during preprocessing that need to be confirmed for validity:

1. To generate the heat map we only need lat, lon, fatality and time information. <br>
2. Filter for Somalia only; later the data can be augmented with surrounding countries, however the test set should only be Somalia. <br>
3. Filtered out peaceful protests. Are we trying to predict all the conflict sub-events? Or are some types of conflict more related to droughts / displacements? Are, for example, peaceful protests linked to drought? 

Question: Is there also displacement data available for surrounding countries like Ethiopia, Djibouti, Kenya?


In [None]:
# Alias only (no copy is made): `df` and `dfc` refer to the same DataFrame object.
df = dfc

In [None]:
from utils import create_idx_fixed_minmax

In [None]:
# Grid used to divide the map into zones: number of zones along each axis.
# (Earlier setting kept for reference: map_width = 5, map_height = 3.)
map_width, map_height = 64, 64
grid = (map_width, map_height)

# Attach a grid position to every row of a dataframe that has 'lat' and 'lon' columns;
# per the original note, (lon, lat) is mapped into (grid_h, grid_w).
df_indexed = create_idx_fixed_minmax(df, grid)

In [None]:
# Preview the first rows of the grid-indexed frame.
df_indexed.head()

In [None]:
############################################################################################
# Create heatmaps
############################################################################################

# Hyperparameters for heatmap construction
T = 1 # Period frequency 1, 2, 3 ,4 (not used in this cell)
freq = 'W' # Date frequency that can be D, W, M for Days, Weeks, Months and so on

# Count events per (grid cell, fatality count, period). The count is named 'CI' directly
# instead of renaming the positional column 0 afterwards.
df_indexed_count = (
    df_indexed
    .groupby(['grid_w','grid_h','fatalities', pd.Grouper(key='ts', freq=freq)], as_index=True)
    .size()
    .reset_index(name='CI')
)

# CI of a group = number of events + fatalities of those events (presumably "conflict
# intensity" -- confirm). Because 'fatalities' is a grouping key, every event in a group
# has the same fatality count, so the group's total fatalities is size * fatalities.
# Adding 'fatalities' only once would undercount any group holding more than one event.
df_indexed_count['CI'] = df_indexed_count['CI'] * (1 + df_indexed_count['fatalities'])
df_indexed_count = df_indexed_count.sort_values(by=['ts'])

In [None]:
# Preview the first rows of the aggregated frame (sorted by 'ts' in the previous cell).
df_indexed_count.head()

In [None]:
############################################################################################
# Build a dictionary keyed by period timestamp; each value is a (heatmap, coords) tuple
############################################################################################
heat_data = {}

# sort=False walks the timestamps in order of first appearance, with rows kept in frame order
for ts, points_by_date in df_indexed_count.groupby('ts', sort=False):

    # Grid cells that have at least one aggregated row in this period
    cells = list(zip(points_by_date['grid_w'], points_by_date['grid_h']))

    # Empty heatmap for this period, then accumulate CI into the matching cell
    heatmap = np.zeros(grid)
    for (cell_w, cell_h), intensity in zip(cells, points_by_date['CI']):
        heatmap[int(cell_w)][int(cell_h)] += intensity

    conf_coords = np.asarray(cells)

    # Store the heatmap together with the incident coordinates under the period timestamp
    heat_data[pd.to_datetime(ts)] = (heatmap, conf_coords)

print("Data size: {0}".format(len(heat_data)))

In [None]:
# Turn the {timestamp: (heatmap, coords)} dict into a frame. The dict keys become columns,
# so the frame is transposed to get one row per timestamp; the two tuple slots are then named.
heat_df = pd.DataFrame(heat_data).T
heat_df.columns = ['heatmap','coords']

In [None]:
# Preview the first row (one timestamp with its heatmap and coords).
heat_df.head(1)

In [None]:
# Write to csv
dpath = {}
dpath['home'] = './data/conflicts'
dpath['subfolder'] = 'output'
# NOTE(review): the name says 2010-2019 while the input file name covers 2001-2019 -- confirm.
dpath['filename'] = 'Somalia-Conflicts-2010-2019-64x64grid-weekly.csv'

# to_csv does not create missing folders, so make sure the target folder exists first.
os.makedirs(os.path.join(dpath['home'],dpath['subfolder']), exist_ok=True)

# NOTE(review): both columns hold numpy arrays, which end up in the CSV as their printed
# text; numpy abbreviates large arrays with '...' when printing, so verify that this file
# can be read back without loss (a binary format may be needed for the 64x64 heatmaps).
heat_df.to_csv(os.path.join(dpath['home'],dpath['subfolder'],dpath['filename']))

In [None]:
# # number of missing dates per year

# dti = pd.date_range(start = '1991-01-01', end = '2019-09-28' ).difference(heat_df.index)
# from collections import Counter
# Counter([str(i)[0:4] for i in list(dti)])

In [None]:
############################################################################################
# Plot some samples
############################################################################################

# Total number of periods before truncation (not used further in this cell)
n_samples = heat_df.shape[0]
# Keep only the last 100 periods (fewer if the frame is shorter)
heat_df = heat_df[-100:]

# Every image goes to the same folder; build the path once and create the folder up front
# so savefig cannot fail on a missing directory.
dpath = {}
dpath['home'] = './data/conflicts'
dpath['subfolder'] = 'images/weekly'
os.makedirs(os.path.join(dpath['home'],dpath['subfolder']), exist_ok=True)

# Iterate over the rows actually present (in order) instead of a fixed range(100),
# which raised IndexError whenever fewer than 100 periods were available.
for date in heat_df.index:
    datestring = str(date).split(' ')[0]

    # Get heatmap aggregation and incident coordinates
    coords = heat_df.loc[date,'coords']
    heatmap = heat_df.loc[date,'heatmap']

    # 6 x 6 inch figure
    f, ax = plt.subplots(figsize=(6, 6))

    ax.set_title("Heatmap: {0}".format(date))
    imtitle = "Heatmap-{}".format(datestring) + '.png'

    # Heatmap rows are indexed by grid_w and columns by grid_h, so grid_h (coords[:,1])
    # goes on the x axis and grid_w (coords[:,0]) on the y axis.
    ax.scatter(x=coords[:,1],y=coords[:,0], marker='x', s=30, color='green')
    ax.matshow(heatmap, cmap='jet',aspect = ax.get_aspect(), extent = (0,map_height) + (map_width,0))

    dpath['filename'] = imtitle
    f.savefig(os.path.join(dpath['home'],dpath['subfolder'],dpath['filename']))

    # The saved PNG is the output; close the figure so up to 100 figures do not stay open
    # in memory (closed figures are presumably no longer rendered inline -- confirm if needed).
    plt.close(f)