<a href="https://colab.research.google.com/github/LeonardoViotti/cdr-training/blob/develop/notebooks/aggregated-cdr-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CDR Analysis

 - Aggregated data
 - Quality Checks
 - Analysis
 - Map

# Environment set-up
Run the cell below:

In [None]:
#------------------------------------------------------------------------
# Libraries installation

# !pip install geopandas

#------------------------------------------------------------------------
# Useful functions

def time_complete(data, timefreq = 'D'):
    """Fill gaps in a time-indexed table with zero-valued rows.

    The index of ``data`` is moved into a column, parsed as datetimes and
    truncated to day precision. The table is then re-indexed on a complete
    range running from the earliest to the latest date found, so periods
    missing from the input appear as rows filled with 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose index holds the time variable (date strings or
        datetimes).
    timefreq : str, default 'D'
        Frequency passed to ``pd.date_range`` when building the full range.

    Returns
    -------
    pandas.DataFrame
        New table indexed by the complete range; the index is named 'date'.
    """
    data = data.reset_index()
    # After reset_index the former index is the first column
    timevar = data.columns[0]
    # The original cast, astype('datetime64[D]'), is rejected by pandas >= 2.0
    # (only s/ms/us/ns resolutions are supported). Parsing and normalising to
    # midnight keeps the day-level truncation that cast was asking for.
    data[timevar] = pd.to_datetime(data[timevar]).dt.normalize()
    full_time_range = pd.date_range(data[timevar].min(),
                                    data[timevar].max(),
                                    freq = timefreq)
    data = data.set_index(timevar)
    # Rows for dates absent from the input are created with 0 in every column
    data = data.reindex(full_time_range, fill_value=0)
    data.index.name = 'date'
    return data


# Let's start

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd

import datetime as dt

First, let's import the datasets we will use in this exercise.

In [None]:
# from google.colab import files
# files.upload()

In [None]:
!ls

Before we start, let's load and have a look at each one of the indicators.

In [None]:
# Load transactions per day data from the CSV file in the working directory
trans = pd.read_csv('transactions_per_day.csv')
# Preview the first rows to check that the file loaded as expected
trans.head()

In [None]:
# Load subscribers per day data from the CSV file in the working directory
subs = pd.read_csv('subscribers_per_day.csv')
# Preview the first rows to check that the file loaded as expected
subs.head()

In [None]:
# Load movements

## Exercise 1 - Quality checks

In [None]:
# Aggregate data by day, summing values across all regions.
# The string alias 'sum' selects pandas' built-in group sum; passing np.sum
# gives the same result but raises a FutureWarning on recent pandas versions.
subs_day = subs.groupby('date').agg({'value': 'sum'})

In [None]:
# Plot
# time_complete re-indexes on a full daily range (missing days become 0),
# so gaps in the data show up as drops to zero in the line plot
subs_day = time_complete(subs_day)
subs_day.plot()

## Exercise 2 - Changes over time

On the original data (region and day level), create percent change columns:
- Change from previous day
- Change from Baseline (defined as the average for each region in February)

Plot day-level data just like in the previous exercise.

Timeline:
- February 1st to March 15th: Baseline period
- March 16th: initial restrictions imposed
- March 30th: Lockdown measures on parts of Accra and Kumasi metropolitan areas

In [None]:
# Display the subscribers table loaded above
subs

In [None]:
# Create lag variable
# Absolute change from previous day
# % change from previous day
# plot

In [None]:
def day_lag(df):
    """Add the previous day's ``value`` for each region as column ``value_l``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns 'pcod' (region code), 'date' and 'value'.

    Returns
    -------
    pandas.DataFrame
        New table sorted by 'pcod' and 'date', with 'date' converted to
        datetime and an extra 'value_l' column. 'value_l' is NaN for the
        first row of each region and wherever the preceding row of the same
        region is not exactly one day earlier.
    """
    # Work on a copy so the caller's table is left untouched
    df = df.copy()

    # Make sure date is datetime type
    df['date'] = pd.to_datetime(df['date'])

    # Sort by region and date so shift/diff compare consecutive days
    df = df.sort_values(['pcod', 'date'])

    # Lag value within each region
    df['value_l'] = df.groupby('pcod')['value'].shift(1)

    # Blank the lag when the previous row is not the previous calendar day
    is_consecutive = df.groupby('pcod')['date'].diff() == dt.timedelta(days = 1)
    df['value_l'] = df['value_l'].where(is_consecutive, np.nan)

    return df

# Apply to the subscribers table; the result is shown as the cell output
# and is not assigned to a variable
day_lag(subs)


In [None]:
# Ghana lockdown?
# Create baseline average Feb values
# % change from baseline
# plot

In [None]:
def bl_values(df, baseline_end = None):
    """Attach the baseline average for each row's weekday as ``value_bl``.

    The baseline is the mean of 'value' per day of the week, computed over
    all rows dated before ``baseline_end``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns 'date' and 'value'.
    baseline_end : datetime-like, optional
        Exclusive end of the baseline period. Defaults to 2020-03-16, the
        day the initial restrictions were imposed.

    Returns
    -------
    pandas.DataFrame
        New table with 'date' converted to datetime and an extra 'value_bl'
        column. Rows whose weekday never occurs in the baseline period are
        dropped by the inner merge.
    """
    if baseline_end is None:
        baseline_end = dt.datetime(2020, 3, 16)

    # Work on a copy so the caller's table is left untouched
    df = df.copy()

    # Make sure date is datetime type
    df['date'] = pd.to_datetime(df['date'])

    # Create weekday variable to calculate baseline values
    df['weekday'] = df['date'].dt.dayofweek

    # Keep only entries before the end of the baseline period. No lower
    # bound is applied, so any earlier data also counts towards the baseline.
    bl = df[df['date'] < baseline_end]

    # Calculate baseline averages for each weekday.
    # For per-region baselines, group and merge on ['pcod', 'weekday'] instead.
    bl_averages = bl.groupby(['weekday']).agg({'value': 'mean'}).reset_index()

    # Merge baseline averages as a column on the original rows, then drop
    # the helper weekday column
    ndf = df.merge(bl_averages, on = ['weekday'],
                   suffixes = ('', '_bl')).drop('weekday', axis = 1)

    return ndf

# Day-level totals with their weekday baseline; reset_index turns the
# 'date' index back into a column, which bl_values expects
subs_day_bl = bl_values(subs_day.reset_index())

# subs_day.reset_index()
# subs = bl_values(subs)
# subs_day.reset_index()
# subs_day

In [None]:
# Relative change of each day's value against its weekday baseline,
# stored as a fraction (0.10 means +10%)
subs_day_bl['p_change_bl'] = (
    subs_day_bl['value']
    .sub(subs_day_bl['value_bl'])
    .div(subs_day_bl['value_bl'])
)

In [None]:
# Line plot of the change from baseline over time
subs_day_bl.plot(x = 'date', y = 'p_change_bl')


## Exercise 3 - Choropleth

TO DO:
- Convert to admin 
- Make sure column names match

In [None]:
import folium
import pandas as pd
import geopandas as gpd

In [None]:
# Read the boundaries file into a GeoDataFrame for inspection and plotting
gdf = gpd.read_file("admin1.geojson")
# Path to the same file, passed to folium.Choropleth as geo_data below
a1_geo = r'admin1.geojson'

In [None]:
# Quick static plot to check that the geometries loaded
gdf.plot()


In [None]:
# tdf_region = tdf.groupby('region').agg({'value': np.sum}).reset_index()
# tdf_region = tdf_region.rename(columns = {'region': 'pcod'})
# Values to map; the choropleth below reads the 'pcod' and 'value' columns
foo = pd.read_csv('location_event_counts_admin1_2020-02-05.csv')
foo

In [None]:
# (rows, columns) of the boundaries table
gdf.shape

In [None]:
# url = (
#     "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
# )
# state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"
# state_data = pd.read_csv(state_unemployment)
# state_geo = f"{url}/us-states.json"
# state_geo

In [None]:
# Display the table that feeds the choropleth
foo


In [None]:

# Base map centred on [7.28, -0.97] at zoom level 7
m = folium.Map(location=[7.28, -0.97], zoom_start=7)


# Colour each feature of the boundaries file by 'value': rows of foo are
# matched to features by comparing foo['pcod'] with feature.properties.pcod
folium.Choropleth(
    geo_data=a1_geo,
    name="choropleth",
    data=foo,
    columns=["pcod", "value"],
    key_on="feature.properties.pcod",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight = True,
    # NOTE(review): `reset` is not a documented folium.Choropleth argument
    # (it belonged to the older Map.choropleth method) - confirm that the
    # installed folium version accepts it
    reset = True,
).add_to(m)

# Add the layer control to the map
folium.LayerControl().add_to(m)

# Render the map as the cell output
m