### Sandbox Notebook

The main purpose of this project is to shed light to the biggest refuge crisis in history over time by story telling with visualization.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import *

In [2]:
unhcr = list(Path('data/unhcr/').iterdir())

In [3]:
unhcr

[PosixPath('data/unhcr/asylum_seekers_monthly.csv'),
 PosixPath('data/unhcr/resettlement.csv'),
 PosixPath('data/unhcr/demographics.csv'),
 PosixPath('data/unhcr/time_series.csv'),
 PosixPath('data/unhcr/asylum_seekers.csv'),
 PosixPath('data/unhcr/persons_of_concern.csv')]

### Data Preprocess

In [5]:
# Data has mixed types
asylum_seekers_monthly = pd.read_csv(unhcr[0])
resettlement = pd.read_csv(unhcr[1])
demographics = pd.read_csv(unhcr[2])
time_series = pd.read_csv(unhcr[3])
asylum_seekers = pd.read_csv(unhcr[4])
persons_of_concern = pd.read_csv(unhcr[5])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# What is trend ? Where is to trend ?
asylum_seekers_monthly.head(2)

Unnamed: 0,Country / territory of asylum/residence,Origin,Year,Month,Value
0,Australia,Afghanistan,1999,January,8
1,Australia,Afghanistan,1999,February,10


In [13]:
asylum_seekers.dtypes

Year                                        int64
Country / territory of asylum/residence    object
Origin                                     object
RSD procedure type / level                 object
Tota pending start-year                    object
of which UNHCR-assisted(start-year)        object
Applied during year                        object
decisions_recognized                       object
decisions_other                            object
Rejected                                   object
Otherwise closed                           object
Total decisions                            object
Total pending end-year                     object
of which UNHCR-assisted(end-year)          object
dtype: object

In [7]:
# Resettlement is the transfer of refugees from an asylum country 
# to another State that has agreed to admit them and ultimately grant them permanent settlement. 
# Who helps for good ?
resettlement.head(2)

Unnamed: 0,Country / territory of asylum/residence,Origin,Year,Value
0,Canada,Albania,1959,17
1,Canada,Austria,1959,17


In [14]:
resettlement.dtypes

Country / territory of asylum/residence    object
Origin                                     object
Year                                        int64
Value                                      object
dtype: object

In [8]:
# yearly flow of type of population
time_series.head(2)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Population type,Value
0,1951,Australia,Various/Unknown,Refugees (incl. refugee-like situations),180000
1,1951,Austria,Various/Unknown,Refugees (incl. refugee-like situations),282000


In [15]:
time_series.dtypes

Year                                        int64
Country / territory of asylum/residence    object
Origin                                     object
Population type                            object
Value                                      object
dtype: object

In [10]:
# refugee demographics by countries
# Who is affected the most ?
demographics.head(2)

Unnamed: 0,Year,Country / territory of asylum/residence,Location Name,Female 0-4,Female 5-11,Female 5-17,Female 12-17,Female 18-59,Female 60+,F: Unknown,F: Total,Male 0-4,Male 5-11,Male 5-17,Male 12-17,Male 18-59,Male 60+,M: Unknown,M: Total
0,2001,Afghanistan,Kabul,0,,1.0,,1,0,0.0,2,0,,0.0,,2,0,0.0,2
1,2001,Afghanistan,Various,14335,,45451.0,,99880,19234,412004.0,590904,14716,,47522.0,,114965,13025,435492.0,625720


In [16]:
demographics.dtypes

Year                                         int64
Country / territory of asylum/residence     object
Location Name                               object
Female 0-4                                  object
Female 5-11                                 object
Female 5-17                                float64
Female 12-17                                object
Female 18-59                                object
Female 60+                                  object
F: Unknown                                 float64
F: Total                                    object
Male 0-4                                    object
Male 5-11                                   object
Male 5-17                                  float64
Male 12-17                                  object
Male 18-59                                  object
Male 60+                                    object
M: Unknown                                 float64
M: Total                                    object
dtype: object

In [11]:
# Who helps the most ? To whom people ask help ? What helps the most ?
asylum_seekers.head(2)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
0,2000,Zimbabwe,Afghanistan,G / FI,0,0,5,5,0,0,0,5.0,0,0
1,2000,South Africa,Afghanistan,G / FI,8,1,0,0,0,0,0,,8,0


In [17]:
asylum_seekers.dtypes

Year                                        int64
Country / territory of asylum/residence    object
Origin                                     object
RSD procedure type / level                 object
Tota pending start-year                    object
of which UNHCR-assisted(start-year)        object
Applied during year                        object
decisions_recognized                       object
decisions_other                            object
Rejected                                   object
Otherwise closed                           object
Total decisions                            object
Total pending end-year                     object
of which UNHCR-assisted(end-year)          object
dtype: object

In [18]:
# pivot format of time_series
persons_of_concern.head(2)

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,Refugees (incl. refugee-like situations),Asylum-seekers (pending cases),Returned refugees,Internally displaced persons (IDPs),Returned IDPs,Stateless persons,Others of concern,Total Population
0,1951,Australia,Various/Unknown,180000,,,,,,,180000
1,1951,Austria,Various/Unknown,282000,,,,,,,282000


In [19]:
persons_of_concern.dtypes

Year                                          int64
Country / territory of asylum/residence      object
Origin                                       object
Refugees (incl. refugee-like situations)     object
Asylum-seekers (pending cases)               object
Returned refugees                            object
Internally displaced persons (IDPs)         float64
Returned IDPs                               float64
Stateless persons                            object
Others of concern                            object
Total Population                             object
dtype: object

### Filter Data


We will be filtering the data by:

`Origin` == '\*Syria\*' and `Year` > 2010

In [21]:
def filter_data(data):
    if 'Origin' in data.columns:
        data = data[(data.Year > 2010) & (data.Origin.str.contains('Syria'))]
    else:
        data = data[(data.Year > 2010)]
    return data

In [22]:
asylum_seekers_monthly = filter_data(asylum_seekers_monthly)
resettlement = filter_data(resettlement)
time_series = filter_data(time_series)
demographics = filter_data(demographics)
asylum_seekers = filter_data(asylum_seekers)
persons_of_concern = filter_data(persons_of_concern)

In [25]:
asylum_seekers.Year.min(), asylum_seekers.Year.max()

(2011, 2016)

In [26]:
resettlement.Year.min(), resettlement.Year.max()

(2011, 2016)

In [27]:
time_series.Year.min(), time_series.Year.max()

(2011, 2016)

In [28]:
demographics.Year.min(), demographics.Year.max()

(2011, 2016)

In [29]:
persons_of_concern.Year.min(), persons_of_concern.Year.max()

(2011, 2016)

### Correct Data Types

In [30]:
def str2num(data):
    """
    For each object type column check if any entry starts and
    ends with a digit. Replace * with np.nan since it's
    noted * is used to mask confidential information.
    Convert those columns to float type.
    """
    columns = data.select_dtypes(['object']).columns
    for c in columns:
        if data[c].str.contains('^\d*\d$', regex=True).any():
            print(f'Converting column : {c}')
            data[c] = data[c].replace('*', np.nan).astype(np.float)
    return data

In [31]:
asylum_seekers_monthly = str2num(asylum_seekers_monthly)

Converting column : Value


In [32]:
resettlement = str2num(resettlement)

Converting column : Value


In [35]:
time_series = str2num(time_series)

Converting column : Value


In [36]:
demographics = str2num(demographics)

Converting column : Female 0-4
Converting column : Female 5-11
Converting column : Female 12-17
Converting column : Female 18-59
Converting column : Female 60+
Converting column : F: Total
Converting column : Male 0-4
Converting column : Male 5-11
Converting column : Male 12-17
Converting column : Male 18-59
Converting column : Male 60+
Converting column : M: Total


In [37]:
persons_of_concern = str2num(persons_of_concern)

Converting column : Refugees (incl. refugee-like situations)
Converting column : Asylum-seekers (pending cases)
Converting column : Returned refugees
Converting column : Stateless persons
Converting column : Others of concern
Converting column : Total Population


### Save new data

In [38]:
import os

In [39]:
# create new directory for clean data
os.makedirs("data/unhcr_clean", exist_ok=False)

In [42]:
clean_unhcr = "data/unhcr_clean/"

In [47]:
asylum_seekers_monthly.to_csv(clean_unhcr + unhcr[0].name, index=False)
resettlement.to_csv(clean_unhcr + unhcr[1].name, index=False)
time_series.to_csv(clean_unhcr + unhcr[2].name, index=False)
demographics.to_csv(clean_unhcr + unhcr[3].name, index=False)
asylum_seekers.to_csv(clean_unhcr + unhcr[4].name, index=False)
persons_of_concern.to_csv(clean_unhcr + unhcr[5].name, index=False)

## Questions

### How is trend of the Crisis ?

wikipedia link for timeline: https://en.wikipedia.org/wiki/Timeline_of_the_Syrian_Civil_War

If time allowed timelines can be added to the plots

In [89]:
import pdb

In [48]:
import plotly.plotly as py
import plotly.graph_objs as go

In [97]:
def col2dt(data, columns):
    """Convert Year, Month, Day to Pandas Datetime"""
    #pdb.set_trace()
    # extract datetime from Year Month
    data = data.copy()
    date = ""
    for c in columns:
        date += (data[c].astype(str) + "/")
    date = date.apply(lambda x: x[:-1])
    data["date"] = pd.to_datetime(date)
    return data

In [99]:
asylum_seekers_monthly = col2dt(asylum_seekers_monthly, ["Year", "Month"])

In [130]:
asylum_seekers_monthly.Year.min(), asylum_seekers_monthly.Year.max()

(2011, 2017)

#### a) Number Plot

In [100]:
# There is 45 unique countries that refugees seek asylum
asylum_seekers_monthly.iloc[:, 0].nunique()

45

#### Global Trend of Syrian Asylum Seekers 2011-2016

In [104]:
# CREATE DATA
# Total trend over the years of asylum seekers
total_asy_seek =\
    asylum_seekers_monthly.groupby(['date'])['Value'].sum().reset_index(drop=False)
total_asy_seek.sort_values('date', ascending=True, inplace=True)

In [105]:
total_asy_seek.head()

Unnamed: 0,date,Value
0,2011-01-01,470.0
1,2011-02-01,436.0
2,2011-03-01,496.0
3,2011-04-01,461.0
4,2011-05-01,570.0


In [147]:
# create plotly data
plotly_data = [
    go.Scatter(
    x=total_asy_seek.date,
    y=total_asy_seek.Value,
    fill= 'tozeroy')
]

# create custom layout
plotly_layout = go.Layout(
    title = "Global Number of Asylum Seekers Originating From Syria" ,
    
    xaxis=dict(
        title='Date',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    
    yaxis=dict(
        title='Number of Refugees',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

# create figure
fig = dict(data=plotly_data, layout=plotly_layout)

In [148]:
# plot inline
py.iplot(fig)

#### Country Trend of Syrian Asylum Seekers 2011-2016

##### do interacitve map plot over years, time series plot is very crowded and not looking good

In [113]:
COUNTRY = asylum_seekers_monthly.columns[0]

In [114]:
# CREATE DATA
# Total trend over the years of asylum seekers by country
total_country_asy_seek =\
    asylum_seekers_monthly.groupby(['date', COUNTRY])['Value'].sum().reset_index(drop=False)
total_country_asy_seek.sort_values('date', ascending=True, inplace=True)

In [119]:
total_country_asy_seek.head()

Unnamed: 0,date,Country / territory of asylum/residence,Value
0,2011-01-01,Austria,26.0
23,2011-01-01,United Kingdom of Great Britain and Northern I...,6.0
22,2011-01-01,USA (INS/DHS),1.0
21,2011-01-01,USA (EOIR),8.0
20,2011-01-01,Turkey,1.0


In [127]:
total_country_asy_seek.date.min(), total_country_asy_seek.date.max()

(Timestamp('2011-01-01 00:00:00'), Timestamp('2017-05-01 00:00:00'))

In [149]:
# create plotly data

plotly_data = []
uniq_country = total_country_asy_seek[COUNTRY].unique()

for country in uniq_country:
    # filter by country
    country_data = total_country_asy_seek.loc[total_country_asy_seek[COUNTRY] == country, :]
    # sort date by ascending
    country_data.sort_values(by="date", ascending=True, inplace=True)
    
    trace = go.Scatter(
                x=country_data.date,
                y=country_data.Value,
                name = f"{country}",
    #            line = dict(color = '#7F7F7F'),
                opacity = 0.8)
    plotly_data.append(trace)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [154]:
# create custom layout
plotly_layout = go.Layout(
    # title
    title = "Number of Asylum Seekers Originating From Syria by Country" ,
    # x-axis
    xaxis=dict(
        title='Date',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    # y-axis
    yaxis=dict(
        title='Number of Refugees',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

# create figure
fig = dict(data=plotly_data, layout=plotly_layout)

In [155]:
# plot inline
py.iplot(fig)

### Causalities of Civil War

Process data

In [148]:
causalities = pd.read_csv('./data/civil_war_causalities.csv', encoding="ISO-8859-1")

In [154]:
causalities.dtypes

name          object
status        object
gender        object
province      object
birthPlace    object
deathDate     object
deathCause    object
actor         object
dtype: object

In [155]:
causalities.head(2)

Unnamed: 0,name,status,gender,province,birthPlace,deathDate,deathCause,actor
0,Abo Yazan al-Jbailia,Non-Civilian,Adult - Male,Daraa,Jbailieh,2018-04-13,Explosion,Not identified
1,Maher Mohammad al-Ezo al-Shami,Civilian,Child - Male,Hama,Huribnefseh,2018-04-13,Shelling,Syrian government and affiliated militias


In [156]:
causalities.deathCause.unique()

array(['Explosion', 'Shelling', 'Shooting', 'Warplane shelling',
       'Field Execution', 'Chemical and toxic gases',
       'Detention - Torture', 'Kidnapping - Execution', 'Other',
       'Unknown', 'Un-allowed to seek Medical help', 'Siege',
       'Detention - Execution', 'Kidnapping - Torture',
       'Kidnapping - Torture - Execution',
       'Detention - Torture - Execution', nan], dtype=object)

In [152]:
causalities.groupby('actor').count()

Unnamed: 0_level_0,name,status,gender,province,birthPlace,deathDate,deathCause
actor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Al-Nusra Front,221,221,221,221,174,221,221
Armed opposition groups,1914,1915,1915,1915,1616,1915,1915
International coalition forces,2190,2191,2191,2191,1793,2191,2191
Not identified,3462,3464,3464,3464,3067,3464,3464
Russian troops,3871,3873,3873,3873,3577,3873,3873
Self administration forces,599,599,599,599,489,599,599
Syrian government and affiliated militias,20603,20613,20613,20613,19262,20608,20613
The organization of the Islamic State in Iraq and the Levant - ISIS,6718,6720,6720,6720,5647,6720,6720


In [153]:
causalities.groupby(['actor', 'gender']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,status,province,birthPlace,deathDate,deathCause
actor,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Al-Nusra Front,Adult - Female,6,6,6,5,6,6
Al-Nusra Front,Adult - Male,212,212,212,168,212,212
Al-Nusra Front,Child - Male,3,3,3,1,3,3
Armed opposition groups,Adult - Female,150,150,150,127,150,150
Armed opposition groups,Adult - Male,1548,1549,1549,1302,1549,1549
Armed opposition groups,Child - Female,68,68,68,56,68,68
Armed opposition groups,Child - Male,148,148,148,131,148,148
International coalition forces,Adult - Female,343,343,343,310,343,343
International coalition forces,Adult - Male,1363,1363,1363,1042,1363,1363
International coalition forces,Child - Female,188,188,188,169,188,188


In [150]:
causalities.groupby(['status', 'gender']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,province,birthPlace,deathDate,deathCause,actor
status,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Civilian,Adult - Female,16927,16931,15988,16930,16931,3642
Civilian,Adult - Male,107974,108009,98628,107997,108009,17791
Civilian,Child - Female,8733,8735,8335,8730,8735,2011
Civilian,Child - Male,17583,17588,16603,17587,17588,3688
Non-Civilian,Adult - Female,30,30,30,30,30,20
Non-Civilian,Adult - Male,60151,60164,52812,60155,60164,12376
Non-Civilian,Child - Female,2,2,2,2,2,2
Non-Civilian,Child - Male,448,448,405,448,448,66
