## Preliminaries

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
# Nominatim's usage policy asks every client to identify itself, and current
# geopy releases refuse to build a Nominatim geocoder without a user_agent.
geolocator = Nominatim(user_agent='myworld-survey-cleaning')

## Clean MyWorld Survey Data

In [2]:
# Open the raw MyWorld survey workbook; a sheet is parsed from it in the next cell.
xls_file = pd.ExcelFile('raw_data/latestmyworld-women_2016_07_05_15_44_10.xlsx')

In [3]:
# Load the workbook's first sheet into a DataFrame of raw survey responses.
first_sheet_name = xls_file.sheet_names[0]
xls_df = xls_file.parse(first_sheet_name)

In [4]:
# Start from an empty frame; the cleaned survey columns are added to it below.
df = pd.DataFrame()

In [None]:
# (clean column name, raw survey column) pairs, listed in output column order.
# Later cells select columns by position, so this order matters.
survey_column_sources = [
    # q1 / q2
    ('Belief In Earning Opportunity Equality', 'q1'),
    ('Belief In Pay Equality', 'q2'),
    # q3/1 .. q3/7 and the q3a follow-up
    ('Education Helps Women', 'q3/1'),
    ('Role Models Help Women', 'q3/2'),
    ('Family Planning Helps Women', 'q3/3'),
    ('Maternity Pay And Childcare Helps Women', 'q3/4'),
    ('Minimum Wage Help Women', 'q3/5'),
    ('Access To Banking Helps Women', 'q3/6'),
    ('Womens Groups And Charities Helps Women', 'q3/7'),
    ('What Helps Women?', 'q3a'),
    # q4/1 .. q4/6 and the q4a follow-up
    ('Fear Of Violence Is A Barrier', 'q4/1'),
    ('Laws Against Women Working Are A Barrier', 'q4/2'),
    ('Family Responsibilities Are A Barrier', 'q4/3'),
    ('Lack Of Experience Or Skills Is A Barrier', 'q4/4'),
    ('Traditional Views Of Women Are A Barrier', 'q4/5'),
    ('Lack Of ICT Access Is A Barrier', 'q4/6'),
    ('What Are Barriers To Women?', 'q4a'),
    # q5 / q6
    ('Strength Of Belief That Government Is Helping Women', 'q5'),
    ('What Helped You The Most?', 'q6'),
    # respondent details
    ('Gender', 'respondent/gender'),
    ('Education', 'respondent/edu'),
    ('Age', 'respondent/age'),
    ('Country', 'respondent/country'),
    ('City', 'respondent/qd5'),
]

for clean_name, raw_name in survey_column_sources:
    df[clean_name] = xls_df[raw_name]

# Placeholder column; the geolocation cell overwrites it with coordinates.
df['Location'] = np.nan

In [None]:
# Geolocate (city, country). If the city doesn't exist, just geolocate the country.

def _geocode_place(city, country, cache):
    """Return ``(latitude, longitude)`` for one respondent, or ``np.nan``.

    Parameters
    ----------
    city, country : values taken from the 'City' and 'Country' columns.
        A float city (how pandas represents a missing cell) means "no city",
        in which case only the country is geocoded.
    cache : dict
        Maps an already-resolved query to its coordinates so that respondents
        sharing a place trigger a single network request. Only successful
        lookups are stored, so a transient failure is retried on the next row.

    Any lookup failure (network error, unknown place, unusable input) yields
    ``np.nan``. ``KeyboardInterrupt`` is deliberately NOT swallowed, so this
    long-running cell can still be interrupted.
    """
    try:
        query = country if isinstance(city, float) else ', '.join((city, country))
        if query in cache:
            return cache[query]
        loc = geolocator.geocode(query)
    except Exception:
        return np.nan
    if loc is None:
        # The geocoder found no match for this query.
        return np.nan
    cache[query] = (loc.latitude, loc.longitude)
    return cache[query]


_geocode_cache = {}
geo = [_geocode_place(city, country, _geocode_cache)
       for city, country in zip(df['City'], df['Country'])]

df['Location'] = geo

In [None]:
def true_false_converter(x):
    """Normalise a yes/no style survey answer.

    Values equal to True or 'yes' become 'Yes', values equal to False or 'no'
    become 'No', and 'dn' becomes NaN. Anything else is returned unchanged.
    """
    # Equality (not identity) is used on purpose so that values which merely
    # compare equal to True/False are converted as well.
    if x == True or x == 'yes':
        return 'Yes'
    if x == False or x == 'no':
        return 'No'
    if x == 'dn':
        return np.nan
    return x

# Normalise the first 16 columns (the q1 through q4/6 answers) cell by cell.
answer_columns = df.columns[0:16]
df[answer_columns] = df[answer_columns].applymap(true_false_converter)

In [None]:
def capitalizer(x):
    """Title-case a text value, e.g. 'female' -> 'Female'.

    Values without a ``title`` method (a missing cell shows up as float NaN,
    or as None) are returned unchanged instead of raising AttributeError.
    """
    try:
        return x.title()
    except AttributeError:
        return x

# Apply capitalizer to every value of the Gender column.
df['Gender'] = df['Gender'].map(capitalizer)

In [None]:
def education_converter(x):
    """Translate a numeric education code (1-4) into its descriptive label.

    Any value that matches none of the codes yields None.
    """
    labels_by_code = (
        (1, 'Some Primary School'),
        (2, 'Finished Primary School'),
        (3, 'Finished Secondary School'),
        (4, 'Beyond Secondary School'),
    )
    for code, label in labels_by_code:
        if x == code:
            return label
    return None
    
# Replace the numeric education codes with their labels (unrecognised codes become None).
df['Education'] = df['Education'].map(education_converter)

In [None]:
# Write the survey answers without the three location columns; index=True keeps
# the row index as the first CSV column.
df.drop(['City', 'Country', 'Location'], axis=1).to_csv('clean_data/survey_data.csv', index=True)

In [None]:
# Export the respondent location columns. They are selected by name rather than
# with a positional slice (df.columns[22:]) so that adding or reordering columns
# upstream cannot silently change what ends up in this file.
df[['Country', 'City', 'Location']].to_csv('clean_data/survey_location_data.csv', index=True)

In [None]:
# Scratch check: the number of columns in df multiplied by 5623.
# NOTE(review): 5623 is presumably the number of survey rows, which would make
# this the total cell count - confirm, or derive it from the data instead.
len(df.columns.values) * 5623

## Clean UReport Data

In [20]:
# Load the UReport poll results; header=1 takes the column names from the
# second line of the file.
ur_df = pd.read_csv('raw_data/UN Women poll results - Hoja 1.csv', header=1)

In [21]:
# Readable names for the UReport poll columns, keyed by the header text found
# in the raw file ("Unnamed: N" is what pandas calls a column with no header).
ureport_column_names = {
    "Unnamed: 0": "Country",
    "Unnamed: 1": "Date",
    "Unnamed: 2": "Total Polled",
    "Unnamed: 3": "Responded",
    "%YES": "Believes In Pay Equity",
    "%NO": "Does Not Belive In Pay Equity",
    "Fear of violence %": "Fear Of Violence Is A Barrier",
    "Legal obstacles %": "Laws Against Women Working Is A Barrier",
    "% Family responsibilities ": "Family Responsibilities Are A Barrier",
    "Lack of skills %": "Lack Of Experience Or Skills Is A Barrier",
    "Traditional views of women": "Traditional Views Of Women Are A Barrier",
    "Lack of digital connectivity": "Lack Of ICT Access Is A Barrier",
    "Your own": "Other Things Are Barriers",
    "Good education": "Education Helps Women",
    "Women rolemodels": "Role Models Help Women",
    "Family planning & contraception": "Family Planning Helps Women",
    "Maternity pay & childcare": "Maternity Pay And Childcare Helps Women",
    "access to banking": "Access To Banking Helps Women",
    "women's groups": "Womens Groups And Charities Helps Women",
    "your own": "Other Things Help Women",
}

# index=str additionally converts every row label to a string.
ur_df = ur_df.rename(index=str, columns=ureport_column_names)

In [22]:
def string_to_float(x):
    """Convert a European-formatted number string to a float.

    '.' is treated as a thousands separator and removed, ',' is treated as the
    decimal mark: '6.447' -> 6447.0 and '1.234,5' -> 1234.5.

    Non-string values (cells pandas already parsed as numbers, NaN) are
    returned unchanged. A string that still is not a number after the
    separators are normalised is returned in that normalised form, as before,
    rather than raising.
    """
    if not isinstance(x, str):
        return x
    cleaned = x.replace('.', '').replace(',', '.')
    try:
        return float(cleaned)
    except ValueError:
        return cleaned

# Run both respondent-count columns through string_to_float, value by value.
for count_column in ('Total Polled', 'Responded'):
    ur_df[count_column] = [string_to_float(value) for value in ur_df[count_column]]

In [23]:
# Drop the 'Global' and 'Central African Republic' rows, then display the result.
excluded_countries = ['Global', 'Central African Republic']
ur_df = ur_df[~ur_df['Country'].isin(excluded_countries)]
ur_df

Unnamed: 0,Country,Date,Total Polled,Responded,Believes In Pay Equity,Does Not Belive In Pay Equity,Fear Of Violence Is A Barrier,Laws Against Women Working Is A Barrier,Family Responsibilities Are A Barrier,Lack Of Experience Or Skills Is A Barrier,Traditional Views Of Women Are A Barrier,Lack Of ICT Access Is A Barrier,Other Things Are Barriers,Education Helps Women,Role Models Help Women,Family Planning Helps Women,Maternity Pay And Childcare Helps Women,Access To Banking Helps Women,Womens Groups And Charities Helps Women,Other Things Help Women
0,Swaziland,29.06.16,6447.0,2078.0,57%,43%,11%,5%,39%,14%,24%,3%,4%,52%,11%,12%,7%,2%,11%,5%
1,Guinea,28.06.16,7126.0,3493.0,72%,27%,12%,4%,52%,13%,12%,1%,6%,49%,14%,13%,4%,4%,12%,3%
2,Brazil,29.6.16,2353.0,339.0,18%,82%,23%,13%,28%,3%,21%,1%,11%,37%,12%,16%,16%,3%,15%,1%
3,Burkina Faso,29.06.2016,11238.0,6513.0,76%,23%,16%,5%,42%,6%,21%,1%,8%,31%,9%,12%,3%,23%,17%,5%
4,Cameroon,29.06.16,11789.0,3459.0,69%,27%,12%,5%,48%,10%,15%,2%,8%,47%,19%,12%,4%,4%,9%,5%
5,Ireland,28.06.2016,175.0,36.0,81%,19%,6%,0%,26%,0%,61%,0%,3%,37%,13%,10%,27%,0%,3%,10%
6,Mexico,29.06.16,5364.0,1387.0,22%,78%,9%,6%,17%,3%,59%,0%,6%,28%,5%,5%,16%,29%,9%,8%
7,Ukraine,22.06.16,6530.0,1844.0,45%,55%,5%,4%,53%,7%,21%,1%,5%,30%,17%,12%,20%,1%,11%,6%
8,Zimbabwe,28.06.16,24143.0,6574.0,77%,22%,15%,11%,37%,9%,19%,3%,4%,48%,15%,6%,10%,4%,12%,4%
9,Liberia,29.06.16,86816.0,12139.0,67%,33%,22%,12%,30%,25%,6%,2%,4%,73%,6%,6%,7%,3%,3%,2%


## Add Extra Columns

In [24]:
# Country name -> income level label. Merged onto ur_df by country name below.
# NOTE(review): 'Nigiera' looks like a misspelling of 'Nigeria'. The merge below
# uses pandas' default inner join, so a key that does not match the Country
# values exactly would drop that country - confirm against the raw data.
Income = {'Swaziland':'Middle',
          'Guinea':'Low',
          'Brazil':'Middle',
          'Burkina Faso':'Low',
          'Cameroon':'Middle',
          'Ireland':'Very High',
          'Mexico':'High',
          'Ukraine':'Middle',
          'Zimbabwe':'Low',
          'Liberia':'Low',
          'Pakistan':'Middle',
          'Malaysia':'Middle',
          'Indonesia':'Middle',
          'Nigiera':'Middle',
         }

# Country name -> region label, with the same keys as Income.
# NOTE(review): the labels mix granularities ('Asia' vs 'South East Asia',
# 'Central America' vs 'Latin America and the Caribbean') - confirm this is intended.
Region = {'Swaziland':'Sub-Saharan Africa',
          'Guinea':'Sub-Saharan Africa',
          'Brazil':'Latin America and the Caribbean',
          'Burkina Faso':'Sub-Saharan Africa',
          'Cameroon':'Sub-Saharan Africa',
          'Ireland':'Europe',
          'Mexico':'Central America',
          'Ukraine':'Eastern Europe and central Asia',
          'Zimbabwe':'Sub-Saharan Africa',
          'Liberia':'Sub-Saharan Africa',
          'Pakistan':'Asia',
          'Malaysia':'Asia',
          'Indonesia':'South East Asia',
          'Nigiera':'Sub-Saharan Africa',
         }

def _attach_country_attribute(frame, mapping, column_name):
    """Join a per-country attribute onto ``frame`` via its 'Country' column.

    Parameters
    ----------
    frame : DataFrame with a 'Country' column.
    mapping : dict of country name -> attribute value.
    column_name : name given to the new attribute column.

    Returns
    -------
    (lookup, merged) : the one-column lookup frame indexed by country name, and
        ``frame`` inner-joined with it.

    The join is an inner join, so rows whose country is absent from ``mapping``
    are dropped. That used to happen silently; a warning now names the
    affected countries so a misspelt key is noticed.
    """
    import warnings

    lookup = pd.Series(mapping, name=column_name).to_frame()
    unmatched = sorted(set(frame['Country'].dropna()) - set(mapping), key=str)
    if unmatched:
        warnings.warn('No %s defined for: %s; these rows are dropped by the merge.'
                      % (column_name, ', '.join(str(name) for name in unmatched)))
    merged = frame.merge(lookup, left_on='Country', right_index=True)
    return lookup, merged


df_income, ur_df = _attach_country_attribute(ur_df, Income, 'Income Level')
df_region, ur_df = _attach_country_attribute(ur_df, Region, 'Region')
ur_df

Unnamed: 0,Country,Date,Total Polled,Responded,Believes In Pay Equity,Does Not Belive In Pay Equity,Fear Of Violence Is A Barrier,Laws Against Women Working Is A Barrier,Family Responsibilities Are A Barrier,Lack Of Experience Or Skills Is A Barrier,...,Other Things Are Barriers,Education Helps Women,Role Models Help Women,Family Planning Helps Women,Maternity Pay And Childcare Helps Women,Access To Banking Helps Women,Womens Groups And Charities Helps Women,Other Things Help Women,Income Level,Region
0,Swaziland,29.06.16,6447.0,2078.0,57%,43%,11%,5%,39%,14%,...,4%,52%,11%,12%,7%,2%,11%,5%,Middle,Sub-Saharan Africa
1,Guinea,28.06.16,7126.0,3493.0,72%,27%,12%,4%,52%,13%,...,6%,49%,14%,13%,4%,4%,12%,3%,Low,Sub-Saharan Africa
2,Brazil,29.6.16,2353.0,339.0,18%,82%,23%,13%,28%,3%,...,11%,37%,12%,16%,16%,3%,15%,1%,Middle,Latin America and the Caribbean
3,Burkina Faso,29.06.2016,11238.0,6513.0,76%,23%,16%,5%,42%,6%,...,8%,31%,9%,12%,3%,23%,17%,5%,Low,Sub-Saharan Africa
4,Cameroon,29.06.16,11789.0,3459.0,69%,27%,12%,5%,48%,10%,...,8%,47%,19%,12%,4%,4%,9%,5%,Middle,Sub-Saharan Africa
5,Ireland,28.06.2016,175.0,36.0,81%,19%,6%,0%,26%,0%,...,3%,37%,13%,10%,27%,0%,3%,10%,Very High,Europe
6,Mexico,29.06.16,5364.0,1387.0,22%,78%,9%,6%,17%,3%,...,6%,28%,5%,5%,16%,29%,9%,8%,High,Central America
7,Ukraine,22.06.16,6530.0,1844.0,45%,55%,5%,4%,53%,7%,...,5%,30%,17%,12%,20%,1%,11%,6%,Middle,Eastern Europe and central Asia
8,Zimbabwe,28.06.16,24143.0,6574.0,77%,22%,15%,11%,37%,9%,...,4%,48%,15%,6%,10%,4%,12%,4%,Low,Sub-Saharan Africa
9,Liberia,29.06.16,86816.0,12139.0,67%,33%,22%,12%,30%,25%,...,4%,73%,6%,6%,7%,3%,3%,2%,Low,Sub-Saharan Africa


In [25]:
# Save the cleaned UReport table, including the Income Level and Region columns.
ur_df.to_csv('clean_data/ureport_country_data.csv')

## Create Aggregate Ranking

In [26]:
# The ur_df columns holding the "barrier" percentages. The names must match the
# renamed UReport columns exactly (note "Laws Against Women Working Is A Barrier").
barriers = ["Fear Of Violence Is A Barrier",
            "Laws Against Women Working Is A Barrier",
            "Family Responsibilities Are A Barrier",
            "Lack Of Experience Or Skills Is A Barrier",
            "Traditional Views Of Women Are A Barrier",
            "Lack Of ICT Access Is A Barrier",
            "Other Things Are Barriers"]

In [27]:
def meaner(x):
    """Return the mean of a Series of percentage strings such as '57%'.

    The '%' signs are stripped and the remainder parsed as numbers, so decimal
    percentages ('12.5%') are accepted and missing values are skipped by the
    mean instead of making the integer cast fail.
    """
    return pd.to_numeric(x.str.replace('%', '')).mean()

In [28]:
# Average each barrier column across countries, then turn the resulting Series
# (indexed by barrier name) into a one-column frame.
barrier_means = ur_df[barriers].apply(meaner)
ur_df_barriers_ranked = barrier_means.to_frame(name='Average Percent Agreed')

In [33]:
# Copy the index (the barrier names) into a regular 'Barrier' column, then display the frame.
ur_df_barriers_ranked['Barrier'] = ur_df_barriers_ranked.index
ur_df_barriers_ranked

Unnamed: 0,Average Percent Agreed,Barrier
Fear Of Violence Is A Barrier,14.285714,Fear Of Violence Is A Barrier
Laws Against Women Working Is A Barrier,6.071429,Laws Against Women Working Is A Barrier
Family Responsibilities Are A Barrier,37.928571,Family Responsibilities Are A Barrier
Lack Of Experience Or Skills Is A Barrier,8.357143,Lack Of Experience Or Skills Is A Barrier
Traditional Views Of Women Are A Barrier,25.642857,Traditional Views Of Women Are A Barrier
Lack Of ICT Access Is A Barrier,1.428571,Lack Of ICT Access Is A Barrier
Other Things Are Barriers,5.428571,Other Things Are Barriers


In [34]:
# Order the barriers from most to least agreed-with and save the ranking.
barriers_by_agreement = ur_df_barriers_ranked.sort_values('Average Percent Agreed', ascending=False)
barriers_by_agreement.to_csv('clean_data/barriers_ranked.csv')