In [1]:
import numpy as np
import matplotlib.pyplot as plt

def convert_float(dataframe):
  '''
  Convert every column that holds numeric data into floats, in place.

  INPUT
    dataframe - any dataframe. It is modified in place.
  OUTPUT
    None. For every column that cannot be converted, the line
    "Could not convert <column name>" is printed.

  For each column the placeholders '-' and ' ' (a single space) are replaced with
  NaN, and the column is then cast to float. Columns that are naturally strings
  fail the cast; they keep their values (with the placeholders already replaced by
  NaN), are reported, and the function moves on to the next column.

  Both placeholders are needed: most blanks in the Human Freedom Index data are
  denoted by '-', but at least one scored factor (ef_regulation_labor_dismissal)
  contained a single space instead.

  Only ValueError and TypeError are caught, which are the errors raised when a value
  cannot be parsed as a float. Any other error (for example a programming mistake)
  is allowed to surface instead of being reported as an unconvertible column.

  Debugging tip: if a column you expect to be numeric is reported as failing,
  temporarily uncomment the three lines in the except branch to print the offending
  values of that column.
  '''

  for col in dataframe:
    try:
      # np.nan is the spelling that exists in every NumPy version (np.NaN was removed in 2.0).
      dataframe[col] = dataframe[col].replace(['-', ' '], np.nan)
      dataframe[col] = dataframe[col].astype(float)
    except (ValueError, TypeError):
      # uncomment the following three lines to debug a failing column that should be able to convert.
      # for row in dataframe[col]:
      #   if not isinstance(row, float):
      #     print(row)
      # f-string so that non-string column labels (e.g. integers) can be reported too.
      print(f'Could not convert {col}')


def most_missing_col(dataframe, threshold):
  '''
  List the columns whose share of missing values exceeds a threshold.

  INPUT
    dataframe - the processed dataframe, with NaNs marking missing values.
    threshold - fraction of missing values (between 0 and 1) a column must exceed
    to be listed.
  OUTPUT
    missing_col_list - list of column labels whose fraction of missing values is
    strictly greater than the threshold, in the dataframe's column order.
  '''

  # Fraction of null entries per column (the mean of a boolean mask).
  missing_fraction = dataframe.isnull().mean()
  too_sparse = missing_fraction > threshold

  missing_col_list = list(dataframe.columns[too_sparse])
  return missing_col_list


def country_per_region(selected_region, response, dataframe):
  '''
  Mean of one variable per country within a single region.

  INPUT
    selected_region - region label to keep; rows are selected where the "region"
    column equals this value.
    response - name of the column to take the mean of.
    dataframe - the dataset; must contain "region", "countries" and the response column.
  OUTPUT
    out_data - Series indexed by country, holding the mean of the response
    variable over that country's rows in the selected region.

  The function can be called on its own or inside a loop over several regions
  or variables.

  The response column is selected before aggregating. Averaging the whole frame
  first would also try to average the string "region" column, which raises a
  TypeError on pandas >= 2.0, and would compute means that are thrown away.
  '''

  in_region = dataframe[dataframe["region"] == selected_region]
  out_data = in_region.groupby("countries")[response].mean()

  return out_data


def plot_save_Region_correlation(var_1, var_2, region):
  '''
  Plot two per-country variables for one region and save the figure as a PNG.

  INPUT
    var_1 - first variable to plot (drawn in red, labelled "Human Freedom Score").
    var_2 - second variable to plot (drawn in indigo, labelled "Personal woman's score").
    region - name of the region; used in the title and in the file name.
  OUTPUT
    None. The figure is written to
    ./results/women's_freedom_correlated_with_freedom_score_for_<region>.png

  The ./results directory is created if it does not exist yet, so the function also
  works on a fresh checkout. The figure is always closed, even when saving fails, so
  calling the function in a loop does not accumulate open figures.
  '''
  # Local import: the surrounding file does not import pathlib.
  from pathlib import Path

  save_name = "women's_freedom_correlated_with_freedom_score_for_" + region
  results_dir = Path('./results')
  results_dir.mkdir(parents = True, exist_ok = True)

  fig, ax = plt.subplots(figsize = (10,8))
  try:
    # Extra bottom margin leaves room for the rotated country names.
    fig.subplots_adjust(bottom = 0.2)

    ax.plot(var_1, linestyle = 'solid', color = 'r')
    ax.plot(var_2, linestyle = 'solid', color = '#4b0082')
    ax.set_ylabel("variable score")
    ax.set_xlabel("country")
    ax.tick_params(axis = 'x', labelrotation = 90)
    ax.legend(("Human Freedom Score", "Personal woman's score"), loc = 'lower right')
    ax.grid(True)

    ax.set_title("The average personal women's freedom score aggregated for the top correlated factors. \n Region = " + region)

    fig.savefig(results_dir / (save_name + '.png'))
  finally:
    plt.close(fig)


def correlations_top_bottom(dataframe, region_select, var, n):
  '''
  Print the factors most correlated and most anti-correlated with a variable.

  INPUT
    dataframe - the processed dataframe, must contain "region".
    region_select - the selected region; only rows of this region are used.
    var - the (numeric) response variable to check correlations against.
    n - number of variables to print at each end.
  OUTPUT
    None. Two Series are printed:
    Print top - the n highest correlations with var, in ascending order
    (var itself, with correlation 1, is included).
    Print bottom - the n lowest (most anti-correlated) correlations with var.

  Only numeric and boolean columns take part in the correlation. String columns
  such as "countries" and "region" are excluded explicitly, because pandas >= 2.0
  raises an error instead of silently dropping them. Factors whose correlation is
  undefined (NaN, e.g. a constant column) are left out so they cannot appear in
  the top list.
  '''

  region_rows = dataframe[dataframe["region"] == region_select]
  correlations = region_rows.select_dtypes(include = ["number", "bool"]).corr()
  sorted_corr = correlations[var].dropna().sort_values()

  # tail/head behave sensibly for n == 0 (empty result), unlike [-0:] which returns everything.
  Top = sorted_corr.tail(n)
  print(Top)
  Bottom = sorted_corr.head(n)
  print(Bottom)


def variable_per_region(selected_region, var, response, dataframe):
  '''
  Mean of the response variable for each distinct value of another variable,
  within a single region.

  INPUT
    selected_region - region of interest; rows are selected where the "region"
    column equals this value.
    var - correlated variable of interest; its distinct values become the index
    of the result.
    response - the original variable it was checked for correlations against.
    dataframe - the dataset; must contain "region", var and response.
  OUTPUT
    out_data - Series indexed by the values of var, holding the mean of response
    over the rows of the selected region that share that value.

  The result can be plotted to inspect the relationship between the correlated
  pair before any modelling.

  The response column is selected before aggregating. Averaging the whole frame
  first would also try to average the string columns, which raises a TypeError
  on pandas >= 2.0.
  '''

  in_region = dataframe[dataframe["region"] == selected_region]
  out_data = in_region.groupby(var)[response].mean()

  return out_data



In [2]:
'''
RUN file for Udacity Data science nanodegree: Project One - Write a Data Science Blog Post
Author: Hannah Costa

The data chosen for this assignment was the Human Freedom Index data available on Kaggle.
LINK: https://www.kaggle.com/gsutters/the-human-freedom-index

The Human Freedom Index data combines measures of economic and personal freedom to assign a total 
Human Freedom score to a country and rank it based on this score. The survey has been 
conducted since 2008 for the earliest definition, and the most recent data available is 
from 2019, which is used in this assignment. 

The data available isn't the raw data from the Freedom project, and is made up of the economic 
and personal freedom factors that are all on a scored scale between 0-10. The Human Freedom 
report offers some insight into the meanings behind these factors, but their exact definitions, 
and what each score directly translates to numerically, are unknown.

For example the pf_ss_women_fgm factor relates to Female Genital Mutilations in a country, but 
the score doesn't tell me if it's based on number of cases reported per population percentage.
'pf' in a factor means that it's used to create a Personal Freedom Factor and 'ss' means it's 
in the 'Security and Safety' category. 

The freedom indicators covered by the report are:

Rule of Law
Security and Safety
Movement
Religion
Association, Assembly, and Civil Society
Expression and Information
Identity and Relationships
Size of Government
Legal System and Property Rights
Access to Sound Money
Freedom to Trade Internationally
Regulation of Credit, Labor, and Business

This Run file will call functions and scripts with the purpose of reading in the data and 
processing it so it can be used to explore correlations and insights in the data while
focusing on the implications of women's freedom. The author is using python 3.8.5 in Jupyter notebook.

'''
# Call the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns


# Load the Human Freedom Index CSV into the working dataframe.
HFI_CSV_PATH = './data/datasets_93172_883723_hfi_cc_2019.csv'
df = pd.read_csv(HFI_CSV_PATH)
# Preview the first rows to confirm the file was read. note: 120 columns expected
print(df.head())


   year ISO_code  countries                         region hf_score hf_rank  \
0  2017      ALB    Albania                 Eastern Europe     7.84      38   
1  2017      DZA    Algeria     Middle East & North Africa     4.99     155   
2  2017      AGO     Angola             Sub-Saharan Africa      5.4     151   
3  2017      ARG  Argentina  Latin America & the Caribbean     6.86      77   
4  2017      ARM    Armenia        Caucasus & Central Asia     7.42      54   

  hf_quartile pf_rol_procedural pf_rol_civil pf_rol_criminal  ...  \
0           1               6.7          4.5             4.7  ...   
1           4                 -            -               -  ...   
2           4                 -            -               -  ...   
3           2               7.1          5.8             4.3  ...   
4           2                 -            -               -  ...   

  ef_regulation_business_adm ef_regulation_business_bureaucracy  \
0                        6.3               

In [3]:
# The dataset is already processed: the overall scores, ranks and quartile are generated
# from the more basic factors, so they are removed (together with the ISO code) before analysis.
derived_columns = ["ISO_code", "ef_score","pf_score", "pf_rank", "ef_rank", "hf_rank", "hf_quartile"]
df = df.drop(columns = derived_columns)

In [4]:
# The whole dataset is read in as "object" columns even though most of the data are numerical scores.
# Convert every column that can be converted to float so that numerical analysis is possible.
# convert_float modifies df in place and prints one line for each column it could not convert
# (expected here: the text columns "countries" and "region").

convert_float(df)

Could not convert countries
Could not convert region


In [5]:

# Identify the columns in which more than 25% of the values are missing.
col_list = most_missing_col(df, 0.25)

# Show the list so it is clear which columns are about to be removed.
print(col_list)

# Remove those sparse columns from the working dataframe.
df = df.drop(columns = col_list)

['pf_rol_procedural', 'pf_rol_civil', 'pf_rol_criminal', 'pf_ss_women_inheritance_widows', 'pf_ss_women_inheritance_daughters', 'pf_religion_estop_establish', 'pf_religion_estop_operate', 'pf_association_political_establish', 'pf_association_political_operate', 'pf_association_prof_establish', 'pf_association_prof_operate', 'pf_association_sport_establish', 'pf_association_sport_operate', 'pf_identity_legal']


In [6]:
# Distinct regions in a deterministic (alphabetical) order. Missing region labels are
# dropped: a NaN region matches no rows and cannot be used to build a file name.
Region_list = sorted(df["region"].dropna().unique())

# Check that Region_list has populated correctly
# print(Region_list)

# Loop over Region_list to get the per-country average hf_score and pf_ss_women score, and plot
# them together for each region. The region name is part of the file name, so the saved
# figures do not overwrite each other.

for region in Region_list:
  region_pf_women = country_per_region(region, "pf_ss_women", df)
  region_hf_score = country_per_region(region, "hf_score", df)
  plot_save_Region_correlation(region_hf_score, region_pf_women, region)

In [7]:
# correlations_top_bottom prints its results itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
correlations_top_bottom(df, "Sub-Saharan Africa", "pf_ss_women", 10)

ef_legal                         0.271014
ef_legal_protection              0.303193
ef_regulation_business_bribes    0.305410
ef_legal_military                0.309798
ef_legal_judicial                0.360162
hf_score                         0.380304
pf_ss                            0.637599
pf_ss_women_inheritance          0.759284
pf_ss_women_fgm                  0.802604
pf_ss_women                      1.000000
Name: pf_ss_women, dtype: float64
ef_government_transfers       -0.349969
ef_trade_tariffs_sd           -0.246652
ef_government_consumption     -0.231973
ef_regulation_labor_firing    -0.200293
ef_legal_integrity            -0.194131
ef_regulation_labor_bargain   -0.151508
pf_association_assembly       -0.114447
pf_association_political      -0.099305
ef_legal_enforcement          -0.090885
ef_government_soa             -0.074465
Name: pf_ss_women, dtype: float64
None


In [8]:

# Closer look at the correlations for Sub-Saharan Africa.
# The factors below are taken from the correlation output of the previous cell: three of the
# positively correlated economic factors followed by the three most anti-correlated ones.
var_list = ["ef_legal_judicial", "ef_legal_military", "ef_regulation_business_bribes", "ef_government_transfers", "ef_trade_tariffs_sd", "ef_government_consumption"]

# Set up the figure first; the plot calls in the loop below draw onto this current figure.
plt.figure(figsize = (10,8))
plt.xlabel("variable score")
plt.ylabel("mean pf_ss_women score")
plt.title("The average personal women's freedom score aggregated for the top correlated factors. \n Sub-Saharan Africa")
plt.grid(True)

# For each factor, store the mean pf_ss_women score per distinct factor score (keyed by factor
# name) and draw it as unconnected points ('o').
d = {}
for var in var_list:
  d[var] = variable_per_region("Sub-Saharan Africa", var, "pf_ss_women", df)
  plt.plot(d[var], 'o')

# Legend entries follow the plotting order, i.e. the order of var_list.
plt.legend((var_list), loc ='lower right')
# NOTE: the ./results directory must already exist for the save to succeed.
plt.savefig('./results/scatter_correlations_SSA.png')

# Close the figure after saving so it is not kept open.
plt.close()

In [9]:
# Can women's freedom be modelled using only the economic factors?
# Rows without a pf_ss_women score cannot be used as training targets, so they are dropped.
df_modelling = df.dropna(subset = ["pf_ss_women"], axis = 0)

# Predictors: every economic-freedom factor (column names starting with "ef").
X = df_modelling.filter(regex = '^ef', axis = 1)
y = df_modelling["pf_ss_women"]

fill_mean = lambda col: col.fillna(col.mean())

# Replace missing predictor values with the column mean. One apply() call fills every
# column, so the columns that actually contained NaNs are recorded first for reporting.
# NOTE: the means are computed before the train/test split, so the test rows contribute
# to the imputed values; keep this in mind when interpreting the score.
cols_with_nan = [col for col in X if X[col].isnull().any()]
if cols_with_nan:
  X = X.apply(fill_mean, axis = 0)
  print("Replaced NaNs with the column mean in: " + ", ".join(cols_with_nan))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 15)
# The `normalize` argument was removed in scikit-learn 1.2. Ordinary least squares
# predictions do not depend on feature scaling, so a plain LinearRegression gives the same fit.
lm_model = LinearRegression()
lm_model.fit(X_train, y_train)

# score() returns the R^2 of the predictions on the held-out test set.
Score = str(lm_model.score(X_test, y_test, sample_weight = None))

print("The Model performed with a Score of " + Score)

ef_government_transfers worked as expected
The Model performed with a Score of 0.5401751775708902
