In [3]:
import pandas as pd
import numpy as np
from google.cloud import bigquery

In [4]:
%load_ext google.cloud.bigquery

from IPython.display import clear_output
from tqdm import tqdm
import time

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

# Global plot styling: seaborn's dark-grid theme with all font sizes scaled by 1.5
sns.set(font_scale=1.5, style='darkgrid')
# Keep seaborn's 'deep' colour palette in a variable
current_palette = sns.color_palette(palette='deep')


### Preparing the csv file containing Cultural Dimensions for all the countries

***Source:*** The original CSV file containing the cultural dimension data was downloaded from the [Geert Hofstede](https://geerthofstede.com/research-and-vsm/dimension-data-matrix/) website.
For more information about these cultural dimensions, please refer to the original publication: *Hofstede, G.H. (1980), Culture's Consequences: International Differences in Work-Related Values, Sage Publications, London.*

**List of Cultural Dimensions:**
1. Power Distance Index (PDI)
2. Individualism Index (IDV)
3. Masculinity Index (MAS)
4. Uncertainty Avoidance Index (UAI)
5. Long Term Orientation Index (LTO)
6. Indulgence versus Restraint Index (IVR)


**Important Note:** We modified the original dataset so that it can be matched against the largest possible group of countries. The modifications include:

1. Modifying the country codes to match with the standard ISO3166-1-Alpha-2 codes.
2. Modifying the cultural dimensions for the following countries:
    1. East Africa countries: The original dataset contains a single row with the cultural dimensions for East Africa as a whole. The author also provides a more detailed (yet incomplete) set of cultural dimensions for some of the individual countries in the East Africa region. We therefore decided to replace the values of the countries in the East Africa region with the values of the "East Africa" row.
    2. West Africa countries: Please refer to item 2.1 (East Africa countries) above.
    3. Arab countries: Please refer to item 2.1 (East Africa countries) above.
    4. Israel: Since Israel could be an important country and only one cultural dimension (Indulgence versus Restraint Index - IVR) is missing, we replaced this missing value with the average over all other countries, computed after removing nulls.


In [26]:
data_folder = './data/'

# Load the Hofstede table: one row per country/region with the six dimension scores.
# NOTE(review): the displayed head shows missing scores as the literal string '#NULL!',
# so the dimension columns are presumably read as text rather than numbers - confirm
# that a later step converts or removes these markers before any numeric use.
df_cultural = pd.read_csv(data_folder+'Hofstede_6_cultural_dimensions.csv',
                  usecols=['country_code', 'country', 'pdi','idv','mas','uai','lto','ivr'])

# Load the lookup from official English country name to ISO 3166-1 alpha-2 code.
# By default pandas parses the string "NA" as a missing value, which would silently
# erase the alpha-2 code "NA" (Namibia). Only empty fields are treated as missing here.
df_map = pd.read_csv(data_folder+"country_codes.csv",
                     usecols=['ISO3166-1-Alpha-2', 'official_name_en'],
                     keep_default_na=False, na_values=[''])\
           .rename(columns={'ISO3166-1-Alpha-2':'iso2'})
df_map = df_map.drop_duplicates()

# Outer join on the country name: keeps Hofstede rows that have no ISO match
# (e.g. regional rows such as "Africa East") as well as ISO countries without scores.
df_cultural = pd.merge(df_cultural, df_map, left_on='country', right_on='official_name_en', how='outer')
print(df_cultural.shape[0])
display(df_cultural.head(3))

# Saving the resulting dataframe
df_cultural.to_csv(data_folder+'6_cultural_dimensions_processed.csv', index=False, encoding='utf-8')

277


Unnamed: 0,country_code,country,pdi,idv,mas,uai,lto,ivr,iso2,official_name_en
0,AFE,Africa East,64,27,41,52,32,40,,
1,AFW,Africa West,77,20,46,54,9,78,,
2,ALB,Albania,#NULL!,#NULL!,#NULL!,#NULL!,61,15,AL,Albania


## Creating the BigQuery table

In [9]:
# Authentication: before running this cell, the environment variable
# "GOOGLE_APPLICATION_CREDENTIALS" must point to the JSON file holding a valid access key.
client = bigquery.Client()

# Destination table, addressed as <dataset>.<table>
dataset_id = 'data_preparation'
dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table('6_gdppc_country')

# Column layout expected in the CSV (all columns nullable, no descriptions, no nested fields).
# NOTE(review): the 'gdppc' field and the '6_gdppc_country' table name suggest a GDP load,
# while the source URI below points at a cultural-dimensions CSV - confirm that schema,
# table name and URI really belong together.
schema_Name_org = [
    bigquery.SchemaField('pdi', 'STRING', mode='NULLABLE', description=None, fields=()),
    bigquery.SchemaField('country_code', 'STRING', mode='NULLABLE', description=None, fields=()),
    bigquery.SchemaField('gdppc', 'FLOAT', mode='NULLABLE', description=None, fields=()),
]

# Load-job settings: CSV input with one header row, tolerating up to 10 bad records
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.schema = schema_Name_org
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10

# Start the load from Cloud Storage; the job id gets the prefix 'lgs-'
load_job = client.load_table_from_uri(
    source_uris='gs://uspto-data/data_preparation/8_cultural_dimensions_processed_null_removed.csv',
    destination=table_ref,
    job_id_prefix='lgs-',
    job_config=job_config,
)
# Wait for the job to finish (last expression, so its return value is shown as the cell output)
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x25e76cd3a58>