In [118]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re, os, pickle

In [119]:
dataset_folder = "C:\\Users\\Marios\\Documents\\GitHub\\Thomas Prosser project\\datasets\\"
output_folder = "C:\\Users\\Marios\\Documents\\GitHub\\Thomas Prosser project\\output\\"

In [120]:
def get_dta_header_labels(filepath):

    reader = pd.io.stata.StataReader( filepath )
    header = reader.variable_labels()
    labels = reader.value_labels()
    return header,labels

In [376]:
group_to_type_dict = {'party_names':'obs_vars','country_variables':'obs_vars','observation_variables':'obs_vars',
 'party_id':'obs_vars',
 'source_variables':'dataset_vars',
 'party_electoral':'elec_vars',
 'dummied_variables':'dummy_vars',
 'dummiable_variables':'dropped',
 'policy_dimensions':'expert_op', 'ideological_dimensions':'expert_op',
       'eu_policy_questions':'expert_op', 'eu_integration_admission':'expert_op',
       'leadership_divisions':'expert_op', 'ukraine_specific_questions':'expert_op',
       'party_characteristics':'expert_op', 'eu_membership_requirements':'expert_op',
       'immigration_integration_pos':'man_cod',
 'index_jahn_lr':"id_dim",'index_jahn_gg':"id_dim",'index_jahn_oberst_cohesion':"id_dim",
 'index_franzmann_kaiser':"id_dim",'index_budge':"id_dim",'adhoc_index':"id_dim",
        "man_cod":"man_cod","coder_ratings":"coder_ratings",
        "computed_values":"id_dim","parlgov_index":"id_dim",
        "notes":"notes",
                           
                           }

In [377]:
partyfacts_core_parties = pd.read_csv(dataset_folder+"PartyFacts"+os.sep+ "partyfacts-core-parties.csv")
partyfacts_external_parties = pd.read_csv(dataset_folder+"PartyFacts"+os.sep+ "partyfacts-external-parties.csv")
partyfacts_countries = pd.read_csv(dataset_folder+"PartyFacts"+os.sep+ "partyfacts-countries.csv")
partyfacts_datasets = pd.read_csv(dataset_folder+"PartyFacts"+os.sep+ "partyfacts-datasets.csv")

pf_country_to_code = partyfacts_countries[["country","name"]].set_index("name")["country"].apply(lambda x: x.lower())

In [378]:
shared_columns = ["partyfacts_id","year","edate","dataset","dataset_index","country"]
shared_description = {"partyfacts_id":"PartyFacts party id","year":"observative year","edate":"observation full date",
                      "dataset":"Source dataset (partyfacts schema when present)","dataset_index":"Source dataset row index","country":"ISO 3166-1 alpha-3 country codes"}

In [379]:
CHES_comb = pd.read_pickle(dataset_folder+"CHES"+os.sep+"CHES_comb_linked.zip",compression='zip')

CHES_comb[[x for x in CHES_comb.columns if "intgroup__" in x]] = CHES_comb[[x for x in CHES_comb.columns if "intgroup__" in x]].astype(float)
CHES_comb[["year","partyfacts_id"]].isnull().sum(),CHES_comb.shape

dataset = "ches"
ds_brac = "("+dataset+")"
CHES_comb["dataset"] = dataset
CHES_comb["dataset_index"] = CHES_comb.index

CHES_comb = CHES_comb.rename(columns={x:ds_brac+x for x in CHES_comb.columns if x not in shared_columns})

CHES_vl = pd.read_csv(dataset_folder+"CHES"+os.sep+"CHES_comb_variable_list.csv")
CHES_vl.columns = ["variable","group"]
    
CHES_vl["type"] = CHES_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
CHES_vl.loc[CHES_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

CHES_vl["integrated_variable"] = CHES_vl.apply(lambda x: ds_brac+x["variable"] if x["type"] != "cross-dataset" else x["variable"], axis=1)

description = shared_description.copy()
CHES_vl["description"] = CHES_vl["variable"].apply(lambda x: description[x] if x in description.keys() else np.nan )

CHES_vl.loc[CHES_vl.index.max()+1] = ["dataset_index","source_variables","dataset_vars","dataset_index",np.nan]


In [380]:
manifesto_project = pd.read_pickle(dataset_folder+"CMP"+os.sep+ "manifesto_project_linked.zip",compression='zip')

manifesto_project["year"] = manifesto_project["edate"].dt.year

cmp_country_replace_dict = {"North Macedonia":"Macedonia","Bosnia-Herzegovina":"Bosnia and Herzegovina","South Korea":"Korea (South)"}
manifesto_project["country"] = manifesto_project["countryname"].replace(cmp_country_replace_dict).apply(lambda x: pf_country_to_code.loc[x])

cmp_family_replace_dict = {'eco ecologist':'green', 'lef socialist or other left':'rad left', 'soc social democratic':'socialist',
       'lib liberal':'liberal', 'chr christian democrat':'christdem', 'con conservative':'cons',
       'nat nationalist':'rad right', 'agr agrarian':'agrarian', 'eth ethnic-regional':'regional',
       'sip special issue':'special issue', 'div divers alliance':'coalition', 'mi missing information':np.nan}

# for more consistency with other datasets
manifesto_project = manifesto_project.rename(columns={"parfam":"family"})

family_col = "family"
manifesto_project[family_col] = manifesto_project[family_col].replace(cmp_family_replace_dict)
dummies = pd.get_dummies(manifesto_project[family_col] ,prefix=family_col, prefix_sep='__')
dummies.loc[manifesto_project[family_col].isnull(),:] = np.nan
manifesto_project[dummies.columns] = dummies



dataset = "manifesto"
ds_brac = "("+dataset+")"
manifesto_project["dataset"] = dataset
manifesto_project["dataset_index"] = manifesto_project.index
manifesto_project = manifesto_project.rename(columns={x:ds_brac+x for x in manifesto_project.columns if x not in shared_columns})

manifesto_project[["year","partyfacts_id"]].isnull().sum(),manifesto_project.shape
# could dummy political families, normalise numerical columns

(year             0
 partyfacts_id    6
 dtype: int64,
 (4582, 188))

In [381]:
cmp_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable","description"])
cmp_vl["integrated_variable"] = manifesto_project.columns
cmp_vl["variable"] = cmp_vl["integrated_variable"].apply(lambda x: x.replace(ds_brac,""))

cmp_var_to_group_dict = {
       'country':'country_variables', 'countryname':'country_variables', 'oecdmember':'country_variables', 'eumember':'country_variables',
       'edate':'observation_variables', 'date':'observation_variables',
       'party':'party_id', 'partyname':'party_names', 'partyabbrev':'party_names',
       'family':'dummiable_variables', 'coderid':'observation_variables',
       'manual':'observation_variables', 'coderyear':'observation_variables', 'testresult':'observation_variables', 'testeditsim':'observation_variables',
       'pervote':'party_electoral', 'voteest':'party_electoral', 'presvote':'party_electoral', 'absseat':'party_electoral', 'totseats':'party_electoral',
       'progtype':'observation_variables',  'datasetorigin':'observation_variables', 'corpusversion':'observation_variables', 'total':'observation_variables',
       'rile':'computed_values', 'planeco':'computed_values', 'markeco':'computed_values', 'welfare':'computed_values', 'intpeace':'computed_values',
       'datasetversion':'source_variables', 'partyfacts_id':'party_id', 'year':'observation_variables', 'dataset':'source_variables',
       'dataset_index':'source_variables',
 }

cmp_var_to_group_dict.update({x.replace(ds_brac,""):"man_cod" for x in manifesto_project.columns if re.search("peruncod|per\d+",x)})
cmp_var_to_group_dict.update({x.replace(ds_brac,""):"dummied_variables" for x in manifesto_project.columns if "family__" in x})

# pf_core_vl["description"] = pf_core_vl["variable"].apply(lambda x: pf_core_var_description[x] if x in pf_core_var_description.keys() else np.nan )    
cmp_vl["group"] = cmp_vl["variable"].apply(lambda x: cmp_var_to_group_dict[x] if x in cmp_var_to_group_dict.keys() else np.nan )
cmp_vl["type"] = cmp_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
cmp_vl.loc[cmp_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

header,labels = get_dta_header_labels(dataset_folder+"CMP"+os.sep+ "MPDataset_MPDS2020a_stata14.dta")

cmp_vl["description"] = cmp_vl["variable"].apply(lambda x: header[x] if x in header.keys() else np.nan )    

cmp_vl["description"] = cmp_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )



In [382]:
pip_ideol = pd.read_pickle( dataset_folder+"PIP"+os.sep+"ideological_indices_linked.zip" , compression = 'zip' )

pip_ideol = pip_ideol.rename(columns = {"Year":"year"})
pip_ideol.loc[pip_ideol["year"].isnull(),"year"]= 2017

pip_ideol["year"] = pip_ideol["year"].astype(int)

pip_ideol = pip_ideol.rename(columns = {"Quarter":"edate","CMP party code":"CMP party name",})
pip_ideol = pip_ideol.drop('party family membership',axis=1) ## it's just the CMP family code


pip_country_replace_dict = {"Great Britain":"United Kingdom","USA":"United States"}
pip_ideol["country"] = pip_ideol["Country name"].replace(pip_country_replace_dict).apply(lambda x: pf_country_to_code.loc[x])

dataset = "pip"
ds_brac = "("+dataset+")"
pip_ideol["dataset"] = dataset
pip_ideol["dataset_index"] = pip_ideol.index


pip_ideol = pip_ideol.rename(columns={x:ds_brac+x for x in pip_ideol.columns if x not in shared_columns})

pip_ideol[["year","partyfacts_id"]].isnull().sum(),pip_ideol.shape
# dummy party families, normalize

(year             0
 partyfacts_id    0
 dtype: int64,
 (3953, 48))

In [383]:
pip_ideol_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable","description"])

pip_ideol_vl["integrated_variable"] = pip_ideol.columns

pip_ideol_vl["variable"] = pip_ideol_vl["integrated_variable"].apply(lambda x: x.replace(ds_brac,""))

pip_ideol_var_to_group_dict = {
    'Unique ID for each observation (ISO.Quarter.Party)':'observation_variables',
    'Numeric country code (ISO)':'country_variables', 'Country name':'country_variables',
    'year':'observation_variables', 'edate':'observation_variables',
    'Consecutive number of the quarters':'party_electoral',
    'Dummy: 1 = Government party':'party_electoral', 'Existence of a party':'party_electoral',
    'CMP party name':'party_names', 'Name of party (english)':'party_names', 'party abbreviation':'party_names',
    'Party code according to CMP':'party_id', 'Name of the government party':'party_names',
    'partyfacts_id':"party_id",'partyfacts_id_cmp':"party_id", 'country':'country_variables', 'dataset':'source_variables',
    'Index Jahn: LR':"index_jahn_lr", 'Index Jahn: LR Importance':"index_jahn_lr",
    'Index Jahn: LR Core':"index_jahn_lr", 'Index Jahn: LR Core Importance':"index_jahn_lr",
    'Index Jahn: LR Plus':"index_jahn_lr", 'Index Jahn: LR Plus Importance':"index_jahn_lr",
    'Index Jahn: LR (Unstandardized)':"index_jahn_lr",
    'Index Jahn: LR Core (Unstandardized)':"index_jahn_lr",
    'Index Jahn: LR Plus (Unstandardized)':"index_jahn_lr",
    'Index Jahn: GG':"index_jahn_gg",'Index Jahn: GG Importance':"index_jahn_gg",
    'Index Jahn: GG Core':"index_jahn_gg",
    'Index Jahn: GG Core Importance':"index_jahn_gg", 'Index Jahn: GG Plus':"index_jahn_gg",
    'Index Jahn: GG Plus Importance':"index_jahn_gg",
    'Index Jahn: GG (Unstandardized)':"index_jahn_gg",
    'Index Jahn: GG Core (Unstandardized)':"index_jahn_gg",
    'Index Jahn: GG Plus (Unstandardized)':"index_jahn_gg",
    'Index Jahn/Oberst: LR Cohesion':"index_jahn_oberst_cohesion",
    'Index Jahn/Oberst: GG Cohesion':"index_jahn_oberst_cohesion",
    'Index Franzmann/Kaiser: Left-Right':"index_franzmann_kaiser",
    'Index Franzmann/Kaiser: Economic Dimension':"index_franzmann_kaiser",
    'Index Franzmann/Kaiser: Non-Economic Dimension':"index_franzmann_kaiser",
    'Index Budge et al.: RILE':"index_budge",
    'Index Budge et al.: Planned Economy':"index_budge",
    'Index Budge et al.: Market Economy':"index_budge",
    'Index Budge et al.: Welfare':"index_budge",
    'Index Budge et al.: International Peace':"index_budge",
    'Ad hoc-Index: Pro/Anti EU (Warntjen, Hix, Crombez 2008)':"adhoc_index",
    'Ad hoc-Index: GG light (Knill)':"adhoc_index",
    'dataset_index':'source_variables',
                          }


pip_ideol_var_description = {
 
}



 # country and year excepted as a shared variables
# PImPo_vl["description"] = PImPo_vl["variable"].apply(lambda x: pimpo_var_description[x] if x in pimpo_var_description.keys() else np.nan )    
pip_ideol_vl["group"] = pip_ideol_vl["variable"].apply(lambda x: pip_ideol_var_to_group_dict[x] if x in pip_ideol_var_to_group_dict.keys() else np.nan )
pip_ideol_vl["type"] = pip_ideol_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
pip_ideol_vl.loc[pip_ideol_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"
pip_ideol_vl["description"] = pip_ideol_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )


In [384]:
euromanifesto = pd.read_pickle( dataset_folder+"Euromanifesto"+os.sep+"Euromanifesto_linked.zip" , compression = 'zip' )

euromanifesto["year"] = euromanifesto["year"].astype(int)

euromanifesto[[x for x in euromanifesto.columns if "per_" in x]] = euromanifesto[[x for x in euromanifesto.columns if "per_" in x]].replace("Unavailable",np.nan).astype(float)
# euromanifesto.dtypes.value_counts()
subregion_list = ["Flanders","Wallonia","Great Britain","Northern Ireland"]
euromanifesto["region"] = euromanifesto["region"].apply(lambda x: np.nan if x not in subregion_list else x)
euromanifesto = euromanifesto.rename(columns = {"region":"country_sub_region"})

emp_country_replace_dict = {"Europe":"European Union","UK":"United Kingdom","The Netherlands":"Netherlands"}
euromanifesto["country"] = euromanifesto["country"].replace(emp_country_replace_dict).apply(lambda x: pf_country_to_code.loc[x])

euromanifesto[["total","emcs_id"]] = euromanifesto[["total","emcs_id"]].astype(int)
euromanifesto[["rile_mrg","planeco","markeco","welfare","pro_anti_EU"]] = euromanifesto[["rile_mrg","planeco","markeco","welfare","pro_anti_EU"]].astype(float)

float_list = ["EPseats","marpor","cphl","eep","member","EPvote","EPseatsum",
              'left', 'environ', 'liberta', 'religious', 'state', 'multicult',
              'integration', 'pubservice', 'redistribut', 'liberties', 'lifestyle',
              'immigration',"NATvote","NATseats"]

replace_dict = {"Inapplicable":np.nan,"Unavailable":np.nan, # currently unclear if there's any point in distinguishing
     "No EES code available":np.nan,"Unknown":np.nan,"No answer":np.nan,"Did not compete":np.nan,
     'No answer':np.nan,'Libertarian':1,'Authoritarian':10,'Religious':1,'Secular':10,
     'Free enterprise':1,'State interventionism':10,'Multiculturalism':1,'Ethnocentrism':10,
     'Pro EU-integration':1,'Anti EU-integration':10,'Raising taxes':1,'Cut taxes':10,
     'Fully in favour of redistribution from the rich to the poor':1,'Fully opposed to redistribution from the rich to the poor':10,
     'Civil liberties':1,'Law and Order':10,'Strong support for liberal policies':1,'Strong refusal of liberal policies':10,
     'Fully in favour of immigration':1,'Fully opposed to immigration':10,
     'Environmental protection':1,'Economic growth':10,'Left':1,'Right':10,                
}

euromanifesto[float_list] = euromanifesto[float_list].replace(replace_dict).astype(float)

## I think you should dummy pfamily, mfamily, ofamily
# but maybe, post-checking, expect mfamily to get dropped?

emp_family_replace_dict = {'Unavailable':np.nan, 'Unknown':np.nan, 'Coalition':'coalition', 'Ecologist':'green', 'Communist':'rad left',
       'Social Democratic':'socialist', 'Liberal':'liberal', 'Christian Democratic':'christdem', 'Conservative':'cons',
       'Nationalist':'rad right', 'Agrarian':'agrarian', 'Ethnic-regional':'regional', 'Special Issue':'special issue',
                           
       'Unknown':np.nan, 'Green Parties':'green', '(Post-)Communist Parties':'(post-)communist',
       'Social Democratic Parties':'socialist', 'Liberal Parties':'liberal',
       'Christian Democratic Parties':'christdem', 'Conservative Parties':'cons',
       'Nationalist Parties':'rad right', 'Agrarian Parties':'agrarian', 'Regional Parties':'regional',
       'Special Interest Parties':'special issue',
                           
       'Other':np.nan, 'Green Parties':'green', '(Post-)Communist Parties':'(post-)communist',
       'Social Democratic Parties':'socialist', 'Liberal Parties':'liberal',
       'Christian Democratic Parties':'christdem', 'Conservative Parties':'cons',
       'Nationalist Parties':'rad right', 'Agrarian Parties':'agrarian', 'Regional Parties':'regional',
       'Special Interest Parties':'special issue',
        }

family_col = "pfamily"
euromanifesto[family_col] = euromanifesto[family_col].replace(emp_family_replace_dict)
dummies = pd.get_dummies(euromanifesto[family_col] ,prefix=family_col, prefix_sep='__')
dummies.loc[euromanifesto[family_col].isnull(),:] = np.nan
euromanifesto[dummies.columns] = dummies

family_col = "ofamily"
euromanifesto[family_col] = euromanifesto[family_col].replace(emp_family_replace_dict)
dummies = pd.get_dummies(euromanifesto[family_col] ,prefix=family_col, prefix_sep='__')
dummies.loc[euromanifesto[family_col].isnull(),:] = np.nan
euromanifesto[dummies.columns] = dummies

family_col = "mfamily"
euromanifesto[family_col] = euromanifesto[family_col].replace(emp_family_replace_dict)
dummies = pd.get_dummies(euromanifesto[family_col] ,prefix=family_col, prefix_sep='__')
dummies.loc[euromanifesto[family_col].isnull(),:] = np.nan
euromanifesto[dummies.columns] = dummies


dataset = "ees14"
ds_brac = "("+dataset+")"
euromanifesto["dataset"] = dataset
euromanifesto["dataset_index"] = euromanifesto.index
euromanifesto = euromanifesto.rename(columns={x:ds_brac+x for x in euromanifesto.columns if x not in shared_columns})

euromanifesto[["year","partyfacts_id"]].isnull().sum(), euromanifesto.shape

(year              0
 partyfacts_id    77
 dtype: int64,
 (977, 805))

In [385]:
emp_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable","description"])
emp_vl["integrated_variable"] = euromanifesto.columns
emp_vl["variable"] = emp_vl["integrated_variable"].apply(lambda x: x.replace("("+dataset+")",""))

emp_var_to_group_dict = {
            
        'za_nr':'source_variables', 'version':'source_variables', 'doi':'source_variables',
        'partyname':'party_names', 'initials':'party_names',
        'year':'observation_variables','country_year':'observation_variables','emcs_year':'observation_variables',
        'country':'country_variables','country_sub_region':'country_variables',
        'EPseats':'party_electoral','EPvote':'party_electoral', 'EPseatsum':'party_electoral',
            'NATvote':'party_electoral', 'NATseats':'party_electoral',
            'eep':'party_electoral','member':'party_electoral',
            'gov':'party_electoral', 'pm':'party_electoral',
        'emcs':'party_id','ees':'party_id', 'marpor':'party_id', 'cphl':'party_id',
        'partyinfo':'party_electoral',
        'pfamily':'dummiable_variables', 'mfamily':'dummiable_variables', 'ofamily':'dummiable_variables',
        'group':'dummiable_variables','manif':'observation_variables',
        
       'left':'coder_ratings', 'environ':'coder_ratings','liberta':'coder_ratings', 
       'religious':'coder_ratings', 'state':'coder_ratings', 'multicult':'coder_ratings', 'integration':'coder_ratings',
       'pubservice':'coder_ratings', 'redistribut':'coder_ratings', 'liberties':'coder_ratings', 'lifestyle':'coder_ratings',
       'immigration':'coder_ratings',
       'rile_mrg':'computed_values', 'planeco':'computed_values', 'markeco':'computed_values', 'welfare':'computed_values',
       'pro_anti_EU':'computed_values', 'total':'observation_variables',
                         
       'ees_id':'party_id', 'ees_party':'party_names', 'emcs_id':'party_id', 'emcs_party':'party_names',
       'partyfacts_id_ees_id':'party_id', 'partyfacts_id_emcs':'party_id', 'partyfacts_id_cmp':'party_id',
       'partyfacts_id_ches':'party_id', 'partyfacts_id':'party_id', 'dataset':'source_variables',
       'dataset_index':'source_variables',
                                       
}

emp_var_to_group_dict.update({x.replace(ds_brac,""):"man_cod" for x in euromanifesto.columns if "per_" in x})
emp_var_to_group_dict.update({x.replace(ds_brac,""):"dummied_variables" for x in euromanifesto.columns if "family__" in x})

# pf_core_vl["description"] = pf_core_vl["variable"].apply(lambda x: pf_core_var_description[x] if x in pf_core_var_description.keys() else np.nan )    
emp_vl["group"] = emp_vl["variable"].apply(lambda x: emp_var_to_group_dict[x] if x in emp_var_to_group_dict.keys() else np.nan )
emp_vl["type"] = emp_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
emp_vl.loc[emp_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

header,labels = get_dta_header_labels(dataset_folder+"Euromanifesto"+os.sep+ "ZA5102_v2-0-0.dta")

emp_vl["description"] = emp_vl["variable"].apply(lambda x: header[x] if x in header.keys() else np.nan )  

emp_vl["description"] = emp_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )

In [386]:
pf_core = partyfacts_core_parties[['country', 'partyfacts_id',  'name_short', 'name',
       'name_english', 'name_other', 'year_first', 'year_last', 'share',
       'share_year', 'new', 'wikipedia', 'description', 'comment', 'created',
       'modified']].copy()

pf_core["country"] = pf_core["country"].apply(lambda x: x.lower())

dataset = "pf"
ds_brac = "("+dataset+")"
pf_core["dataset"] = dataset

pf_core["dataset_index"] = pf_core.index

pf_core = pf_core.rename(columns={x:ds_brac+x for x in pf_core.columns if x not in shared_columns})


pf_core_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable","description"])
pf_core_vl["integrated_variable"] = pf_core.columns
pf_core_vl["variable"] = pf_core_vl["integrated_variable"].apply(lambda x: x.replace("("+dataset+")",""))

pf_core_var_to_group_dict = {'country':'country_variables', 'partyfacts_id':'party_id', 'dataset':'source_variables',
       'name_short':'party_names', 'name':'party_names','name_english':'party_names', 'name_other':'party_names',
       'year_first':'party_electoral', 'year_last':'party_electoral', 'share':'party_electoral',
       'share_year':'party_electoral', 'new':'notes', 'wikipedia':'party_id',
       'description':'notes', 'comment':'notes',
       'created':'source_variables','modified':'source_variables','dataset_index':'source_variables',
            
}
# could change these to vote/vote_year


pf_core_var_description = {"country":"ISO-code of the country",
"partyfacts_id":"Party Facts identification number",
"name_short":"Common abbreviation of party",
"name":"Name of party in language of origin",
"name_english":"Name of party in English",
"name_other":"Party name in languages with non-latin letters",
"year_first":"Foundation of the party",
"year_last":"Dissolvement of the party (blank if still existing)",
"share":"Result in a parliamentary election",
"share_year":"Year of the election result",
"new":"Genuinely new party",
"wikipedia":"Link to the respective Wikipedia homepage",
"description":"For further description",
"comment":"Section for comments",
"created":"Date of creation in Party Facts",
"modified":"Date of last modification",}


 
 
 # country and year excepted as a shared variables
pf_core_vl["description"] = pf_core_vl["variable"].apply(lambda x: pf_core_var_description[x] if x in pf_core_var_description.keys() else np.nan )
pf_core_vl["group"] = pf_core_vl["variable"].apply(lambda x: pf_core_var_to_group_dict[x] if x in pf_core_var_to_group_dict.keys() else np.nan )
pf_core_vl["type"] = pf_core_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
pf_core_vl.loc[pf_core_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

pf_core_vl["description"] = pf_core_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )

In [387]:
parlgov_party = pd.read_pickle(dataset_folder+"PARLGOV"+os.sep+ "parlgov_party_linked.zip",compression='zip')


parlgov_party["country_name_short"] = parlgov_party["country_name_short"].apply(lambda x: x.lower())
parlgov_party = parlgov_party.rename(columns = {"country_name_short":"country"})

# dummy family and try to line it up with CHES family categories
# aligning to the ches family 'standard'
# note - socdem -> socialist and communist/socialist -> rad left, Right-wing -> rad right
# and no 'confessional' option just "special issue"
family_replace_dict = {'Liberal':'liberal', 'Conservative':'cons', 'Social democracy':'socialist', 'Right-wing':'rad right',
       'Special issue':'special issue', 'Communist/Socialist':'rad left', 'no family':'no family',
       'Christian democracy':'christdem', 'Green/Ecologist':'green', 'Agrarian':'agrarian', 'to be coded':np.nan}
parlgov_party["family_name"] = parlgov_party["family_name"].replace(family_replace_dict)
#family__no family
dummies = pd.get_dummies(parlgov_party["family_name"] ,prefix="family", prefix_sep='__')
dummies.loc[parlgov_party["family_name"].isnull(),:] = np.nan
parlgov_party[dummies.columns]  =dummies
dataset = "parlgov"
ds_brac = "("+dataset+")"

parlgov_party["dataset"] = dataset
parlgov_party["dataset_index"] = parlgov_party.index
parlgov_party = parlgov_party.rename(columns={x:ds_brac+x for x in parlgov_party.columns if x not in shared_columns})

parlgov_party[["partyfacts_id"]].isnull().sum(), parlgov_party.shape

(partyfacts_id    279
 dtype: int64,
 (1651, 39))

In [388]:
parlgov_party_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable","description"])
parlgov_party_vl["integrated_variable"] = parlgov_party.columns

parlgov_party_vl["variable"] = parlgov_party_vl["integrated_variable"].apply(lambda x: x.replace(ds_brac,""))

parlgov_party_var_to_group_dict = {'country':'country_variables', 'country_name':'country_variables',
       'party_name_short':'party_names','party_name_english':'party_names', 'party_name':'party_names', 'party_name_ascii':'party_names',
       'family_name_short':'dummiable_variables', 'family_name':'dummiable_variables', 'family_id':'dummiable_variables',
       'left_right':'parlgov_index', 'state_market':'parlgov_index', 'liberty_authority':'parlgov_index', 'eu_anti_pro':'parlgov_index',
       'cmp':'party_id', 'euprofiler':'party_id', 'ees':'party_id',
       'castles_mair':'party_id', 'huber_inglehart':'party_id', 'ray':'party_id', 'benoit_laver':'party_id', 'chess':'party_id',
       'country_id':'country_variables', 'party_id':'party_id', 'partyfacts_id':'party_id',
       'partyfacts_id_cmp':'party_id', 'partyfacts_id_ches':'party_id', 'partyfacts_id_ees':'party_id',
       'dataset':'source_variables','dataset_index':'source_variables',
       'family__agrarian':'dummied_variables', 'family__christdem':'dummied_variables', 'family__cons':'dummied_variables',
       'family__green':'dummied_variables', 'family__liberal':'dummied_variables', 'family__no family':'dummied_variables',
       'family__rad left':'dummied_variables', 'family__rad right':'dummied_variables', 'family__socialist':'dummied_variables',
       'family__special issue':'dummied_variables',
                                  }
    

            

# could change these to vote/vote_year

parlgov_variable = pd.read_pickle(dataset_folder+"PARLGOV"+os.sep+ "parlgov_variable.zip",compression='zip')
parlgov_party_var_description = parlgov_variable[parlgov_variable["table_name"]=="view_party"].set_index("name")["description"].to_dict()


 
 
 # country and year excepted as a shared variables
parlgov_party_vl["description"] = parlgov_party_vl["variable"].apply(lambda x: parlgov_party_var_description[x] if x in parlgov_party_var_description.keys() else np.nan )    
parlgov_party_vl["group"] = parlgov_party_vl["variable"].apply(lambda x: parlgov_party_var_to_group_dict[x] if x in parlgov_party_var_to_group_dict.keys() else np.nan )
parlgov_party_vl["type"] = parlgov_party_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
parlgov_party_vl.loc[parlgov_party_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

parlgov_party_vl["description"] = parlgov_party_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )

In [389]:

parlgov_cabinet = pd.read_pickle( dataset_folder+"PARLGOV"+os.sep+ "parlgov_cabinet_linked.zip" ,compression='zip')

parlgov_cabinet[["election_date","start_date"]] = parlgov_cabinet[["election_date","start_date"]].apply(lambda x: pd.to_datetime(x))
parlgov_cabinet["year"] = parlgov_cabinet["start_date"].dt.year



# parlgov_cabinet[["year","partyfacts_id"]].isnull().sum(), parlgov_cabinet.shape

In [390]:
parlgov_election = pd.read_pickle(dataset_folder+"PARLGOV"+os.sep+ "parlgov_election_linked.zip",compression='zip')

parlgov_election["election_date"] = pd.to_datetime(parlgov_election["election_date"])
parlgov_election["year"] = parlgov_election["election_date"].dt.year
parlgov_election[["year","partyfacts_id"]].isnull().sum(), parlgov_election.shape

(year               0
 partyfacts_id    722
 dtype: int64,
 (8623, 18))

In [391]:
PImPo = pd.read_pickle( dataset_folder+"CMP"+os.sep+"PImPo_linked.zip", compression='zip' )

PImPo["year"] = PImPo["edate"].dt.year
PImPo["country"] = PImPo["country"].apply(lambda x: x.title()).apply(lambda x: pf_country_to_code.loc[x])

dataset = "pimpo"
ds_brac = "("+dataset+")"
PImPo["dataset"] = dataset
PImPo["dataset_index"] = PImPo.index

# normalize saliency variables to 0-1
PImPo[[x for x in PImPo.columns if "saliency" in x]] = PImPo[[x for x in PImPo.columns if "saliency" in x]]/100

PImPo = PImPo.rename(columns={x:ds_brac+x for x in PImPo.columns if x not in shared_columns})


PImPo[["year","partyfacts_id"]].isnull().sum(), PImPo.shape

(year             0
 partyfacts_id    0
 dtype: int64,
 (242, 20))

In [392]:
# https://manifesto-project.wzb.eu/down/datasets/pimpo/PImPo_codebook.pdf

PImPo_vl = pd.DataFrame(columns = ["variable","group","type","integrated_variable"])

PImPo_vl["integrated_variable"] = PImPo.columns
PImPo_vl["variable"] = PImPo_vl["integrated_variable"].apply(lambda x: x.replace(ds_brac,""))


pimpo_var_to_group_dict = {"country":"country_variables",
                           "party":"party_id",
                           "date":"observation_variables",
                           "edate":"observation_variables",
                           "totals":"observation_variables",
                           "totals_immi":"observation_variables",
                           "totals_inti":"observation_variables",
                           
                           "saliency":"immigration_integration_pos",
                           "saliency_immi":"immigration_integration_pos",
                           "saliency_inti":"immigration_integration_pos",
                           "immi_pos":"immigration_integration_pos",
                           "immi_pos_saliency":"immigration_integration_pos",
                           "inti_pos":"immigration_integration_pos",
                           "inti_pos_saliency":"immigration_integration_pos",
                           
                           "partyname":"party_names",
                           "partyfacts_id":"party_id",
                           "year":"observation_variables",
                           "dataset":"source_variables",'dataset_index':'source_variables',
                           "countryname":"country_variables",
                          }


pimpo_var_description = {
 'country': 'Harmonised country code',
 'countryname': 'Country name',
 'party': 'MARPOR party id',
 'partyname' : 'MARPOR party name',
 'partyfacts_id': "Partyfacts unique party id",
 'dataset': 'Source dataset',
 'date': 'Election date',
 'edate': 'Election date(datetime)',
 'year': 'Election year',
 'totals': 'Total QS coded',
 'totals_immi': 'Total QS on immigration',
 'totals_inti': 'Total QS on integration',
 'saliency': 'Fraction of QS on immi + inte',
 'saliency_immi': 'Fraction of QS on immigration',
 'saliency_inti': 'Fraction of QS on integration',
 'immi_pos': '(Pos ImmigQS-Neg ImmigQS)/(Pos + Neg + Neut ImmigQS)',
 'immi_pos_saliency': '1-(Neut ImmigQS/(Pos + Neg + Neut ImmigQS))',
 'inti_pos': '(Pos IntiQS-Neg IntiQS)/(Pos + Neg + Neut IntiQS)',
 'inti_pos_saliency': '1-(Neut IntiQS/(Pos + Neg + Neut IntiQS))',
 
}

# pimpo_group_to_type_dict = {'party_names':'obs_vars','country_variables':'obs_vars','observation_variables':'obs_vars',
#  'party_id':'obs_vars',
#  'source_variables':'dataset_vars',
#  'party_electoral':'elec_vars',
#  'dummied_variables':'placeholder!',
#  'dummiable_variables':'dropped',
#  'policy_dimensions':'expert_op', 'ideological_dimensions':'expert_op',
#        'eu_policy_questions':'expert_op', 'eu_integration_admission':'expert_op',
#        'leadership_divisions':'expert_op', 'ukraine_specific_questions':'expert_op',
#        'party_characteristics':'expert_op', 'eu_membership_requirements':'expert_op',
#        'immigration_integration_pos':'man_cod',          
                           
#                            }
 
 
 # country and year excepted as a shared variables
PImPo_vl["description"] = PImPo_vl["variable"].apply(lambda x: pimpo_var_description[x] if x in pimpo_var_description.keys() else np.nan )    
PImPo_vl["group"] = PImPo_vl["variable"].apply(lambda x: pimpo_var_to_group_dict[x] if x in pimpo_var_to_group_dict.keys() else np.nan )
PImPo_vl["type"] = PImPo_vl["group"].apply(lambda x: group_to_type_dict[x] if x in group_to_type_dict.keys() else np.nan )
PImPo_vl.loc[PImPo_vl["variable"].apply(lambda x: x in shared_columns),"type"] = "cross-dataset"

PImPo_vl["description"] = PImPo_vl["variable"].apply(lambda x: shared_description[x] if x in shared_description.keys() else x )

In [393]:
merged_vl = pd.concat([PImPo_vl,
    parlgov_party_vl,
    pf_core_vl,
    emp_vl,
    pip_ideol_vl,
    cmp_vl,
    CHES_vl,],axis=0)

merged_vl = merged_vl.drop_duplicates()

In [394]:
em_to_mcp_mapping = pd.read_csv( dataset_folder+"CMP"+os.sep+"emp_mpds_mapping.csv" )
em_to_mcp_mapping = em_to_mcp_mapping[['emp_2014', 'value', 'mp_hb4', 'mp_hb5']]

no_mapping = (em_to_mcp_mapping["mp_hb5"]=="0")
em_to_mcp_mapping["latest"] = em_to_mcp_mapping["mp_hb5"]
em_to_mcp_mapping.loc[no_mapping,"latest"] = "emp_"+em_to_mcp_mapping.loc[no_mapping,"emp_2014"]

mapping_dictionary = em_to_mcp_mapping.set_index("emp_2014")["latest"].to_dict()

# euromanifesto = euromanifesto.rename(columns=mapping_dictionary)

In [395]:
# set.intersection(set(euromanifesto.columns),set(manifesto_project.columns))

In [396]:
# mapping_dictionary

In [397]:
# [x for x in euromanifesto.columns if "per" in x]

# which 'governmental frame is the comparable one?'
# or the sum of all 3?

## boo - frustrating!

## ... I guess

In [398]:
df = pd.concat([CHES_comb,manifesto_project,pip_ideol,euromanifesto,PImPo])

In [400]:
df_means = df.groupby("partyfacts_id").mean()

# parlgov_party
# df_means
parlgov_party.set_index("partyfacts_id").shape, df_means.shape
# ((1651, 27), (1248, 1135))
# pd.Series(df_means.append( parlgov_party.set_index("partyfacts_id") ).index).value_counts()
# pd.Series(df_means.index).value_counts()
pf_index_int = set(df_means.index)

# parlgov_party.set_index("partyfacts_id").loc[df_means.index]
# pf_index_int = list(set.intersection(set(df_means.index),set(parlgov_party["partyfacts_id"])))
# [parlgov_party["partyfacts_id"]].apply(lambda x: x in pf_index_int
df_merged = df_means.merge( parlgov_party, how='left', on= "partyfacts_id")
df_merged = df_merged.merge( pf_core, how='left', on= "partyfacts_id")


##################

mask = merged_vl.apply(lambda x: x["type"] in ['man_cod', 'dummy_vars', 'expert_op', 'id_dim', 'elec_vars',
       'coder_ratings', ],axis=1)

reasonable_colums = [x for x in df_merged.columns if x in merged_vl["integrated_variable"][mask].values]
reasonable_colums = reasonable_colums + ['(pimpo)totals', '(pimpo)totals_immi', '(pimpo)totals_inti', '(manifesto)total','partyfacts_id']

df_merged_small = df_merged[reasonable_colums].copy()
df_merged_small["partyfacts_id"] = df_merged_small["partyfacts_id"].astype(int)
df_merged_small = df_merged_small.set_index("partyfacts_id")


# set partyfacts_id to index

# curious about these!
df_merged = df_merged_small

In [447]:
# parlgov_party["(parlgov)left_right"]
# merged_vl["type"].value_counts(dropna=False).index


# merged_vl[merged_vl["variable"].apply(lambda x: x in ["new","description","comment"])]

df_merged[merged_vl[merged_vl["type"]=="dummy_vars"]["integrated_variable"]].sum().sort_values().head(59)

(parlgov)family__no family                                            0.000000
(ches)mip_one__multiculturalism                                       1.000000
(ees14)mfamily__coalition                                             1.000000
(ches)mip_three__environment                                          2.000000
(ches)mip_one__social lifestyle                                       3.000000
(ches)mip_two__environment                                            3.000000
(ches)mip_two__urban vs rural                                         3.500000
(ees14)ofamily__agrarian                                              3.500000
(ees14)pfamily__agrarian                                              3.833333
(ches)mip_one__civil liberties                                        4.000000
(ches)intgroup__alliance of democrats                                 4.000000
(ches)mip_two__ethnic minorities                                      4.000000
(ches)mip_two__religious principles                 

{'per101b': 'per102',
 'per101a': 'per101',
 'per1011b': 'per102',
 'per1011a': 'per101',
 'per10111b': 'per102',
 'per10111a': 'per101',
 'per10112b': 'per102',
 'per10112a': 'per101',
 'per1012b': 'per102',
 'per1012a': 'per101',
 'per1013b': 'per102',
 'per1013a': 'per101',
 'per1014b': 'per102',
 'per1014a': 'per101',
 'per1015b': 'per102',
 'per1015a': 'per101',
 'per103b': 'emp_per103b',
 'per103a': 'per103_1',
 'per104b': 'per105',
 'per104a': 'per104',
 'per1041b': 'per105',
 'per1041a': 'per104',
 'per1042b': 'per105',
 'per1042a': 'per104',
 'per1043b': 'per105',
 'per1043a': 'per104',
 'per106b': 'emp_per106b',
 'per106a': 'per106',
 'per1061b': 'emp_per1061b',
 'per1061a': 'per106',
 'per1062b': 'emp_per1062b',
 'per1062a': 'per106',
 'per1063b': 'emp_per1063b',
 'per1063a': 'per106',
 'per107b': 'per109',
 'per107a': 'per107',
 'per108b': 'per110',
 'per108a': 'per108',
 'per1081b': 'per110',
 'per1081a': 'per108',
 'per1082b': 'per108',
 'per1082a': 'per110',
 'per2011b':

In [296]:
# df_merged.select_dtypes('object')

In [26]:
# parlgov_party
# int_pgov = parlgov_party[parlgov_party["partyfacts_id"].notnull()].copy()
# int_pgov["partyfacts_id"] = int_pgov["partyfacts_id"].astype(int)
# int_pgov = int_pgov.set_index("partyfacts_id")


# int_pgov.loc[pf_index_int]

# # match on 

# # df_means[int_pgov.columns][] = int_pgov.loc[pf_index_int]#,int_pgov.columns]
# df_means.loc[pf_index_int][int_pgov.columns] = int_pgov.loc[pf_index_int][int_pgov.columns]

# df[[x for x in df.columns if re.search("country$",x)]].drop_duplicates()[100:150]

In [27]:
# partyfacts_external_parties[partyfacts_external_parties["partyfacts_id"]==4630]

In [28]:
# df[df["partyfacts_id"]==57][[x for x in df.columns if re.search("country$",x)]]

In [29]:
# CHES_comb[ CHES_comb["partyfacts_id"]==602 ]#[[x for x in CHES_comb.columns if "id" in x]+["(ches)source"]]

In [30]:
# partyfacts_external_parties[ (partyfacts_external_parties["dataset_key"]=='ches')
#                             &(partyfacts_external_parties["dataset_party_id"]=="2618")]

# #
# #[partyfacts_external_parties["partyfacts_id"]==4630]

In [31]:
# partyfacts_external_parties[ (partyfacts_external_parties["dataset_party_id"]=="602")]

In [98]:
# df.groupby("partyfacts_id").first()[[x for x in df.columns if re.search("country$",x)]].drop_duplicates().apply(lambda x: len(x.dropna().unique()),axis=1).sort_values(ascending=False).head(50)

# 4714.0    2  BUG INTRODUCED BY ME (manual id dict issue in "integrating party datasets") -> FIXED 
# 4630.0    2  BUG INTRODUCED BY ME (country translation dict issue in "chapel hill datasets") -> FIXED 
# 602.0     2  BUG INTRODUCED BY ME (country translation dict issue in "chapel hill datasets") -> FIXED 
# 1328.0    2  BUG INTRODUCED BY ME (country translation dict issue in "chapel hill datasets") -> FIXED 
# 57.0      2  BUG INTRODUCED BY ME (country translation dict issue in "chapel hill datasets") -> FIXED 

# Done - countries now match between all datasets!

In [564]:
# %%time
# corrs = df_merged_small.corr()
# # ["partyfacts_id"]

In [419]:
## we should normalize countries too!

In [565]:
def not_zero_or_nan(x):
    if pd.isnull(x):
        return False
    elif x==0:
        return False
    else:
        return True

In [421]:
%%time
corrs = df_merged_small.corr()
corrs = corrs.where(np.triu(np.ones(corrs.shape),k=2).astype(np.bool))
df_corrs = corrs.stack().reset_index()
df_corrs.columns = ['Row','Column','Value']
# df.drop(df.index[df["Value"]==1.0],inplace=True)


df_corrs["NZ"] = df_corrs.apply(lambda x: (df_merged[x["Row"]].apply(not_zero_or_nan)&df_merged[x["Column"]].apply(not_zero_or_nan)).sum(),axis=1)
# 36 minutes here!
df_corrs["N"] = df_corrs.apply(lambda x: (df_merged[x["Row"]].notnull()&df_merged[x["Column"]].notnull()).sum(),axis=1)

# df_corrs.sort_values(by="Value")

Wall time: 43min 18s


In [423]:
# (df_merged["(ches)mip_one__social lifestyle"]==0)
df_corrs

Unnamed: 0,Row,Column,Value,NZ,N
0,(ches)vote,(ches)future,0.245066,132,148
1,(ches)vote,(ches)eu_ep,0.124846,343,356
2,(ches)vote,(ches)eu_fiscal,0.065551,132,148
3,(ches)vote,(ches)eu_employ,0.147335,208,222
4,(ches)vote,(ches)eu_cohesion,0.172689,360,375
...,...,...,...,...,...
589568,(pf)share_year,(pimpo)totals_inti,0.070380,98,98
589569,(pf)share_year,(manifesto)total,0.169961,1033,1033
589570,(pimpo)totals,(pimpo)totals_inti,0.738844,99,99
589571,(pimpo)totals,(manifesto)total,0.829728,109,109


In [424]:
# df_merged["(ches)conflict_res"].value_counts()

In [425]:
df_corrs[df_corrs["N"]>10].sort_values(by="Value").head(50)

Unnamed: 0,Row,Column,Value,NZ,N
78646,(ches)conflict_res,(ches)russia_role,-0.987668,12,12
78645,(ches)conflict_res,(ches)languages,-0.987145,12,12
78741,(ches)region_status,(ches)russia_role,-0.962091,12,12
70834,(ches)eu_good_gov,(ches)conflict_res,-0.937574,12,12
54951,(ches)cosmo,(ches)region_status,-0.921834,12,12
1149,(ches)eu_position,(ches)eu_benefit,-0.921185,269,336
71060,(ches)eu_good_gov,(parlgov)liberty_authority,-0.889403,16,16
499996,(ees14)per_v2_404b,(pf)year_last,-0.886484,1,26
471083,(ees14)per_v1_316a,(pf)year_last,-0.886484,1,26
65205,(ches)eu_benefit,(ches)eu_good_gov,-0.883873,10,12


In [426]:
df_corrs[df_corrs["N"]>10].sort_values(by="Value").tail(50)

Unnamed: 0,Row,Column,Value,NZ,N
425674,(ees14)per_v3_20122b,(ees14)per_v3_4087b,0.977018,1,181
424313,(ees14)per_v1_20122b,(ees14)per_v3_20122b,0.97703,2,181
240041,(manifesto)per1026,(ees14)per_v3_4011a,0.977419,1,178
355208,(ees14)per_v1_10111b,(ees14)per_v1_1012b,0.978043,2,181
588429,(ees14)pfamily__special issue,(ees14)ofamily__special issue,0.978258,15,249
169501,(ches)epgroup__Party of European Socialists (PES),(ches)intgroup__socialist international,0.978982,27,191
280951,(manifesto)per305_6,(ees14)per_v1_20121b,0.97901,1,127
588227,(ees14)pfamily__liberal,(ees14)ofamily__liberal,0.979066,38,249
588330,(ees14)pfamily__regional,(ees14)ofamily__regional,0.97921,24,249
522987,(ees14)per_v3_4084b,(ees14)per_v1_5012b,0.980254,1,181


In [427]:
not_id_vars = df_corrs[["Row","Column"]].applymap(lambda x: ("id" not in x) & (x not in shared_columns)).all(axis=1)

df_corrs_non_id = df_corrs[not_id_vars].copy()

df_corrs_non_id["Row_ds"] = df_corrs_non_id["Row"].apply(lambda x: (x.split("(")[1]).split(")")[0])
df_corrs_non_id["Col_ds"] = df_corrs_non_id["Column"].apply(lambda x: (x.split("(")[1]).split(")")[0])
df_corrs_ni_nm = df_corrs_non_id[df_corrs_non_id["Row_ds"]!=df_corrs_non_id["Col_ds"]].copy()

In [485]:
df_corrs_ni_nm[df_corrs_ni_nm["NZ"]>20].sort_values(by="Value").tail(100)[0:50]

Unnamed: 0,Row,Column,Value,NZ,N,Row_ds,Col_ds
40641,(ches)redist,(ees14)pubservice,0.786904,114,114,ches,ees14
51789,(ches)multicult,(parlgov)liberty_authority,0.787298,370,370,ches,parlgov
36431,(ches)pubserv,(ees14)pubservice,0.788116,114,114,ches,ees14
10936,(ches)lrgen,(pip)Index Franzmann/Kaiser: Left-Right,0.789114,192,192,ches,pip
210810,(manifesto)per414,(pip)Index Budge et al.: Market Economy,0.789905,446,555,manifesto,pip
36428,(ches)pubserv,(ees14)state,0.790288,217,217,ches,ees14
161298,(ches)family__socialist,(parlgov)family__socialist,0.79142,40,327,ches,parlgov
43426,(ches)civlib,(parlgov)liberty_authority,0.79152,363,363,ches,parlgov
300518,(manifesto)family__christdem,(ees14)ofamily__christdem,0.796394,21,205,manifesto,ees14
156680,(ches)family__liberal,(ees14)pfamily__liberal,0.796969,29,210,ches,ees14


In [429]:
df_corrs_ni_nm[df_corrs_ni_nm["NZ"]>20].sort_values(by="Value").head(50)

Unnamed: 0,Row,Column,Value,NZ,N,Row_ds,Col_ds
1474,(ches)eu_position,(ees14)integration,-0.856445,241,241,ches,ees14
2495,(ches)future,(ees14)integration,-0.853652,110,115,ches,ees14
66208,(ches)eu_benefit,(parlgov)eu_anti_pro,-0.823702,237,293,ches,parlgov
339904,(ees14)integration,(parlgov)eu_anti_pro,-0.815623,235,236,ees14,parlgov
65533,(ches)eu_benefit,(ees14)pro_anti_EU,-0.808238,165,217,ches,ees14
9874,(ches)eu_foreign,(ees14)integration,-0.803664,234,234,ches,ees14
72402,(ches)eu_budgets,(ees14)integration,-0.794761,181,181,ches,ees14
97698,(ches)anti_islam_rhetoric,(pimpo)immi_pos,-0.779071,61,64,ches,pimpo
3546,(ches)eu_ep,(ees14)integration,-0.751806,232,232,ches,ees14
6666,(ches)eu_cohesion,(ees14)integration,-0.74616,234,234,ches,ees14


In [459]:
df_merged[[x for x in df_merged if "family" in x]].corr().dropna(how='all')[[x for x in df_merged if "mfamily" in x]].loc[[x for x in df_merged if "(manifesto)family" in x]]

Unnamed: 0,(ees14)mfamily__agrarian,(ees14)mfamily__christdem,(ees14)mfamily__coalition,(ees14)mfamily__cons,(ees14)mfamily__green,(ees14)mfamily__liberal,(ees14)mfamily__rad left,(ees14)mfamily__rad right,(ees14)mfamily__regional,(ees14)mfamily__socialist,(ees14)mfamily__special issue
(manifesto)family__agrarian,0.933163,-0.067344,,-0.063803,-0.052018,-0.082243,-0.065029,-0.055452,-0.051992,-0.087033,-0.040517
(manifesto)family__christdem,-0.07131,0.913433,,-0.126158,-0.102855,-0.16262,-0.128583,-0.05726,-0.102804,-0.134402,-0.080115
(manifesto)family__coalition,,,,,,,,,,,
(manifesto)family__cons,-0.074193,-0.093148,,0.871103,-0.107013,-0.13099,-0.08865,-0.114079,-0.106959,-0.105216,-0.083354
(manifesto)family__green,-0.055743,-0.104092,,-0.098618,1.0,-0.12712,-0.100513,-0.085711,-0.080362,-0.134524,-0.062626
(manifesto)family__liberal,-0.085856,-0.120333,,-0.151891,-0.123835,0.904719,-0.15481,-0.132012,-0.123773,-0.16934,-0.096457
(manifesto)family__rad left,-0.068041,-0.127057,,-0.120375,-0.09814,-0.155165,0.976392,-0.10462,-0.098091,-0.164203,-0.076443
(manifesto)family__rad right,0.019062,-0.120586,,-0.114244,-0.093142,-0.115347,-0.116439,0.872361,-0.032333,-0.15584,0.002622
(manifesto)family__regional,-0.055743,-0.104092,,-0.040581,-0.080402,-0.12712,-0.100513,-0.085711,0.931475,-0.12877,-0.062626
(manifesto)family__socialist,-0.087711,-0.157938,,-0.142959,-0.126511,-0.166183,-0.158155,-0.134864,-0.126447,0.900042,-0.098541


In [473]:
df_first = df.groupby("partyfacts_id").first()
[x for x in df_first.columns if "mfamily" in x]

discrepancies = df_first[["(ees14)mfamily","(manifesto)family"]].apply(lambda x: True if pd.notnull(x["(ees14)mfamily"]) &\
                                                      pd.notnull(x["(manifesto)family"]) &\
                                                      (x["(ees14)mfamily"]!=x["(manifesto)family"]) \
                                                      else False, axis=1)

In [475]:
df_first[["(ees14)mfamily","(manifesto)family"]][discrepancies].merge( pf_core, how='left', on= "partyfacts_id")

Unnamed: 0,partyfacts_id,(ees14)mfamily,(manifesto)family,country,(pf)name_short,(pf)name,(pf)name_english,(pf)name_other,(pf)year_first,(pf)year_last,(pf)share,(pf)share_year,(pf)new,(pf)wikipedia,(pf)description,(pf)comment,(pf)created,(pf)modified,dataset,dataset_index
0,45.0,socialist,liberal,nld,D66,Democraten 66,Democrats 66,,1966,,15.5,1994.0,,https://en.wikipedia.org/wiki/Democrats_66,,,2012-12-29 10:52:34.820000+00:00,2018-07-07 20:07:29.344645+00:00,pf,4592
1,139.0,cons,regional,esp,CiU,Convergència i Unió,Convergence and Unity,,1977,2015.0,5.1,1989.0,False,https://en.wikipedia.org/wiki/Convergence_and_...,,,2012-12-29 10:52:51.970000+00:00,2018-07-07 20:09:33.885712+00:00,pf,5936
2,461.0,christdem,liberal,lva,LPP/LC,Latvijas Pirmā partija / Latvijas Ceļš,Latvian First Party / Latvian Way Party,,2006,2011.0,8.6,2006.0,,https://en.wikipedia.org/wiki/LPP/LC,,,2012-12-29 10:53:51.824000+00:00,2018-07-14 12:21:18.326124+00:00,pf,3786
3,472.0,cons,socialist,svn,SDS,Slovenska demokratska stranka,Slovenian Democratic Party,,1989,,29.3,2008.0,,https://en.wikipedia.org/wiki/Social_Democrati...,,,2012-12-29 10:53:53.970000+00:00,2018-07-07 20:09:18.119613+00:00,pf,5767
4,536.0,christdem,cons,dnk,KF,Konservative Folkeparti,Conservative People's Party,,1915,,23.7,1901.0,,https://en.wikipedia.org/wiki/Konservative_Fol...,,,2012-12-29 10:54:06.352000+00:00,2018-07-07 20:02:58.338165+00:00,pf,1642
5,553.0,regional,rad right,bel,VB,Vlaams Blok / Belang,Flemish Block / Interest,,1978,2004.0,12.0,2007.0,,https://en.wikipedia.org/wiki/Vlaams_Blok,,,2012-12-29 10:54:09.626000+00:00,2018-07-14 12:21:18.308649+00:00,pf,530
6,768.0,liberal,special issue,ita,IdV,Lista di Pietro -- Italia del Valori,List Di Pietro -- Italy of Values,,2001,,4.5,2008.0,,https://en.wikipedia.org/wiki/Italia_dei_Valori,,,2012-12-29 10:54:51.936000+00:00,2018-07-07 20:05:43.382873+00:00,pf,3321
7,800.0,socialist,cons,dnk,CD,Centrumdemokraterne,Centre Democrats,,1976,2005.0,8.3,1981.0,,https://en.wikipedia.org/wiki/CentrumDemokraterne,,,2012-12-29 10:54:58.153000+00:00,2018-07-07 20:02:57.454036+00:00,pf,1638
8,802.0,liberal,socialist,ita,PD,Partito Democratico,Democratic Party,,2007,,34.0,2008.0,,https://en.wikipedia.org/wiki/Democratic_Party...,,,2012-12-29 10:54:58.533000+00:00,2018-07-07 20:05:34.501399+00:00,pf,3276
9,852.0,rad left,cons,lva,V,Vienotība,Unity,,2010,,31.9,2010.0,,https://en.wikipedia.org/wiki/Unity_%28Latvia%29,,,2012-12-29 10:55:08.498000+00:00,2018-07-07 20:06:32.145733+00:00,pf,3826


In [539]:
emp_to_cmp_maps = {"(ees14)"+k.replace("per","per_v1_"):"(manifesto)"+v for k,v in mapping_dictionary.items() if "emp" not in v}
emp_to_cmp_maps.update({"(ees14)"+k.replace("per","per_v2_"):"(manifesto)"+v for k,v in mapping_dictionary.items() if "emp" not in v})
emp_to_cmp_maps.update({"(ees14)"+k.replace("per","per_v3_"):"(manifesto)"+v for k,v in mapping_dictionary.items() if "emp" not in v})
emp_to_cmp_maps = [(v,k) for k,v in emp_to_cmp_maps.items()]

In [523]:
# [x for x in emp_to_cmp_maps if x in list(df_corrs_ni_nm.set_index(["Row","Column"]).index.values)]
emp_to_cmp_maps

[('(ees14)per_v1_101b', '(manifesto)per102'),
 ('(ees14)per_v1_101a', '(manifesto)per101'),
 ('(ees14)per_v1_1011b', '(manifesto)per102'),
 ('(ees14)per_v1_1011a', '(manifesto)per101'),
 ('(ees14)per_v1_10111b', '(manifesto)per102'),
 ('(ees14)per_v1_10111a', '(manifesto)per101'),
 ('(ees14)per_v1_10112b', '(manifesto)per102'),
 ('(ees14)per_v1_10112a', '(manifesto)per101'),
 ('(ees14)per_v1_1012b', '(manifesto)per102'),
 ('(ees14)per_v1_1012a', '(manifesto)per101'),
 ('(ees14)per_v1_1013b', '(manifesto)per102'),
 ('(ees14)per_v1_1013a', '(manifesto)per101'),
 ('(ees14)per_v1_1014b', '(manifesto)per102'),
 ('(ees14)per_v1_1014a', '(manifesto)per101'),
 ('(ees14)per_v1_1015b', '(manifesto)per102'),
 ('(ees14)per_v1_1015a', '(manifesto)per101'),
 ('(ees14)per_v1_103a', '(manifesto)per103_1'),
 ('(ees14)per_v1_104b', '(manifesto)per105'),
 ('(ees14)per_v1_104a', '(manifesto)per104'),
 ('(ees14)per_v1_1041b', '(manifesto)per105'),
 ('(ees14)per_v1_1041a', '(manifesto)per104'),
 ('(ees14)pe

In [528]:
# df_corrs_ni_nm[df_corrs_ni_nm["Row"]=='(manifesto)per102'].head(50)
df_corrs_ni_nm[df_corrs_ni_nm["Column"]=='(ees14)per_v3_101b'].head(50)

Unnamed: 0,Row,Column,Value,NZ,N,Row_ds,Col_ds
399,(ches)vote,(ees14)per_v3_101b,-0.013168,9,186,ches,ees14
1488,(ches)eu_position,(ees14)per_v3_101b,7.5e-05,10,189,ches,ees14
2509,(ches)future,(ees14)per_v3_101b,-0.222186,3,89,ches,ees14
3560,(ches)eu_ep,(ees14)per_v3_101b,0.034204,10,185,ches,ees14
4578,(ches)eu_fiscal,(ees14)per_v3_101b,-0.228733,3,89,ches,ees14
5617,(ches)eu_employ,(ees14)per_v3_101b,-0.157231,5,126,ches,ees14
6680,(ches)eu_cohesion,(ees14)per_v3_101b,0.027583,10,185,ches,ees14
7752,(ches)eu_environ,(ees14)per_v3_101b,-0.147682,5,127,ches,ees14
8812,(ches)eu_asylum,(ees14)per_v3_101b,-0.024277,8,172,ches,ees14
9888,(ches)eu_foreign,(ees14)per_v3_101b,0.002686,10,185,ches,ees14


In [518]:
emp_to_cmp_maps

[('(ees14)per_v1_101b', '(manifesto)per102'),
 ('(ees14)per_v1_101a', '(manifesto)per101'),
 ('(ees14)per_v1_1011b', '(manifesto)per102'),
 ('(ees14)per_v1_1011a', '(manifesto)per101'),
 ('(ees14)per_v1_10111b', '(manifesto)per102'),
 ('(ees14)per_v1_10111a', '(manifesto)per101'),
 ('(ees14)per_v1_10112b', '(manifesto)per102'),
 ('(ees14)per_v1_10112a', '(manifesto)per101'),
 ('(ees14)per_v1_1012b', '(manifesto)per102'),
 ('(ees14)per_v1_1012a', '(manifesto)per101'),
 ('(ees14)per_v1_1013b', '(manifesto)per102'),
 ('(ees14)per_v1_1013a', '(manifesto)per101'),
 ('(ees14)per_v1_1014b', '(manifesto)per102'),
 ('(ees14)per_v1_1014a', '(manifesto)per101'),
 ('(ees14)per_v1_1015b', '(manifesto)per102'),
 ('(ees14)per_v1_1015a', '(manifesto)per101'),
 ('(ees14)per_v1_103a', '(manifesto)per103_1'),
 ('(ees14)per_v1_104b', '(manifesto)per105'),
 ('(ees14)per_v1_104a', '(manifesto)per104'),
 ('(ees14)per_v1_1041b', '(manifesto)per105'),
 ('(ees14)per_v1_1041a', '(manifesto)per104'),
 ('(ees14)pe

In [553]:
df_corrs_ni_nm_ind = df_corrs_ni_nm.set_index(["Row","Column"])

emp_to_cmp_maps = [x for x in emp_to_cmp_maps if x in list(df_corrs_ni_nm_ind.index)]


# emp_to_cmp_df = df_corrs_ni_nm.set_index(["Row","Column"]).loc[emp_to_cmp_maps]
# emp_to_cmp_df



# .loc[[("(ches)vote","(manifesto)pervote"),
#                                           ("(ches)vote","(manifesto)pervote"),
#                                           ('(manifesto)per102','(ees14)per_v1_101b', )]]

# df_corrs_ni_nm.set_index(["Row","Column"]).index.values

In [554]:
emp_to_cmp_df = df_corrs_ni_nm.set_index(["Row","Column"]).loc[emp_to_cmp_maps]
emp_to_cmp_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,NZ,N,Row_ds,Col_ds
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(manifesto)per102,(ees14)per_v1_101b,0.136782,20,226,manifesto,ees14
(manifesto)per101,(ees14)per_v1_101a,0.327090,83,226,manifesto,ees14
(manifesto)per102,(ees14)per_v1_1011b,-0.031570,1,207,manifesto,ees14
(manifesto)per101,(ees14)per_v1_1011a,0.017238,16,207,manifesto,ees14
(manifesto)per102,(ees14)per_v1_10111b,-0.002120,1,151,manifesto,ees14
...,...,...,...,...,...,...
(manifesto)per706,(ees14)per_v3_706a,0.049410,37,226,manifesto,ees14
(manifesto)per706,(ees14)per_v3_7061a,0.068662,66,226,manifesto,ees14
(manifesto)per706,(ees14)per_v3_7062a,0.115359,29,226,manifesto,ees14
(manifesto)per706,(ees14)per_v3_7063a,0.151174,59,226,manifesto,ees14


In [556]:
emp_to_cmp_df.sort_values(by='Value').head(50)

# (manifesto)per705	(ees14)per_v3_7051a	-0.104599	24	226	manifesto	ees14
# Underprivileged Minority Groups
# Very general favourable references to underprivileged minorities who
# are defined neither in economic nor in demographic terms (e.g. the
# handicapped, homosexuals, immigrants, indigenous). Only includes
# favourable statements that cannot be classified in other categories
# (e.g. 503, 504, 604, 607 etc.)
# UMG: Handicapped+ 



Unnamed: 0_level_0,Unnamed: 1_level_0,Value,NZ,N,Row_ds,Col_ds
Row,Column,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(manifesto)per108,(ees14)per_v1_1082b,-0.144478,7,151,manifesto,ees14
(manifesto)per705,(ees14)per_v3_7051a,-0.104599,24,226,manifesto,ees14
(manifesto)per706,(ees14)per_v1_7063a,-0.103755,92,226,manifesto,ees14
(manifesto)per705,(ees14)per_v1_7054a,-0.09224,22,207,manifesto,ees14
(manifesto)per701,(ees14)per_v1_4081a,-0.09128,128,226,manifesto,ees14
(manifesto)per201_1,(ees14)per_v1_2011a,-0.089523,41,150,manifesto,ees14
(manifesto)per305_3,(ees14)per_v2_305a,-0.081742,37,150,manifesto,ees14
(manifesto)per408,(ees14)per_v3_40823a,-0.081729,9,151,manifesto,ees14
(manifesto)per108,(ees14)per_v3_314a,-0.080537,4,178,manifesto,ees14
(manifesto)per408,(ees14)per_v1_408a,-0.080256,148,226,manifesto,ees14


In [None]:
# (manifesto)per701	(ees14)per_v1_4081a	-0.091280	128	226	manifesto	ees14
# (manifesto)per701: Labour Groups Positive
# (ees14)per_v1_4081a: Creating Jobs (Positive) - National Level


# (manifesto)per408	(ees14)per_v1_408a	-0.080256	148	226	manifesto	ees14
# per408: Economic Goals (unspecified in any other category)
# per_v1_408a: Economic Goals: General - National Level

# (manifesto)per408	(ees14)per_v2_408a	-0.053688	177	226	manifesto	ees14
# per408: Economic Goals (unspecified in any other category)
# per_v1_408a: Economic Goals: General - European Level

# (manifesto)per202_1	(ees14)per_v1_202a	-0.042147	110	150	manifesto	ees14
# (manifesto)per202_1:Democracy General: Positive
# (ees14)per_v1_202a:Democracy+ - National

In [552]:
euromanifesto['(ees14)per_v3_70531a'].sum()

# df_corrs.set_index(["Row","Column"]).loc['(manifesto)per705', '(ees14)per_v2_70531a']

0.0

In [488]:
df_corrs_ni_nm["Row"].apply(lambda x: )

Unnamed: 0,Row,Column,Value,NZ,N,Row_ds,Col_ds
183,(ches)vote,(manifesto)pervote,0.926617,317,320,ches,manifesto
184,(ches)vote,(manifesto)absseat,0.644378,314,320,ches,manifesto
185,(ches)vote,(manifesto)totseats,-0.093048,322,325,ches,manifesto
186,(ches)vote,(manifesto)peruncod,0.011273,276,325,ches,manifesto
187,(ches)vote,(manifesto)per101,0.026725,213,325,ches,manifesto
...,...,...,...,...,...,...,...
589567,(pf)share_year,(pimpo)totals_immi,0.124098,94,94,pf,pimpo
589568,(pf)share_year,(pimpo)totals_inti,0.070380,98,98,pf,pimpo
589569,(pf)share_year,(manifesto)total,0.169961,1033,1033,pf,manifesto
589571,(pimpo)totals,(manifesto)total,0.829728,109,109,pimpo,manifesto


In [480]:
### Errors in assigning family???

manifesto_project[manifesto_project["partyfacts_id"]==461] # liberal???
euromanifesto[euromanifesto["partyfacts_id"]==852] # rad left???
euromanifesto[euromanifesto["partyfacts_id"]==1308] # socialist is pushing it - they were *once* in a coalition *with* a socialist party
euromanifesto[euromanifesto["partyfacts_id"]==1359] # similiarly, everyone pegs these are centre-right, liberal conservatives


Unnamed: 0,(ees14)za_nr,(ees14)version,(ees14)doi,(ees14)partyname,(ees14)initials,year,(ees14)EPseats,country,(ees14)country_sub_region,(ees14)country_year,...,(ees14)mfamily__cons,(ees14)mfamily__green,(ees14)mfamily__liberal,(ees14)mfamily__rad left,(ees14)mfamily__rad right,(ees14)mfamily__regional,(ees14)mfamily__socialist,(ees14)mfamily__special issue,dataset,dataset_index
91,5102.0,2.0.0 (2018-01-02),doi:10.4232/1.12830,Partido Popular,PP,1987,4.0,prt,,35_87,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ees14,91
225,5102.0,2.0.0 (2018-01-02),doi:10.4232/1.12830,Partido Popular,PP,1994,3.0,prt,,35_94,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ees14,225
356,5102.0,2.0.0 (2018-01-02),doi:10.4232/1.12830,Partido Popular,PP,1999,2.0,prt,,35_99,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ees14,356
682,5102.0,2.0.0 (2018-01-02),doi:10.4232/1.12830,Partido Popular,PP,2009,2.0,prt,,35_09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ees14,682


In [484]:
# df_merged["(ches)mip_one__social lifestyle"].value_counts(dropna=False)
# df_merged["(emp)per_v3_6021b"].value_counts(dropna=False)
CHES_comb[CHES_comb["(ches)enlarge"].notnull()]["(ches)source"].value_counts()

2002_CHES_dataset_means.dta    73
Name: (ches)source, dtype: int64

In [46]:
# (cmp)per1022	(emp)per_v1_409b	0.995901	178	cmp	emp
# (per1022)Western States: Negative
# Negative mentions of Western states, including the USA and Germany.
# (per_v1_409b) Keynesian Demand Management (negative, national-level)

# Spurious correlation because of sparse datasets?
# (ches)mip_one__social lifestyle
# (per_v3_6021b)EU Integration (negative, unspecified level)

In [47]:
# Comforting

# 182063	(cmp)party	(pip)Party code according to CMP	1.000000	555	cmp	pip
# 328913	(emp)marpor	(parlgov)cmp	1.000000	165	emp	parlgov
# 182097	(cmp)party	(emp)marpor	1.000000	205	cmp	emp
# 304166	(pip)Party code according to CMP	(emp)marpor	1.000000	195	pip	emp
# 572049	(pimpo)party	(parlgov)cmp	1.000000	83	pimpo	parlgov
# 328897	(emp)marpor	(pimpo)party	1.000000	58	emp	pimpo
# 182764	(cmp)party	(pimpo)party	1.000000	109	cmp	pimpo
# 304831	(pip)Party code according to CMP	(pimpo)party	1.000000	108	pip	pimpo

In [49]:
# df["per_v1_4012a"].sum()

In [51]:
# df["per_v2_4012a"].sum()

In [52]:
# df["per_v3_4012a"].sum()

In [None]:
mapping_dictionary

In [None]:
{k:v for (k,v) in mapping_dictionary.items() if v=="per101"}

In [None]:
[x for x in df.columns if "per102" in x]

In [None]:
df_corrs[df_corrs["Row"]=="per607_1"].sort_values(by="Value").tail(50)

In [None]:
# per607_1	per_v2_3141a	0.716565	150

# multiculturalism positive <-> pos mentions about European Central Bank???
print(df["per607_1"].notnull().sum())
df["per607_1"].value_counts()

In [None]:
df["per_v2_3141a"].notnull().sum()
# 977 instances
df["per_v2_3141a"].value_counts()
# 803 are zero

In [None]:
df_merged["per607_1"].value_counts()
# 392 party notnull values
# 170 0s

In [None]:
mapping_dictionary

In [None]:
grouped_by_party = df.groupby("partyfacts_id").mean()

In [None]:
grouped_by_party.notnull().sum().sort_values().tail(50)

In [None]:
df["per_v1_101a"].notnull().sum()

In [None]:
df.dtypes.value_counts()

In [None]:
df.select_dtypes('object')

In [None]:
euromanifesto.dtypes.value_counts()

In [None]:
euromanifesto.select_dtypes('category')

In [None]:
euromanifesto.select_dtypes('float64')

In [None]:
partyfacts_core_parties

In [None]:
partyfacts_core_parties

In [None]:
df_merged.dtypes.value_counts()

In [None]:
df_merged.select_dtypes('object').dropna(how='all')

In [None]:
df["country"].value_counts()

In [None]:
df_merged#["country"].describe()

# 1924 - 2019, 768 countries, 1258 parties

In [None]:
manifesto_project[(manifesto_project["country"]=="gbr")&(manifesto_project["(cmp)party"]==51320)]

In [None]:
51620,51420

manifesto_project[(manifesto_project["country"]=="cze")&(manifesto_project["(cmp)party"]==82320)]
                 #].sort_values(by="(cmp)pervote").tail(20)
#

In [481]:

# C:\Users\Marios\Documents\GitHub\Thomas Prosser project\datasets\Party Level
df_corrs_ni_nm.to_pickle(dataset_folder+"Party Level"+os.sep+ "df_corrs_ni_nm.zip",compression='zip')
df.to_pickle(dataset_folder+"Party Level"+os.sep+ "df.zip",compression='zip')
df_merged.to_pickle(dataset_folder+"Party Level"+os.sep+ "df_merged.zip",compression='zip')

In [55]:
df.shape,df_merged.shape

((11819, 1214), (1258, 1180))

In [59]:
df.dtypes.value_counts()

float64           1044
float32            109
object              58
datetime64[ns]       3
dtype: int64

In [60]:
df.select_dtypes('object')

Unnamed: 0,(ches)eumember,country,(ches)party,(ches)family,(ches)source,(ches)partyname1,(ches)partyname2,(ches)intgroup,(ches)epgroup,(ches)party_name,...,(emp)mfamily,(emp)ofamily,(emp)group,(emp)manif,(emp)gov,(emp)pm,(emp)ees_party,(emp)emcs_party,(pimpo)partyname,(pimpo)countryname
0,Yes,bel,PCB/KP,rad left,1984-1999_dataset_means.dta,,,,,,...,,,,,,,,,,
1,Yes,bel,PS,socialist,1984-1999_dataset_means.dta,,,,,,...,,,,,,,,,,
2,Yes,bel,SP,socialist,1984-1999_dataset_means.dta,,,,,,...,,,,,,,,,,
3,Yes,bel,ECOLO,green,1984-1999_dataset_means.dta,,,,,,...,,,,,,,,,,
4,Yes,bel,AGALEV,green,1984-1999_dataset_means.dta,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,,nzl,,,,,,,,,...,,,,,,,,,New Zealand National Party,New Zealand
238,,nzl,,,,,,,,,...,,,,,,,,,New Zealand First Party,New Zealand
239,,nzl,,,,,,,,,...,,,,,,,,,Māori Party,New Zealand
240,,nzl,,,,,,,,,...,,,,,,,,,Māori Party,New Zealand


In [566]:
df_means_prt_yr = df.groupby(["partyfacts_id","year"]).mean()

# parlgov_party
# df_means
# parlgov_party.set_index(["partyfacts_id",:]).shape, df_means.shape
# ((1651, 27), (1248, 1135))
# pd.Series(df_means.append( parlgov_party.set_index("partyfacts_id") ).index).value_counts()
# pd.Series(df_means.index).value_counts()
# pf_index_int = set(df_means.index)

# parlgov_party.set_index("partyfacts_id").loc[df_means.index]
# pf_index_int = list(set.intersection(set(df_means.index),set(parlgov_party["partyfacts_id"])))
# [parlgov_party["partyfacts_id"]].apply(lambda x: x in pf_index_int
# df_merged = df_means.merge( parlgov_party, how='left', on= ["partyfacts_id","year"])
# df_merged = df_merged.merge( pf_core, how='left', on= ["partyfacts_id","year"])


##################

mask = merged_vl.apply(lambda x: x["type"] in ['man_cod', 'dummy_vars', 'expert_op', 'id_dim', 'elec_vars',
       'coder_ratings', ],axis=1)

reasonable_colums = [x for x in df_means_prt_yr.columns if x in merged_vl["integrated_variable"][mask].values]
reasonable_colums = reasonable_colums + ['(pimpo)totals', '(pimpo)totals_immi', '(pimpo)totals_inti', '(manifesto)total','partyfacts_id']
reasonable_colums.remove('partyfacts_id')

df_merged_small = df_means_prt_yr[reasonable_colums].copy()
# df_merged_small["partyfacts_id"] = df_merged_small["partyfacts_id"].astype(int)
# df_merged_small = df_merged_small.set_index("partyfacts_id")


# set partyfacts_id to index

# curious about these!
# df_merged = df_merged_small

In [567]:
# df_merged_small

In [568]:
%%time
corrs = df_merged_small.corr()
corrs = corrs.where(np.triu(np.ones(corrs.shape),k=2).astype(np.bool))
df_corrs = corrs.stack().reset_index()
df_corrs.columns = ['Row','Column','Value']
# df.drop(df.index[df["Value"]==1.0],inplace=True)


df_corrs["NZ"] = df_corrs.apply(lambda x: (df_merged[x["Row"]].apply(not_zero_or_nan)&df_merged[x["Column"]].apply(not_zero_or_nan)).sum(),axis=1)
# 36 minutes here!
df_corrs["N"] = df_corrs.apply(lambda x: (df_merged[x["Row"]].notnull()&df_merged[x["Column"]].notnull()).sum(),axis=1)

# df_corrs.sort_values(by="Value")

Wall time: 35min 51s


In [569]:
not_id_vars = df_corrs[["Row","Column"]].applymap(lambda x: ("id" not in x) & (x not in shared_columns)).all(axis=1)

df_corrs_non_id = df_corrs[not_id_vars].copy()

df_corrs_non_id["Row_ds"] = df_corrs_non_id["Row"].apply(lambda x: (x.split("(")[1]).split(")")[0])
df_corrs_non_id["Col_ds"] = df_corrs_non_id["Column"].apply(lambda x: (x.split("(")[1]).split(")")[0])
df_corrs_ni_nm = df_corrs_non_id[df_corrs_non_id["Row_ds"]!=df_corrs_non_id["Col_ds"]].copy()

In [570]:

# C:\Users\Marios\Documents\GitHub\Thomas Prosser project\datasets\Party Level
df_corrs_ni_nm.to_pickle(dataset_folder+"Party Level"+os.sep+ "df_corrs_ni_nm_pty_yr.zip",compression='zip')
# df.to_pickle(dataset_folder+"Party Level"+os.sep+ "df.zip",compression='zip')
df_merged_small.to_pickle(dataset_folder+"Party Level"+os.sep+ "df_merged_pty_yr.zip",compression='zip')