# Main dataset preprocessing and merging with supplementary data

In [1]:
import numpy
import pandas
import os
from os import path
import gender_guesser.detector as gender

### load up donation dataset

In [None]:
#data source: https://search.electoralcommission.org.uk/Search/Donations?currentPage=1&rows=10&sort=AcceptedDate&order=desc&tab=1&et=pp&et=ppm&et=tp&et=perpar&et=rd&isIrishSourceYes=true&isIrishSourceNo=true&prePoll=false&postPoll=true&register=gb&register=ni&register=none&optCols=Register&optCols=CampaigningName&optCols=AccountingUnitsAsCentralParty&optCols=IsSponsorship&optCols=IsIrishSource&optCols=RegulatedDoneeType&optCols=CompanyRegistrationNumber&optCols=Postcode&optCols=NatureOfDonation&optCols=PurposeOfVisit&optCols=DonationAction&optCols=ReportedDate&optCols=IsReportedPrePoll&optCols=ReportingPeriodName&optCols=IsBequest&optCols=IsAggregation
#filename is name of file in local storage, change to own path!
df = pandas.read_csv("Data/results.csv")
df.info()

In [3]:
print(df["RegulatedEntityType"].unique())
print()
print(df.columns)
print(len(df.columns))

['Regulated Donee' 'Third Party' 'Political Party' 'Permitted Participant']

Index(['ECRef', 'RegulatedEntityName', 'RegulatedEntityType', 'Value',
       'AcceptedDate', 'AccountingUnitName', 'DonorName',
       'AccountingUnitsAsCentralParty', 'IsSponsorship', 'DonorStatus',
       'RegulatedDoneeType', 'CompanyRegistrationNumber', 'Postcode',
       'DonationType', 'NatureOfDonation', 'PurposeOfVisit', 'DonationAction',
       'ReceivedDate', 'ReportedDate', 'IsReportedPrePoll',
       'ReportingPeriodName', 'IsBequest', 'IsAggregation',
       'RegulatedEntityId', 'AccountingUnitId', 'DonorId', 'CampaigningName',
       'RegisterName', 'IsIrishSource'],
      dtype='object')
29


In [4]:
df.head(15)

Unnamed: 0,ECRef,RegulatedEntityName,RegulatedEntityType,Value,AcceptedDate,AccountingUnitName,DonorName,AccountingUnitsAsCentralParty,IsSponsorship,DonorStatus,...,IsReportedPrePoll,ReportingPeriodName,IsBequest,IsAggregation,RegulatedEntityId,AccountingUnitId,DonorId,CampaigningName,RegisterName,IsIrishSource
0,C0583991,Labour Campaign For Electoral Reform,Regulated Donee,"£20,000.00",08/07/2024,,Joseph Rowntree Reform Trust,False,False,Company,...,,August 2024,False,False,17630,,99201.0,,Great Britain,False
1,C0579011,Labour Together,Regulated Donee,"£175,500.00",08/07/2024,,Lord David Sainsbury,False,False,Individual,...,,July 2024,False,False,3912,,99815.0,,Great Britain,False
2,C0579009,Labour Together,Regulated Donee,"£12,500.00",24/06/2024,,Mr Trevor Chinn,False,False,Individual,...,,July 2024,False,False,3912,,83243.0,,Great Britain,False
3,C0579010,Labour Together,Regulated Donee,"£325,000.00",21/06/2024,,Martin Taylor,False,False,Individual,...,,July 2024,False,False,3912,,72372.0,,Great Britain,False
4,C0579007,The Spring Lunch,Regulated Donee,"£6,000.00",21/06/2024,,FT Estates Ltd,False,False,Company,...,,July 2024,False,True,1621,,100697.0,,Great Britain,False
5,C0578696,Wes Streeting,Regulated Donee,"£1,500.00",01/06/2024,,Sir Trevor Chinn,False,False,Individual,...,,June 2024,False,True,6346,,93638.0,,Great Britain,False
6,C0578678,The Rt Hon Dr Liam Fox MP,Regulated Donee,"£10,000.00",28/05/2024,,Partners & Limited,False,False,Company,...,,June 2024,False,False,1035,,95230.0,,Great Britain,False
7,NC0578606,Andy Burnham,Regulated Donee,"£26,750.00",24/05/2024,,B & Combes LLP,False,False,Limited Liability Partnership,...,,June 2024,False,False,1469,,100595.0,,Great Britain,False
8,C0579008,Northern Campaign Group,Regulated Donee,"£60,000.00",24/05/2024,,Cohiba Support Services Ltd,False,False,Company,...,,July 2024,False,False,17447,,100698.0,,Great Britain,False
9,C0578682,Simon Clarke,Regulated Donee,"£5,000.00",24/05/2024,,John Hall,False,False,Individual,...,,June 2024,False,False,6668,,98913.0,,Great Britain,False


In [6]:
#print(df.loc[df["DonorStatus"] == "Individual"].head(15))

### PREP

In [7]:
wip_df = df.copy()
wip_df["Value"] = wip_df["Value"].apply(lambda x: x.replace("£", ""))
wip_df["Value"] = wip_df["Value"].apply(lambda x: x.replace(",", ""))
wip_df["Value"] = wip_df["Value"].astype('float')
print(wip_df.head(5))

# wip_df["gender"] = ""

#split off data for individuals
individuals_df = df.loc[df["DonorStatus"] == "Individual"].copy()
print("unprocessed: ")
print(individuals_df["DonorName"].head())



titles_list = ["Master", "Mr", "Sir", "Gentleman", "Sire", "Lord", "Esq", "Miss", "Mrs", "Ms", "Mistress", "Madam", "Ma'am", "Dame", "Lady", "Mx", "Excellency", "Dr", "Professor"]

for i in titles_list:
    # individuals_df["DonorName"] = individuals_df["DonorName"].replace(i, "")
    individuals_df["DonorName"] = individuals_df["DonorName"].apply(lambda x: str(x).replace(i, ""))

individuals_df["FirstNames"] = individuals_df["DonorName"].apply(lambda x: str(x).split()[0]) 

d = gender.Detector()

# Gender guesser setup adapted from Rahul Lalchandani's Genderizer code, source: https://medium.com/analytics-vidhya/how-to-find-gender-from-a-dataframe-of-names-43ec98377290
# Gender guesser source: https://pypi.org/project/gender-guesser/
def gender_guess(name):
    #print(name)
    return d.get_gender(u"{x}".format(x=name), u'great_britain')

# df['gender'] = individuals_df["FirstNames"].apply(lambda x: gender_guess(x))
individuals_df['gender'] = individuals_df["FirstNames"].apply(lambda x: gender_guess(x))

individuals_merge = individuals_df["gender"]

# wip_df = wip_df.merge(individuals_merge, how='left', on="DonorId")
wip_df = wip_df.join(individuals_merge)

print(wip_df.head(61))
print(individuals_merge.head(25))

      ECRef                   RegulatedEntityName RegulatedEntityType  \
0  C0583991  Labour Campaign For Electoral Reform     Regulated Donee   
1  C0579011                       Labour Together     Regulated Donee   
2  C0579009                       Labour Together     Regulated Donee   
3  C0579010                       Labour Together     Regulated Donee   
4  C0579007                      The Spring Lunch     Regulated Donee   

      Value AcceptedDate AccountingUnitName                     DonorName  \
0   20000.0   08/07/2024                NaN  Joseph Rowntree Reform Trust   
1  175500.0   08/07/2024                NaN          Lord David Sainsbury   
2   12500.0   24/06/2024                NaN               Mr Trevor Chinn   
3  325000.0   21/06/2024                NaN                 Martin Taylor   
4    6000.0   21/06/2024                NaN                FT Estates Ltd   

   AccountingUnitsAsCentralParty  IsSponsorship DonorStatus  ...  \
0                          Fal

In [7]:
# print(sum(value_wip["Value"] >=1000000))
# print()
# print(sum((value_wip["Value"] >=100000) & (value_wip["Value"] <=1000000)))
# print()
# print(sum((value_wip["Value"] >=10000) & (value_wip["Value"] <=100000)))
# print()
# print(sum((value_wip["Value"] >=5000) & (value_wip["Value"] <=10000)))
# print()
# print(sum((value_wip["Value"] >=2500) & (value_wip["Value"] <=5000)))
# print()
# print(sum((value_wip["Value"] >=1000) & (value_wip["Value"] <=2500)))
# print()
# print(sum((value_wip["Value"] >=0) & (value_wip["Value"] <=1000)))

In [9]:
tmp_df = wip_df.copy()
tmp_df["val_split"] = ""

def step_id(x, steps):
    idx_int = 0
    idx_int2 = 1

    for i in steps:
        if idx_int == len(steps)-1:
            x = ">{0}".format(steps[idx_int])
            break
        else: 
            if (x >= steps[idx_int]) and (x < steps[idx_int2]):
                x = "{0}-{1}".format(steps[idx_int], steps[idx_int2])
                break
            else:
                idx_int +=1
                idx_int2 +=1
                continue
    return x


steps = [0, 1000, 2500, 5000, 10000, 100000, 1000000]

tmp_df["val_split"] = tmp_df["Value"].apply(lambda x: step_id(x, steps))

print(tmp_df[["Value", "val_split"]].head(10))

wip_df["val_split"] = tmp_df["val_split"].copy()

print()
print(wip_df.head(10))

      Value       val_split
0   20000.0    10000-100000
1  175500.0  100000-1000000
2   12500.0    10000-100000
3  325000.0  100000-1000000
4    6000.0      5000-10000
5    1500.0       1000-2500
6   10000.0    10000-100000
7   26750.0    10000-100000
8   60000.0    10000-100000
9    5000.0      5000-10000

       ECRef                   RegulatedEntityName RegulatedEntityType  \
0   C0583991  Labour Campaign For Electoral Reform     Regulated Donee   
1   C0579011                       Labour Together     Regulated Donee   
2   C0579009                       Labour Together     Regulated Donee   
3   C0579010                       Labour Together     Regulated Donee   
4   C0579007                      The Spring Lunch     Regulated Donee   
5   C0578696                         Wes Streeting     Regulated Donee   
6   C0578678             The Rt Hon Dr Liam Fox MP     Regulated Donee   
7  NC0578606                          Andy Burnham     Regulated Donee   
8   C0579008             

In [10]:
# individual_rdy = individuals_df.iloc[:, [1, 2, 3, 29]]
# print(individual_rdy.info())
# print(individual_rdy.head())

## Create dataset for organizations

In [11]:
# Load dataset created with "supp_data_explore_V2.ipynb"
preprocessed_dir_path = os.path.join(os.getcwd(), "Preprocessed_Data")
dataset_path = os.path.join(preprocessed_dir_path, "supplementary_materials_2.csv")

In [12]:
# organizations_df = wip_df.loc[df["DonorStatus"] != "Individual"].copy()
organizations_df = wip_df.loc[wip_df["DonorStatus"] != "Individual"].copy()
# print(organizations_df.head(25))
# print(organizations_df.info())
# print()

df_orgs_merge = organizations_df.iloc[:, [3, 9, 12]]
print(df_orgs_merge.head(10))
print()

supp_df = pandas.read_csv(dataset_path)
supp_df = supp_df.rename(columns={"pcds": "Postcode"})
print(supp_df.head(10))
print(supp_df.info())


      Value                    DonorStatus  Postcode
0   20000.0                        Company  YO30 6WQ
4    6000.0                        Company   E15 2JA
6   10000.0                        Company  HP13 6NU
7   26750.0  Limited Liability Partnership   SK9 5EQ
8   60000.0                        Company  BB12 7TW
10   5000.0                        Company  SW1A 1PJ
14  20000.0     Unincorporated Association  OL13 9AA
15   3200.0                        Company    M3 4AP
16  16000.0                        Company   M40 5BJ
19   6179.0                          Other      0000

   Unnamed: 0     pcd7      pcd8 Postcode  Area code      ladnm  Average_age  \
0           0  AL2 1BB  AL2  1BB  AL2 1BB  E07000098  Hertsmere    40.680154   
1           1  AL2 1BT  AL2  1BT  AL2 1BT  E07000098  Hertsmere    40.680154   
2           2  AL2 1BU  AL2  1BU  AL2 1BU  E07000098  Hertsmere    40.680154   
3           3  AL2 1BX  AL2  1BX  AL2 1BX  E07000098  Hertsmere    40.680154   
4           4  A

In [13]:
# merged_df = df_orgs_merge.merge(supp_df, how='inner', on="Postcode")
print("left df shape: ", df_orgs_merge.shape)
print("right df shape: ", supp_df.shape)
print()

merged_df = df_orgs_merge.reset_index().merge(supp_df, how='left', on="Postcode").set_index('index')
print(merged_df.info())
print(merged_df.head())
print("merged df shape: ", merged_df.shape)
print()

localized_prep_df = merged_df.copy()
localized_prep_df = localized_prep_df.drop(["Postcode", "pcd7", "pcd8", "Area code", "Unnamed: 0"], axis=1)
print()
print(localized_prep_df.info())
# print(localized_prep_df.head())
print("localized_prep_df shape: ", localized_prep_df.shape)




# export_df = merged_df.copy()
# export_df = export_df.drop(["Postcode", "pcd7", "pcd8", "Area code", "Unnamed: 0"], axis=1)
# print()
# print(export_df.info())
# print(export_df.head())
# print("export df shape: ", export_df.shape)

left df shape:  (46612, 3)
right df shape:  (2165695, 28)

<class 'pandas.core.frame.DataFrame'>
Index: 46612 entries, 0 to 84468
Data columns (total 30 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   Value                                                                             46612 non-null  float64
 1   DonorStatus                                                                       46514 non-null  object 
 2   Postcode                                                                          45585 non-null  object 
 3   Unnamed: 0                                                                        38064 non-null  float64
 4   pcd7                                                                              38064 non-null  object 
 5   pcd8                                                   

In [35]:
tmp_join_df = localized_prep_df.copy()
tmp_join_df = tmp_join_df.drop(['Value', 'DonorStatus'], axis=1)
tmp_wip_df = wip_df.copy()

pre_export_df = tmp_wip_df.join(tmp_join_df)

print(pre_export_df.head(61))
print("_________________________________________")
print()
print(pre_export_df.info())
print("_________________________________________")
print()
to_drop = pre_export_df.copy()
to_drop_rows = to_drop.loc[(to_drop["DonorStatus"] != "Individual") & (to_drop["Average_age"].isna())]
print(to_drop_rows.index)
print(type(to_drop_rows.index))
to_drop_rows_idx = to_drop_rows.index
print("shape of rows to drop: ", to_drop_rows.shape)
print("dropping from total of {0} non-individual rows".format(sum(to_drop["DonorStatus"] != "Individual")))
print("_________________________________________")
print()
export_df = pre_export_df.copy()
export_df = export_df.drop(to_drop_rows_idx)
print("leaving total of {0} non-individual rows".format(sum(export_df["DonorStatus"] != "Individual")))
print("_________________________________________")
print()
print(export_df.head(61))
print("_________________________________________")
print()
export_df = export_df.reset_index(drop=True)
print(export_df.head(61))

       ECRef                   RegulatedEntityName RegulatedEntityType  \
0   C0583991  Labour Campaign For Electoral Reform     Regulated Donee   
1   C0579011                       Labour Together     Regulated Donee   
2   C0579009                       Labour Together     Regulated Donee   
3   C0579010                       Labour Together     Regulated Donee   
4   C0579007                      The Spring Lunch     Regulated Donee   
..       ...                                   ...                 ...   
56  V0578661            The Rt Hon Elizabeth Truss     Regulated Donee   
57  C0578652                        Mr Bim Afolami     Regulated Donee   
58  C0576264                           Alan Mak MP     Regulated Donee   
59  C0576322             The Rt Hon Dr Liam Fox MP     Regulated Donee   
60  C0576335                         Labour To Win     Regulated Donee   

       Value AcceptedDate AccountingUnitName  \
0    20000.0   08/07/2024                NaN   
1   175500.0   

In [31]:
preprocessed_dir_path = os.path.join(os.getcwd(), "Preprocessed_Data")
if os.path.exists(preprocessed_dir_path) != True:
    os.mkdir(preprocessed_dir_path)

out_path = os.path.join(preprocessed_dir_path, "cat_localized_gendered_df_2.csv")

export_df.to_csv(out_path)

In [44]:
print(export_df["RegulatedDoneeType"].unique())

['Members Association' 'MP - Member of Parliament' 'Mayor' 'Senedd Member'
 nan 'MSP - Member of the Scottish Parliament' 'Leadership Candidate'
 'Member of Registered Political Party' 'Police and Crime Commissioner'
 'MLA - Member of the Legislative Authority of Northern Ireland' 'Other'
 'Candidate' 'Cllr. - Member of a Local Authority'
 'GLA - Assembly Member (Greater London)'
 'AM - Member of the National Assembly for Wales'
 'MEP - Member of the European Parliament']


In [46]:
print(export_df["gender"].unique())
print(sum(export_df["gender"].isna()))
export_df["gender"] = export_df["gender"].fillna("not_a_person")
print(export_df["gender"].head(5))
print("_________________________________________")
print()
cols_trim = ["ECRef", "RegulatedEntityName", "AcceptedDate", "AccountingUnitName", "AccountingUnitsAsCentralParty", "IsSponsorship", "RegulatedDoneeType", "CompanyRegistrationNumber", "Postcode", "NatureOfDonation", "PurposeOfVisit", "DonationAction", "ReceivedDate", "ReportedDate", "IsReportedPrePoll", "ReportingPeriodName", "IsBequest", "RegulatedEntityId", "AccountingUnitId", "DonorId", "CampaigningName", "RegisterName", "IsIrishSource"]
trim_df = export_df.copy().drop(cols_trim, axis=1)
print("_________________________________________")
print()
print(trim_df.info())
print("_________________________________________")
print()
print(trim_df.head(15))

['not_a_person' 'male' 'mostly_male' 'female' 'andy' 'unknown'
 'mostly_female']
0
0    not_a_person
1            male
2            male
3            male
4    not_a_person
Name: gender, dtype: object
_________________________________________

_________________________________________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75922 entries, 0 to 75921
Data columns (total 31 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   RegulatedEntityType                                                               75922 non-null  object 
 1   Value                                                                             75922 non-null  float64
 2   DonorName                                                                         75918 non-null  object 
 3   DonorStatus                                 

In [67]:
print(trim_df[trim_df["DonorName"].isnull()].index)
print("_________________________________________")
print()
trim_df_clean = trim_df.copy()
last_drop = []
for i in trim_df[trim_df["DonorName"].isnull()].index:
    last_drop.append(i)
print(last_drop)
print("_________________________________________")
print()
trim_df_clean = trim_df_clean.drop(last_drop)
trim_df_clean = trim_df_clean.reset_index()
print(trim_df_clean.info())
print("_________________________________________")
print()
print(trim_df_clean.head(15))

Index([74941, 75219, 75418, 75439], dtype='int64')
_________________________________________

[74941, 75219, 75418, 75439]
_________________________________________

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75918 entries, 0 to 75917
Data columns (total 32 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   index                                                                             75918 non-null  int64  
 1   RegulatedEntityType                                                               75918 non-null  object 
 2   Value                                                                             75918 non-null  float64
 3   DonorName                                                                         75918 non-null  object 
 4   DonorStatus                                           

In [None]:
preprocessed_dir_path = os.path.join(os.getcwd(), "Preprocessed_Data")
if os.path.exists(preprocessed_dir_path) != True:
    os.mkdir(preprocessed_dir_path)

out_path = os.path.join(preprocessed_dir_path, "for_cat_analysis_1_DO_NOT_USE.csv")

trim_df_clean.to_csv(out_path)

In [69]:
#code taken and adapted from: https://www.geeksforgeeks.org/ml-one-hot-encoding/ 

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

df_to_enc = trim_df_clean.copy()
print("shape original: ", df_to_enc.shape)
print("_________________________________________")
print()
columns_to_transform = ["RegulatedEntityType", "DonorStatus", "DonationType", "IsAggregation", "gender"]
one_hot_encoded = encoder.fit_transform(df_to_enc[columns_to_transform])

one_hot_df = pandas.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(columns_to_transform))
print("shape encoded cols: ", one_hot_df.shape)
print("-----------------------------------------")
print()
print(f"head of encoded cols, \n{one_hot_df}")
print()
print("_________________________________________")
print()
# df_encoded = pandas.concat([df_to_enc, one_hot_df], axis=1)
df_encoded = df_to_enc.join(one_hot_df)

df_encoded = df_encoded.drop(columns_to_transform, axis=1)

print(df_encoded.info())
print("_________________________________________")
print()
print(f"Encoded data : \n{df_encoded}")

shape original:  (75918, 32)
_________________________________________

shape encoded cols:  (75918, 30)
-----------------------------------------

head of encoded cols, 
       RegulatedEntityType_Permitted Participant  \
0                                            0.0   
1                                            0.0   
2                                            0.0   
3                                            0.0   
4                                            0.0   
...                                          ...   
75913                                        0.0   
75914                                        1.0   
75915                                        0.0   
75916                                        1.0   
75917                                        1.0   

       RegulatedEntityType_Political Party  \
0                                      0.0   
1                                      0.0   
2                                      0.0   
3                   

In [71]:
preprocessed_dir_path = os.path.join(os.getcwd(), "Preprocessed_Data")
if os.path.exists(preprocessed_dir_path) != True:
    os.mkdir(preprocessed_dir_path)

out_path = os.path.join(preprocessed_dir_path, "encoded_2_for_ML.csv")

df_encoded.to_csv(out_path)