In [3]:
import pandas as pd

file_path = 'C:\\Users\\Kameron\\Documents\\ESG Thesis\\Data\\Corporate Ideology\\dime.gz'

# Specify the data types for each column
dtype_specification = {
    "bonica.cid": "float64",  # Changed to float64 to handle NaN values
    "contributor.type": "object",
    "num.distinct": "float64",  # Changed to float64 to handle NaN values
    "most.recent.contributor.name": "object",
    "most.recent.contributor.address": "object",
    "most.recent.contributor.city": "object",
    "most.recent.contributor.zipcode": "float64",  # Changed to float64 to handle NaN values
    "most.recent.contributor.state": "object",
    "most.recent.contributor.latitude": "float64",
    "most.recent.contributor.longitude": "float64",
    "most.recent.contributor.occupation": "object",
    "most.recent.contributor.employer": "object",
    "most.recent.transaction.id": "object",
    "most.recent.transaction.date": "object",
    "contributor.gender": "object",
    "is.corp": "object",
    "contributor.cfscore": "float64",
    "is.projected": "int64",
    "first_cycle_active": "int64",
    "last_cycle_active": "int64",
}

# Specify the data types for all amount columns (1980-2022)
for year in range(1980, 2024, 2):
    dtype_specification[f"amount.{year}"] = "float64"  # Changed to float64 to handle NaN values

# Read the CSV file with the specified dtypes
data = pd.read_csv(file_path, 
                   compression='gzip', 
                   encoding='latin-1',
                   dtype=dtype_specification)


# Similarly, you can do this for other columns if you need them as integers without NaNs


In [4]:
unique_values = data['is.corp'].unique()

# Display the unique values
print(unique_values)

[nan 'corp' 'union']


In [13]:
# DataFrame where 'is.corp' is 'union'
corps = data[data['contributor.type'] == 'C']

# DataFrame where 'is.corp' is 'corp'
execs = data[data['contributor.type'] == 'I']

In [None]:
import pandas as pd

# Define the path to the CSV file
file_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\indivss.csv'

# Specify the data types for the columns
data_types = {
    "Cycle": str,
    "FECTransID": str,
    "ContribID": str,
    "Contrib": str,
    "RecipID": str,
    "Orgname": str,
    "UltOrg": str,
    "RealCode": str,
    "Date": str,
    "Amount": 'int64',
    "Street": str,
    "City": str,
    "State": str,
    "Zip": str,
    "RecipCode": str,
    "Type": str,
    "CmteID": str,
    "OtherID": str,
    "Gender": str,
    "Microfilm": str,
    "Occupation": str,
    "Employer": str,
    "Source": str
}

# Set the batch size for chunksize


In [1]:
import pandas as pd


# Define the path to your file
file_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\indivss.csv'

columns_to_use = ['Zip', 'City', 'State', 'Gender', 'Employer', 'Street', 'UltOrg', 'Orgname', 'Contrib']

df = pd.read_csv(
    file_path,
    usecols=columns_to_use,
    dtype={
        'Zip': str,
        'City': str,
        'State': str,
        'Gender': str,
        'Employer': str,
        'Street': str,
        'UltOrg': str,
        'Orgname': str,
        'Contrib': str
    }
)

# Convert column names to lowercase
df.columns = df.columns.str.lower()

# Apply lowercase conversion only to columns with string data (object dtype)
for column in df.select_dtypes(include='object').columns:
    df[column] = df[column].str.lower()
    
regex_pattern = r'([^,]+),\s*([^ ]+)\s+(\w)'

# Extract the components into new columns
df[['last', 'first', 'middle']] = df['contrib'].str.extract(regex_pattern, expand=True)

In [1]:
import polars as pl

# Define the path to your file
file_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\indivss.csv'

columns_to_use = ['Zip', 'City', 'State', 'Gender', 'Employer', 'Street', 'UltOrg', 'Orgname', 'Contrib']

# Load the data with the specific columns and datatypes
df = pl.read_csv(
    file_path,
    columns=columns_to_use,
    schema_overrides={
        'Zip': pl.Utf8,
        'City': pl.Utf8,
        'State': pl.Utf8,
        'Gender': pl.Utf8,
        'Employer': pl.Utf8,
        'UltOrg': pl.Utf8,
        'Orgname': pl.Utf8,
        'Contrib': pl.Utf8
    }
)

# Convert column names to lowercase
df = df.rename({col: col.lower() for col in df.columns})

# Apply lowercase conversion only to columns with string data
string_columns = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
df = df.with_columns(
    [pl.col(column).str.to_lowercase().alias(column) for column in string_columns]
)

# Regex pattern for capturing the last name
regex_last = r'([^,]+)'

# Extract last name and update DataFrame
df = df.with_columns([
    pl.col("contrib").str.extract(regex_last, group_index=1).alias("last")
])

# Regex pattern for capturing the middle initial
regex_middle = r'\b([A-Za-z])\b\s*$'

# Extract middle initial and update DataFrame
df = df.with_columns([
    pl.col("contrib").str.extract(regex_middle, group_index=1).alias("middle")
])

# Regex pattern for capturing the first name
regex_first = r',\s+([A-Za-z\s]+)'

# Extract first name and update DataFrame
df = df.with_columns([
    pl.col("contrib").str.extract(regex_first, group_index=1).alias("first")
])

df = df.with_columns(
    pl.col("first").str.replace(r"(\s+[A-Za-z]\.?)\s*$", "").alias("first")
)

In [2]:
# Concatenate first, middle, and last names into a new column 'full'
# Handling None values using `when` and `then` clauses
df = df.with_columns(
    pl.when(pl.col("middle").is_not_null())
    .then(pl.col("first") + pl.col("middle") + pl.col("last"))
    .otherwise(pl.col("first") + pl.col("last"))
    .alias("full")
)

# Function to remove all non-alphabetical characters including spaces
def remove_all_non_alpha(col):
    return col.str.replace("[^a-zA-Z]", "")

# Apply the function to remove non-alphabetical characters from all relevant columns
df = df.with_columns([
    remove_all_non_alpha(pl.col("first")).alias("first"),
    remove_all_non_alpha(pl.col("middle")).alias("middle"),
    remove_all_non_alpha(pl.col("last")).alias("last"),
    remove_all_non_alpha(pl.col("full")).alias("full"),
])

In [4]:
def remove_all_non_spaces(col):
    return col.str.replace("[^a-zA-Z0-9_]", "")

df = df.with_columns([
    remove_all_non_spaces(pl.col("employer")).alias("employer"),
    remove_all_non_spaces(pl.col("zip")).alias("zip"),
    remove_all_non_spaces(pl.col("city")).alias("city"),
    remove_all_non_spaces(pl.col("ultorg")).alias("ultorg"),
    remove_all_non_spaces(pl.col("orgname")).alias("orgname")])

In [5]:
# Add a unique_id column starting at 1
df = df.with_columns([
    pl.arange(1, df.height + 1).alias("unique_id")
])

In [6]:
df

contrib,orgname,ultorg,street,city,state,zip,gender,employer,last,middle,first,full,unique_id
str,str,str,str,str,str,str,str,str,str,str,str,str,i64
"""diener, michelle""","""hotelscom""",,,"""surfside""","""ca""","""33154""","""f""",,"""diener""",,"""michelle""","""michellediener""",1
"""freedman, joel""","""consultant""",,,"""southglastonbury""","""ct""","""06073""","""m""","""selfemployed""","""freedman""",,"""joel""","""joelfreedman""",2
"""golden, warren s""","""citywidemobile response""",,,"""mahopac""","""ny""","""10541""","""m""","""citywidemobile response""","""golden""","""s""","""warren""","""warrensgolden""",3
"""koch, larry j""","""deloitte& touche""","""deloittellp""",,"""wilton""","""ct""","""06897""","""m""","""deloitte& touche llp""","""koch""","""j""","""larry""","""larryjkoch""",4
"""larson, sherry""","""businessoptions""",,,"""montrose""","""co""","""81403""","""f""","""businessoptions""","""larson""",,"""sherry""","""sherrylarson""",5
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""stuart, charles""","""accorhotels""",,,"""sandiego""","""ca""","""92130""","""m""","""accorhotels""","""stuart""",,"""charles""","""charlesstuart""",145819550
"""stuart, david""","""food& nutrient impact llc""",,,"""hershey""","""pa""","""17033""","""m""","""food& nutrient impact llc""","""stuart""",,"""david""","""davidstuart""",145819551
"""stuart, janet""","""generalmotors""",,,"""grandrapids""","""mi""","""49506""","""f""","""generalmotors corp.""","""stuart""",,"""janet""","""janetstuart""",145819552
"""stuart, janet""","""generalmotors""",,,"""grandrapids""","""mi""","""49506""","""f""","""generalmotors corp.""","""stuart""",,"""janet""","""janetstuart""",145819553
