In [11]:
import pandas as pd
import gzip

In [12]:
# Converting this table from wide to long is resource-intensive, so the conversion code below is
# commented out; a transformed copy of the table was saved locally and is used instead.


# def read_tbl():
#     df = pd.read_excel("../../../../data/NM/23-6-26/copy_of_ipra_request_po16267_fulldata__sheets-20231220144255.xlsx")
#     return df 

# # Select the columns needed for processing
# columns_to_keep = [
#     'Person Certification #', 'Person First Name', 'Person Middle Name',
#     'Person Last Name', 'Person Suffix', 'Person Gender', 'Person Date of Birth',
#     'Year of Birth', 'Person EEOC Category',
#     'Employment Start Date', 'Employment End Date', 'Employment Appointment Type*',
#     'Employment Employment Type*', 'Employment Title/Rank (Current)',
#     'Employment Status', 'Is Primary Employment', 'Employing Organization Name',
#     'Employing Organization Agency Type'
# ]

# # Start by filtering out the relevant columns
# filtered_df = df[columns_to_keep]

# # Identify the columns with the same employment data that might have a number suffix
# employment_columns = ['Employment Start Date', 'Employment End Date', 
#                       'Employment Appointment Type*', 'Employment Employment Type*', 
#                       'Employment Title/Rank (Current)', 'Employment Status', 
#                       'Is Primary Employment', 'Employing Organization Name', 
#                       'Employing Organization Agency Type']

# # Create an empty DataFrame to store the cleaned data
# cleaned_df = pd.DataFrame()

# # Iterate through the rows and extract each employment stint for each person
# for idx, row in df.iterrows():
#     person_info = row[['Person Certification #', 'Person First Name', 'Person Middle Name',
#                        'Person Last Name', 'Person Suffix', 'Person Gender', 
#                        'Person Date of Birth', 'Year of Birth', 'Person EEOC Category']]

#     for col_base in employment_columns:
#         if pd.notnull(row[col_base]):
#             employment_info = {col_base: row[col_base] for col_base in employment_columns}
#             combined_info = pd.concat([person_info, pd.Series(employment_info)], axis=0)
#             cleaned_df = pd.concat([cleaned_df, combined_info.to_frame().T], ignore_index=True)


In [13]:
def clean_dates(df):
    """Strip clock-time text from the ``start_date`` and ``end_date`` columns.

    Every ``<word>:<word>:<word>`` run (for example ``00:00:00``) is deleted
    from each value; whitespace next to the removed text is left as it is.
    Both columns are overwritten on ``df`` itself and that same frame is
    returned.
    """
    time_pattern = r"(\w+):(\w+):(\w+)"
    for date_col in ("start_date", "end_date"):
        without_time = df[date_col].str.replace(time_pattern, "", regex=True)
        df.loc[:, date_col] = without_time
    return df 

def clean_race(df):
    """Relabel ``Caucasian`` as ``white`` in the ``race`` column.

    The match is a literal, case-sensitive substring replacement. The column
    is overwritten on ``df`` itself and that same frame is returned.
    """
    relabelled = df["race"].str.replace("Caucasian", "white", regex=False)
    df.loc[:, "race"] = relabelled
    return df 

def clean_agency(df):
    """Normalize ``agency_name`` and drop rows for fire agencies.

    Names are lower-cased and stripped, then abbreviations are expanded
    (``dept.`` -> ``department``, ``&`` -> ``and``, standalone ``pd`` ->
    ``police department``, ``mex.`` -> ``mexico``) and runs of spaces are
    collapsed to a single space. Rows whose cleaned name contains ``fire``
    are removed; rows with a missing name are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``agency_name`` column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding the surviving rows (original index
        labels preserved) with the cleaned ``agency_name`` values.
    """
    names = (
        df["agency_name"]
        .str.lower()
        .str.strip()
        .str.replace(r"dept\.", "department", regex=True)
        .str.replace("&", "and", regex=False)
        .str.replace(r"\bpd\b", "police department", regex=True)
        # Collapse any run of spaces, not only exactly two spaces that sit
        # between two word characters.
        .str.replace(r" {2,}", " ", regex=True)
        .str.replace(r"mex\.", "mexico", regex=True)
    )

    # na=False keeps the mask strictly boolean, so `~` cannot fail on a
    # missing name, and a missing name is not treated as a fire agency.
    is_fire = names.str.contains("fire", na=False)

    # Explicit copy: later steps assign columns on the result, which would
    # otherwise trigger SettingWithCopyWarning on a filtered slice.
    result = df.loc[~is_fire].copy()
    result["agency_name"] = names.loc[~is_fire]
    return result

def read_tbl(path="../data/output/NM.csv"):
    """Read the saved NM table and return it with short, snake_case columns.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the locally saved table this notebook
        has always used, so calling ``read_tbl()`` with no argument behaves
        as before.

    Returns
    -------
    pandas.DataFrame
        The renamed columns listed in ``output_columns`` below, in that
        order, with every missing value replaced by an empty string. Any
        other column in the file is dropped.
    """
    # Original source workbook, kept for provenance:
    # df = pd.read_excel("../../../../data/NM/23-6-26/copy_of_ipra_request_po16267_fulldata__sheets-20231220144255.xlsx")
    column_names = {
        "Person Certification #": "person_nbr",
        "Person First Name": "first_name",
        "Person Middle Name": "middle_name",
        "Person Last Name": "last_name",
        "Person Suffix": "suffix",
        "Person Gender": "sex",
        "Person EEOC Category": "race",
        "Year of Birth": "year_of_birth",
        "Employment Start Date": "start_date",
        "Employment End Date": "end_date",
        "Employment Employment Type*": "employment_type",
        "Employment Status": "employment_status",
        "Employing Organization Name": "agency_name",
        "Employing Organization Agency Type": "agency_type",
        "Employment Title/Rank (Current)": "rank",
    }
    output_columns = [
        "person_nbr", "first_name", "middle_name", "last_name", "suffix", "sex",
        "year_of_birth", "race", "start_date", "end_date", "employment_type",
        "rank", "employment_status", "agency_name", "agency_type",
    ]

    df = pd.read_csv(path).rename(columns=column_names)
    # Blank out missing values so the later string cleaning never sees NaN.
    df = df.fillna("")
    return df[output_columns]

# Load the saved table, then clean the date text, race labels and agency names.
df = (
    read_tbl()
    .pipe(clean_dates)
    .pipe(clean_race)
    .pipe(clean_agency)
)




  df = pd.read_csv("../data/output/NM.csv")


In [14]:
def clean_special_characters(df):
    """Remove control characters from every object-dtype column.

    Each value is first converted with ``str()`` (so non-string values become
    their string form) and then rebuilt from the characters whose code point
    is at least 32, plus newlines. Columns are overwritten on ``df`` itself
    and that same frame is returned.
    """
    def _without_control_chars(value):
        text = str(value)
        return "".join(ch for ch in text if not (ord(ch) < 32 and ch != "\n"))

    object_columns = [name for name in df.columns if df[name].dtype == "object"]
    for name in object_columns:
        df[name] = df[name].apply(_without_control_chars)
    return df

def trim_whitespace(df):
    """Strip leading and trailing whitespace in every object-dtype column.

    Columns are overwritten on ``df`` itself and that same frame is returned.
    """
    for name, dtype in df.dtypes.items():
        if dtype == "object":
            stripped = df[name].str.strip()
            df[name] = stripped
    return df

def standardize_dates(df):
    """Rewrite ``start_date`` and ``end_date`` as ``YYYY-MM-DD`` strings.

    Values are parsed with ``pd.to_datetime(..., errors='coerce')``, so
    anything that cannot be parsed ends up missing. Columns are overwritten
    on ``df`` itself and that same frame is returned.
    """
    for name in ("start_date", "end_date"):
        parsed = pd.to_datetime(df[name], errors="coerce")
        df[name] = parsed.dt.strftime("%Y-%m-%d")
    return df

def handle_newlines(df):
    """Replace each newline with a single space in every object-dtype column.

    Columns are overwritten on ``df`` itself and that same frame is returned.
    """
    for name in df.columns:
        if df[name].dtype != "object":
            continue
        flattened = df[name].str.replace('\n', ' ')
        df[name] = flattened
    return df

def ensure_utf8(df):
    """Drop characters that cannot be encoded as UTF-8 from object columns.

    String values are encoded to UTF-8 with ``errors='ignore'`` and decoded
    back; non-string values pass through unchanged. Columns are overwritten
    on ``df`` itself and that same frame is returned.
    """
    def _utf8_roundtrip(value):
        if not isinstance(value, str):
            return value
        encoded = value.encode('utf-8', errors='ignore')
        return encoded.decode('utf-8')

    object_columns = [name for name in df.columns if df[name].dtype == "object"]
    for name in object_columns:
        df[name] = df[name].apply(_utf8_roundtrip)
    return df

def post_process(df):
    """Run the final text and date clean-up steps, in order, and return the result.

    Order: control-character removal, whitespace trimming, date
    standardization, newline replacement, UTF-8 round trip.
    """
    steps = (
        clean_special_characters,
        trim_whitespace,
        standardize_dates,
        handle_newlines,
        ensure_utf8,
    )
    for step in steps:
        df = step(df)
    return df

# Final clean-up, then keep one row per (person, agency) pair and order by person number.
df = (
    post_process(df)
    .drop_duplicates(subset=["person_nbr", "agency_name"])
    .sort_values("person_nbr")
)
df

# df.loc[:, "full_name"] = df.first_name.astype(str).fillna("") + " " + df.last_name.astype(str).fillna("") 

# # with gzip.open("../data/output/new mexico-processed.csv.gz", 'wt') as f:
# #     df.to_csv(f, index=False)

Unnamed: 0,person_nbr,first_name,middle_name,last_name,suffix,sex,year_of_birth,race,start_date,end_date,employment_type,rank,employment_status,agency_name,agency_type
23553,00-0001-P,Joshua,D,Anderson,,Male,1977,white,2000-05-22,2019-10-31,Full Time,Sergeant,Retired,bernalillo police department,Law Enforcement Agency
79508,00-0002-P,Phillip,B,Francisco,,Male,1976,Hispanic,1999-07-18,2001-06-23,,Patrol Officer,Resigned,aztec police department,Law Enforcement Agency
24064,00-0003-P,Angela,D,Gettler,,Female,1975,Native American,2001-11-26,2003-05-15,,Patrol Officer,Termination,bloomfield police department,Law Enforcement Agency
70240,00-0004-P,Robert,D,Gross,,Male,1970,white,1999-07-26,2016-08-31,,Deputy Sheriff,Retired,san juan county sheriff's office,Law Enforcement Agency
18582,00-0005-P,Bernie,J,Guffey,,Male,1970,white,1999-06-04,2001-10-31,,Patrol Officer,Termination,aztec police department,Law Enforcement Agency
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48407,99-0433-P,Gwendolyn,K,Smith,,Female,1977,Native American,1999-05-03,2010-05-21,,Patrol Officer,Resigned,navajo department of public safety,Law Enforcement Agency
47327,99-0434-P,Dale,E,West,,Male,1971,white,1999-08-08,,Full Time,Investigator,Active,navajo department of public safety,Law Enforcement Agency
48400,99-0435-P,Ronald,,Williams,,Male,1974,Native American,1999-07-19,2010-02-22,,Patrol Officer,Resigned,navajo department of public safety,Law Enforcement Agency
47482,99-0436-P,Gilbert,,Yazzie,,Male,1971,Native American,1999-08-04,,Full Time,Patrol Officer,Active,navajo department of public safety,Law Enforcement Agency


In [15]:
# df.to_csv("../data/output/new-mexico_index.csv", index=False)