### Data Cleaning
1. The data retrieved from the CDC are split across multiple files, one per year, so the files need to be combined.
2. Once the files are combined, all unnecessary attributes will be removed.


## 1. Table Combine

In [12]:
import os
import pandas as pd

def combine_csv_files(
    input_folder,
    output_filename,
    output_folder=r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data",
):
    """Stack every CSV file found in a folder into a single CSV file.

    Parameters
    ----------
    input_folder : str
        Folder whose ``.csv`` files (extension matched case-insensitively)
        are read and concatenated row-wise.
    output_filename : str
        File name of the combined CSV, created inside ``output_folder``.
    output_folder : str, optional
        Destination folder. Defaults to the project's ``clean_data`` folder,
        which is where this function always wrote before the parameter
        existed. The folder is created when it does not exist yet.

    Raises
    ------
    ValueError
        If ``input_folder`` contains no CSV files.
    """
    output_file = os.path.join(output_folder, output_filename)

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined file reproducible from run to run.
    csv_names = sorted(
        name for name in os.listdir(input_folder) if name.lower().endswith(".csv")
    )
    if not csv_names:
        # pd.concat([]) would raise a ValueError that does not name the folder.
        raise ValueError(f"No CSV files found in {input_folder!r}; nothing to combine.")

    # low_memory=False parses each file in one pass so a column's dtype is
    # inferred from all of its values rather than chunk by chunk.
    # NOTE(review): the warning recorded under this cell points at the
    # read_csv line and is presumably the mixed-dtype warning - confirm.
    dataframes = [
        pd.read_csv(os.path.join(input_folder, name), low_memory=False)
        for name in csv_names
    ]

    # ignore_index=True renumbers rows 0..n-1 across the stacked files.
    combined_df = pd.concat(dataframes, ignore_index=True)

    os.makedirs(output_folder, exist_ok=True)
    combined_df.to_csv(output_file, index=False)

# One (source folder, combined file name) pair per CDC dataset.
input_folder1, output_filename1 = (
    r"C:\Users\laura\OneDrive\Documents\capstone-dooley\data\antibiotic usage\prescriptions",
    "antibiotic_usage.csv",
)
input_folder2, output_filename2 = (
    r"C:\Users\laura\OneDrive\Documents\capstone-dooley\data\resistance",
    "resistance.csv",
)
input_folder3, output_filename3 = (
    r"C:\Users\laura\OneDrive\Documents\capstone-dooley\data\antibiotic usage\saar",
    "saar.csv",
)

combine_jobs = [
    (input_folder1, output_filename1),
    (input_folder2, output_filename2),
    (input_folder3, output_filename3),
]

# Combine every dataset first ...
for source_dir, target_name in combine_jobs:
    combine_csv_files(source_dir, target_name)

# ... and only report once all three combinations have finished.
for source_dir, target_name in combine_jobs:
    print(f"All CSV files from {source_dir} have been combined and saved to clean_data as {target_name}.")

  df = pd.read_csv(file_path)


All CSV files from C:\Users\laura\OneDrive\Documents\capstone-dooley\data\antibiotic usage\prescriptions have been combined and saved to clean_data as antibiotic_usage.csv.
All CSV files from C:\Users\laura\OneDrive\Documents\capstone-dooley\data\resistance have been combined and saved to clean_data as resistance.csv.
All CSV files from C:\Users\laura\OneDrive\Documents\capstone-dooley\data\antibiotic usage\saar have been combined and saved to clean_data as saar.csv.


## Summary of the new files

In [None]:
def summarize_csv(file_path):
    """Print a short overview of the CSV file at ``file_path``.

    The report contains the row and column counts, the dtype of every
    column, and the table returned by ``DataFrame.describe()``.
    Nothing is returned.
    """
    table = pd.read_csv(file_path)

    # Gather every piece of the report before anything is printed.
    row_count = table.shape[0]
    col_count = table.shape[1]
    dtype_listing = table.dtypes
    describe_table = table.describe()

    report = (
        f"Number of rows: {row_count}",
        f"Number of columns: {col_count}",
        "\nColumn Information:",
        dtype_listing,
        "\nSummary Statistics for Numerical Columns:",
        describe_table,
    )
    for entry in report:
        print(entry)

# Combined CSV files to summarize; later cells read file_path2 and file_path3 again.
file_path, file_path2, file_path3 = (
    r'C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\antibiotic_usage.csv',
    r'C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\resistance.csv',
    r'C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\saar.csv',
)

# Summarize the three files in the same order as before.
for combined_csv in (file_path, file_path2, file_path3):
    summarize_csv(combined_csv)

Number of rows: 306
Number of columns: 4

Column Information:
ID State                              int64
State                                object
Year                                  int64
Prescriptions per 1,000 Enrollees     int64
dtype: object
Empty values in each column:
ID State                             0
State                                0
Year                                 0
Prescriptions per 1,000 Enrollees    0
dtype: int64
Rows with empty values:
Empty DataFrame
Columns: [ID State, State, Year, Prescriptions per 1,000 Enrollees]
Index: []

Summary Statistics for Numerical Columns:
         ID State         Year  Prescriptions per 1,000 Enrollees
count  306.000000   306.000000                         306.000000
mean    28.960784  2019.500000                         933.261438
std     15.702514     1.710623                         396.925195
min      1.000000  2017.000000                         348.000000
25%     16.000000  2018.000000                         621.

## Data adjustments
1. saar.csv - needs its data type switched, zeros removed, and the SAAR value (observed days / predicted days) added in a new column.


In [25]:
# Load the combined SAAR data (file_path3 is defined in the summary cell).
df = pd.read_csv(file_path3)

# A predicted value of 0 cannot be used as a denominator. Blank those entries
# out with NaN through a plain column assignment: Series.mask keeps the column
# numeric, and assigning the result back to df works regardless of pandas'
# copy-on-write behaviour (an in-place replace on the selected column may act
# on a temporary copy and leave df unchanged).
zero_predicted = df['Predicted Antimicrobial Days'] == 0
if zero_predicted.any():
    print("Warning: Found zero values in 'Predicted Antimicrobial Days'. These will be replaced with NaN.")
    df['Predicted Antimicrobial Days'] = df['Predicted Antimicrobial Days'].mask(zero_predicted)

# SAAR = observed days / predicted days, rounded to 4 decimal places.
# Rows whose predicted value was 0 get NaN; the column stays floating point.
df['SAAR'] = (df['Observed Antimicrobial Days'] / df['Predicted Antimicrobial Days']).round(4)

output_file_path = r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_saar.csv"
df.to_csv(output_file_path, index=False)

# The message previously claimed an int64 conversion that never happens.
print(f"SAAR column added, rounded to 4 decimal places, and saved to {output_file_path}")



SAAR column added, converted to int64, and saved to C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_saar.csv


2. resistance.csv - The `agecat` column contains the values all/adult/peds. The "All" rows need to be removed because they are the combination of adult and peds.

In [24]:
# Start from the combined resistance data (file_path2 is defined in the summary cell).
df = pd.read_csv(file_path2)

# Keep every row whose 'agecat' is anything other than "All".
not_all_ages = df['agecat'].ne("All")
df = df.loc[not_all_ages]

# Write the filtered rows to a separate file so resistance.csv stays untouched.
output_file_path = r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_resistance.csv"
df.to_csv(output_file_path, index=False)

print(f"Rows with 'All' in the 'agecat' column removed and saved to {output_file_path}")

Rows with 'All' in the 'agecat' column removed and saved to C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_resistance.csv


3. Combine all 3 files. saar.csv and resistance.csv both have agecat, year, and state, so they are merged on those columns first. The result is then merged with antibiotic_usage.csv on year and state. The updated versions of the CSV files are used.

In [34]:

# Load the cleaned inputs: filtered resistance data, SAAR data with the new
# SAAR column, and the combined antibiotic usage data.
file1 = pd.read_csv(r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_resistance.csv")
file2 = pd.read_csv(r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\updated_saar.csv")
file3 = pd.read_csv(r"C:\Users\laura\OneDrive\Documents\capstone-dooley\clean_data\antibiotic_usage.csv")

# Merge resistance and SAAR on the three columns they share. how="outer"
# keeps keys that appear in only one of the two files.
# NOTE(review): if neither file is unique per (agecat, year, state), every
# matching pair of rows is emitted - confirm the resulting row count is intended.
merged_df = pd.merge(file1, file2, on=["agecat", "year", "state"], how="outer")

# Attach antibiotic usage, which is merged on 'year' and 'state' only.
# NOTE(review): the summary recorded earlier for antibiotic_usage.csv lists the
# columns as 'State' and 'Year'; confirm the file on disk uses the lowercase
# names this merge expects.
final_df = pd.merge(merged_df, file3, on=["year", "state"], how="outer")

# Write and summarize through one explicit path. Previously the file was
# written relative to the current working directory but read back from the
# project folder, which only matched when the notebook ran from that folder.
combined_file_path = r"C:\Users\laura\OneDrive\Documents\capstone-dooley\combined_file.csv"
final_df.to_csv(combined_file_path, index=False)

summarize_csv(combined_file_path)

Number of rows: 121260
Number of columns: 18

Column Information:
phenotype                             object
state                                 object
eventtype                             object
year                                   int64
agecat                                object
numTested                             object
numNonSuscep                          object
pctNonSuscep                          object
Suppress                              object
displayTested                         object
Agent                                 object
ID State_x                           float64
Observed Antimicrobial Days          float64
Predicted Antimicrobial Days         float64
Days Present                         float64
SAAR                                 float64
ID State_y                           float64
Prescriptions per 1,000 Enrollees    float64
dtype: object
Empty values in each column:
phenotype                               620
state                                

  df = pd.read_csv(file_path)
