In [1]:
import pandas as pd
import requests
from io import StringIO
import seaborn as sns
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time




def load_data(api_url, timeout=60):
    """Download a CSV document and parse it into a DataFrame.

    Parameters
    ----------
    api_url : str
        URL that serves the CSV payload.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        server would block this call forever.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table when the server answers with HTTP 200; otherwise
        the status code is printed and ``None`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        Connection failures and timeouts are not caught here.
    """
    response = requests.get(api_url, timeout=timeout)

    if response.status_code != 200:
        print("Error:", response.status_code)
        return None

    # Decode the raw bytes explicitly as UTF-8 and hand pandas a text buffer
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(StringIO(csv_text))
def preprocess_data(df):
    """Keep only flood incidents and clean up the date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It must contain the columns ``incidentType``,
        ``lastIAFilingDate``, ``disasterCloseoutDate``,
        ``incidentBeginDate``, ``incidentEndDate`` and ``declarationDate``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``incidentType`` is
        ``'Flood'``, without the two dropped columns, with the three date
        columns converted to datetimes and gaps in ``incidentEndDate``
        forward-filled. The input frame is left unmodified.
    """
    # Filter the dataset for "Flood" incident type
    flood_rows = df[df['incidentType'] == 'Flood']

    # Drop the two columns the original author flagged as mostly null.
    # drop() returns a new frame, so the assignments below never write
    # into the caller's data.
    df = flood_rows.drop(['lastIAFilingDate', 'disasterCloseoutDate'], axis=1)

    # Convert dates to datetime format
    for date_column in ('incidentBeginDate', 'incidentEndDate', 'declarationDate'):
        df[date_column] = pd.to_datetime(df[date_column])

    # Forward fill missing end dates (each gap takes the value of the
    # closest preceding row). The result is assigned back instead of calling
    # fillna(method='ffill', inplace=True) on the selected column: that
    # chained in-place form is deprecated and does not reliably update the
    # frame on newer pandas versions.
    df['incidentEndDate'] = df['incidentEndDate'].ffill()

    return df


def derive_new_features(df):
    """Add three derived columns and return the rows ordered by declaration date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with datetime columns ``incidentBeginDate``,
        ``incidentEndDate`` and ``declarationDate``, a ``state`` column and
        the four ``*ProgramDeclared`` flag columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``declarationDate`` with these extra columns:

        * ``Duration_of_Incident`` - whole days between begin and end date.
        * ``Declared_Programs_Count`` - row-wise sum of the four program flags.
        * ``Time Since Last Disaster`` - whole days since the previous
          declaration in the same state, 0 for a state's first row.

        The input frame is not modified.
    """
    program_flags = ['ihProgramDeclared', 'iaProgramDeclared',
                     'paProgramDeclared', 'hmProgramDeclared']

    # assign() builds a new frame, so the caller's frame does not end up
    # with only some of the derived columns written into it.
    df = df.assign(
        Duration_of_Incident=(df['incidentEndDate'] - df['incidentBeginDate']).dt.days,
        Declared_Programs_Count=df[program_flags].sum(axis=1),
    )

    # Sort by declarationDate only; the per-state grouping below then sees
    # each state's rows in chronological order.
    df = df.sort_values(by=['declarationDate'])

    # Time since the previous declaration within each state. diff() yields
    # NaT for the first row of every state, which becomes 0 after fillna.
    df['Time Since Last Disaster'] = (
        df.groupby('state')['declarationDate'].diff().dt.days.fillna(0)
    )

    return df

def standardization(df):
    """Standardise a fixed list of columns with ``StandardScaler``.

    The scaler is fitted on the frame it receives and the scaled values are
    written back into the same columns of that frame, which is then
    returned (the input is modified in place).
    """
    numerical_columns = ['disasterNumber', 'fyDeclared', 'ihProgramDeclared', 'iaProgramDeclared', 'paProgramDeclared',
                         'hmProgramDeclared', 'tribalRequest', 'fipsStateCode', 'fipsCountyCode', 'placeCode', 'declarationRequestNumber']

    # Fit and transform in one step, then overwrite the original columns
    scaled_values = StandardScaler().fit_transform(df[numerical_columns])
    df[numerical_columns] = scaled_values

    return df



def perform_train_test_split(df, target_column, test_size=0.3, random_state=42):
    """Separate the target column from the features and split both.

    Returns the tuple ``(X_train, X_test, y_train, y_test)`` produced by
    ``train_test_split`` with the given ``test_size`` and ``random_state``.
    """
    # Target first, then everything else as the feature matrix
    labels = df[target_column]
    features = df.drop(target_column, axis=1)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    return features_train, features_test, labels_train, labels_test


def data_processing_pipeline(api_url, target_column,
                             processed_path='disaster_declaration_processed.csv'):
    """Run download, preprocessing, feature derivation, scaling and splitting.

    Parameters
    ----------
    api_url : str
        URL handed to ``load_data``.
    target_column : str
        Column separated out as the target by ``perform_train_test_split``.
    processed_path : str, optional
        Where the fully processed frame is written as CSV before splitting.
        Defaults to the file name the pipeline has always used.

    Returns
    -------
    tuple or None
        ``(X_train, X_test, y_train, y_test)`` on success. ``None`` when
        ``load_data`` returned ``None``; callers must check for this before
        unpacking the result.
    """
    # perf_counter() is a monotonic clock, so the measured duration is not
    # distorted if the system wall clock is adjusted mid-run.
    start_time = time.perf_counter()

    df = load_data(api_url)
    if df is None:
        print("Error occurred while loading the data.")
        return None

    # perform all the steps
    df = preprocess_data(df)
    df = derive_new_features(df)
    df = standardization(df)
    df.to_csv(processed_path, index=False)

    X_train, X_test, y_train, y_test = perform_train_test_split(df, target_column)

    # Calculate the total time taken for the pipeline
    total_time = time.perf_counter() - start_time
    print("Data processing pipeline completed. Total time taken:", total_time, "seconds")

    return X_train, X_test, y_train, y_test

        
# Define the API URL for data retrieval
api_url = "https://www.fema.gov/api/open/v2/DisasterDeclarationsSummaries.csv"

# NOTE(review): preprocess_data keeps only rows whose incidentType is 'Flood',
# so this target holds a single value in every row after preprocessing.
# Confirm that this is the intended target column.
target_column = 'incidentType'

# Call the data_processing_pipeline function. It returns None when the
# download fails, so check before unpacking; otherwise the failure would
# surface as an opaque "cannot unpack non-iterable NoneType" TypeError.
pipeline_result = data_processing_pipeline(api_url, target_column)
if pipeline_result is None:
    raise RuntimeError("Data processing pipeline failed: no data could be loaded from " + api_url)
X_train, X_test, y_train, y_test = pipeline_result

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

# Save all datasets as CSV files
X_train.to_csv('disaster_X_train.csv', index=False)
X_test.to_csv('disaster_X_test.csv', index=False)
y_train.to_csv('disaster_y_train.csv', index=False)
y_test.to_csv('disaster_y_test.csv', index=False)

Data processing pipeline completed. Total time taken: 0.8789703845977783 seconds
Training set shape: (7505, 25) (7505,)
Testing set shape: (3217, 25) (3217,)
