In [1]:
from sqlalchemy import create_engine
import pandas as pd

# Open the SQLite database file. The three-slash URL form is a relative path,
# so it is resolved against the kernel's current working directory.
engine = create_engine("sqlite:///DisasterResponse.db")

# Read the whole DisasterResponse table into a DataFrame and preview it.
sql_query = "SELECT * FROM DisasterResponse"
df = pd.read_sql(sql_query, engine)
df.head()


Unnamed: 0,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
import sys
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path


def load_data(messages_filepath, categories_filepath):
    """
    Read the messages and categories CSV files and join them on ``id``.

    Args:
        messages_filepath: path of the messages CSV; needs an ``id`` column
        categories_filepath: path of the categories CSV; needs an ``id`` column
    Returns:
        DataFrame indexed by ``id`` holding the messages columns followed by
        the categories columns (left join of categories onto messages)
    """
    # read both raw files first, exactly as they are on disk
    raw_messages = pd.read_csv(Path(messages_filepath))
    raw_categories = pd.read_csv(Path(categories_filepath))

    # index both frames by id and attach the categories to the messages
    return raw_messages.set_index("id").join(raw_categories.set_index("id"))


def clean_data(df):
    """
    Expand the packed ``categories`` column into one 0/1 column per category
    and remove duplicated rows.

    Args:
        df: DataFrame with a ``categories`` column whose values look like
            ``"related-1;request-0;..."`` (the frame returned by ``load_data``).
            The input frame is left unmodified.
    Returns:
        tuple ``(cleaned, categories)``
            cleaned: ``df`` without the ``categories`` column, with the
                expanded category columns appended and exact duplicate rows
                dropped
            categories: the expanded category columns on their own, one row
                per input row (not de-duplicated)
    """
    # one column per "name-value" token
    tokens = df["categories"].str.split(";", expand=True)

    # the first row supplies the column names (the text before the dash)
    tokens.columns = [token.split("-")[0] for token in tokens.iloc[0]]

    # keep the text after the dash, cast to int and fold the value 2 into 1
    categories = (
        tokens.apply(lambda col: col.str.split("-").str[1])
        .astype(int)
        .replace(2, 1)
    )

    # Attach the new columns by position: `categories` has the same row order
    # as `df`, whereas an index-based join would multiply rows whenever an id
    # occurs more than once. `drop` without inplace keeps the caller's frame
    # intact, so the function can be re-run on the same input.
    cleaned = df.drop(columns="categories").assign(
        **{name: categories[name].to_numpy() for name in categories.columns}
    )

    # drop rows that are exact duplicates across all columns
    cleaned = cleaned.drop_duplicates()

    return cleaned, categories

# Merge the two raw CSV files, then expand the categories and de-duplicate.
df1 = load_data(
    messages_filepath='data/disaster_messages.csv',
    categories_filepath='data/disaster_categories.csv',
)
df2, categories = clean_data(df=df1)

In [37]:

# Distribution of the `related` flag across all rows of `categories`.
categories["related"].value_counts()

1    20246
0     6140
Name: related, dtype: int64

In [5]:
import os
# Move the working directory one level up. The move is relative to the current
# directory, so every execution of this cell goes up one more level.
os.chdir(os.pardir)