In [None]:
'''
Preprocessing data 
Disaster response pipeline project
Data science nanodegree, Udacity 

Script execution:
python process_data.py disaster_messages.csv disaster_categories.csv DisasterResponse.db

Arguments:
    a) Csv file containing messages data (disaster_messages.csv)
    b) Csv file containing categories data (disaster_categories.csv)
    c) Sqlite destination database (DisasterResponse.db)
'''

In [None]:
#import modules 

import sys
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
 
#functions 

def load_data(messages_filepath, categories_filepath):
    """
    Load data function

    Read the messages and categories csv files and join them on `id`.

    Arguments:
        messages_filepath -> path to the csv file with messages
        categories_filepath -> path to the csv file with categories
    Output:
        df -> pandas dataframe holding the rows whose `id` is in both files
    """
    # read from the paths supplied by the caller, not from fixed file names
    messages = pd.read_csv(messages_filepath)
    categories = pd.read_csv(categories_filepath)

    # inner join: keep only the ids that are present in both datasets
    df = messages.merge(categories, on='id', how='inner')

    return df

def clean_data(df):
    """
    Clean data function

    Expand the `categories` column (strings such as "name_a-1;name_b-0")
    into one integer column per category, remove the original `categories`
    column and drop duplicated rows. The input dataframe is not modified.

    Arguments:
        df -> dirty data pandas dataframe; must contain a `categories` column
    Outputs:
        df -> clean data pandas dataframe
    """
    # one column per ';'-separated "<name>-<digit>" entry
    categories = df['categories'].str.split(pat=';', expand=True)

    # the first row supplies the column names: strip the trailing "-<digit>"
    first_row = categories.iloc[0, :]
    categories.columns = [entry[:-2] for entry in first_row]

    # keep only the last character of every entry and convert it to an integer
    # (builtin int replaces np.int, which was removed in NumPy 1.24)
    categories = categories.apply(
        lambda column: column.astype(str).str[-1:].astype(int)
    )

    # drop() returns a new dataframe, so the result has to be kept; otherwise
    # the raw `categories` column would survive in the output
    df = df.drop(columns='categories')

    # concatenate the remaining columns with the new category columns
    df = pd.concat([df, categories], join='inner', axis=1)

    # drop duplicates
    df = df.drop_duplicates()

    return df

def save_data(df, database_filename):
    """
    Save data function

    Write the dataframe to the `DisasterResponse` table of a Sqlite database.
    An existing table with that name is replaced, so the pipeline can be run
    repeatedly against the same database file.

    Arguments:
        df -> clean data Pandas DataFrame
        database_filename -> database file (.db) destination path
    """
    engine = create_engine('sqlite:///' + database_filename)

    # the default if_exists='fail' raises ValueError on every run after the first
    df.to_sql('DisasterResponse', engine, index=False, if_exists='replace')
    

def main():
    """
    Main data processing function

    Implementation of the 3 main actions to create the ETL pipeline:
        1) Load the csv datasets
        2) Clean and pre-process the data
        3) Save the data to the Sqlite database

    The three paths are read from the command line (sys.argv[1:]). When the
    argument count is wrong, a usage message is printed and nothing is run.
    """
    # script name + 3 paths are expected; otherwise explain the usage and stop
    if len(sys.argv) != 4:
        print(
            "Please provide arguments correctly: \n\n"
            "Datasets as the first and second argument respectively, \n"
            "as well as the filepath of the database to save the cleaned data to as the third argument. \n\n"
            # arguments are separated by spaces, not commas
            "-> python process_data.py disaster_messages.csv disaster_categories.csv disaster_response_db.db \n\n"
            "Arguments description: \n"
            "a) Path to the csv file containing messages (e.g. disaster_messages.csv)\n"
            "b) Path to the csv file containing categories (e.g. disaster_categories.csv)\n"
            "c) Path to Sqlite destination database (e.g. disaster_response_db.db)"
        )
        return

    # extract the parameters
    messages_filepath, categories_filepath, database_filepath = sys.argv[1:]

    print('Loading data from...\n  MESSAGES: {} ....\n Loading data from...\n  CATEGORIES: {}'
          .format(messages_filepath, categories_filepath))
    df = load_data(messages_filepath, categories_filepath)

    print('Cleaning categories data...')
    df = clean_data(df)

    print('Saving data to SQLite DB ...\n  DATABASE: {}'.format(database_filepath))
    save_data(df, database_filepath)

    print('Cleaned data has been saved to database!')
        
if __name__ == '__main__':
    main()