# ETL Pipeline Preparation

### 1. Importing libraries and loading datasets.

In [1]:
#Data Science Libraries
import pandas as pd
import numpy as np
#Regex
import re
#Connectivity with sql Database
from sqlalchemy import create_engine

#### Loading messages dataset

In [2]:
messages = pd.read_csv('messages.csv')
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


#### Loading categories dataset

In [3]:
categories = pd.read_csv('categories.csv')
categories.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


### 2. Merging datasets - on unique column "id"

In [4]:
df = pd.merge(messages, categories, on = 'id')
df.head()

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...


### 3. Spliting `categories` into separate category columns.


In [5]:
categories = df.categories.str.split(';',expand = True)
categories.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


#### Renaming columns

In [6]:
row = categories.iloc[[0]]

for i, element in row.copy().iteritems():
    new_element = re.sub(r"[^a-zA-Z_]","",element[0])
    row[i] = new_element
    
category_colnames = row.values[0]
categories.columns = category_colnames
categories.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row[i] = new_element


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


### 4. Converting category values ones and zeros.


In [7]:
for column in categories:
    categories[column] = categories[column].str[-1:]
    categories[column] = categories[column].astype('int')
    
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Double check if there are only zeros and ones

In [8]:
def check_for_true_false(df, threshold = 0.05):
    """
    This function checks if there are only ones and zeros given in 
    the data frame

    Parameters
    ----------
    df : considered data frame.
    threshold : Sets the maximum ratio between all rows and rows with True
    and False only which allows to simply drop the rows with other values
    The default is 0.05. (5%)

    Returns considered data frame -> df
    -------
    """
    #Just to be sure to iterrate always over the same data frame it is 
    #justified to make an extra copy for the "for loop":

    df2 = df.copy()

    for category in df:

        if len(df2[category].value_counts().index) == 1:
            print("category {} not useful for predictions\
since only one possible value is given".format(category))
            df.drop(columns = category, axis = 1, inplace = True)

        if len(df2[category].value_counts().index) > 2:
            #Data Frame with zeros and ones only:
            tf_only = df[category].value_counts().sort_index()[:2]
            #Data Frame with all values:
            all_values = df[category].value_counts().sort_index()
            #Checking the ration of tf_values to all values
            #Is the ratio below threshold, then we can drop the excess
            tf_only_sum = tf_only.sum()
            all_values_sum = all_values.sum()
            ratio = 1 - (tf_only_sum / all_values_sum)
            print("Too many labels in category {}!".format(category))
            print("\n")
            print("The number of all rows is {}\
, the number of rows with True or False only is {} -> hence the \
ratio of excessive classes values makes {}".format(all_values_sum, 
                                              tf_only_sum,
                                              ratio))
            if ratio < threshold:
                for value in list(all_values.index)[2:]:
                    df.replace(value, np.nan, inplace = True)
                df.dropna(subset = [category], inplace = True)

                print("Excessive labels have been dropped")
    return df

In [9]:
check_for_true_false(categories)

Too many labels in category related!


The number of all rows is 26386, the number of rows with True or False only is 26182 -> hence the ratio of excessive classes values makes 0.00773137269764268
Excessive labels have been dropped
category child_alone not useful for predictionssince only one possible value is given


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26381,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26382,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26383,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26384,1.0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### 5. Replacing `categories` column in `df` with new category columns.

In [10]:
df.drop(columns = 'categories', inplace = True)
df.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [11]:
df = pd.concat([df.copy(), categories], axis = 1)

In [12]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 6. Removing duplicates

In [13]:
df.duplicated().sum()

170

In [14]:
df.drop_duplicates(inplace = True)

In [15]:
df.duplicated().sum()

0

### 7. Removing NaNs and not necessary columns

In [16]:
df.drop(columns = ['original', 'genre'], axis = 1, inplace = True)

In [17]:
df.dropna(inplace = True)

#### Checking if there are no singular classes (with only one value beeing either True or False for all of the records 

In [18]:
for column in df.iloc[:,3:]:
    if len(df[column].value_counts()) < 2:
        print(column+" has only one label!")
        print("The feature will be removed")
        df.drop(columns = column, axis = 1, inplace = True)
    else:
        print('{:10}'.format(column))
        print('{:>10}'.format("Contains both True and False values - Ok!"))
        print("\n")

request   
Contains both True and False values - Ok!


offer     
Contains both True and False values - Ok!


aid_related
Contains both True and False values - Ok!


medical_help
Contains both True and False values - Ok!


medical_products
Contains both True and False values - Ok!


search_and_rescue
Contains both True and False values - Ok!


security  
Contains both True and False values - Ok!


military  
Contains both True and False values - Ok!


water     
Contains both True and False values - Ok!


food      
Contains both True and False values - Ok!


shelter   
Contains both True and False values - Ok!


clothing  
Contains both True and False values - Ok!


money     
Contains both True and False values - Ok!


missing_people
Contains both True and False values - Ok!


refugees  
Contains both True and False values - Ok!


death     
Contains both True and False values - Ok!


other_aid 
Contains both True and False values - Ok!


infrastructure_related
Contains both True and

### 8. Saving the clean dataset into an sqlite database

In [19]:
engine = create_engine('sqlite:///disaster_database.db')
c = engine.connect()
conn = c.connection

df.to_sql('Messages', con = conn, index=False, if_exists = 'replace')