In [14]:
# Import required libraries
import sys
import pandas as pd
from sqlalchemy import create_engine  # to save the clean dataset into an sqlite database
import nltk
import re
import numpy as np
import pickle  # for ML-model export as a pickle file

# NLTK downloads
nltk.download('wordnet') # lexical database of English
nltk.download('punkt') # tokenizer model split text into words
nltk.download('stopwords') # list of common stopword to remove them

# Tokenization function 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Machine learning pipeline
from sklearn.pipeline import Pipeline  # For creating the pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # For text processing
from sklearn.multioutput import MultiOutputClassifier  # For multi-output classification
from sklearn.ensemble import RandomForestClassifier  # For the Random Forest classifier

# Training of pipeline
from sklearn.model_selection import train_test_split

# Test training model
from sklearn.metrics import classification_report

# Improve model with Grid Search
from sklearn.model_selection import GridSearchCV  # for using GridSearchCV



def load_data(messages_filepath, categories_filepath):
    """
    Loads and cleans the messages and categories datasets.

    Parameters:
    messages_filepath (str): The file path to the CSV file containing the messages.
    categories_filepath (str): The file path to the CSV file containing the categories.

    Returns:
    tuple: A tuple consisting of:
        - X (pd.Series): The messages (feature).
        - y (pd.DataFrame): The categories (labels), split into separate columns.
    """
       
    # read in data files (two csv files)
    messages = pd.read_csv(messages_filepath) # load messages dataset
    categories = pd.read_csv(categories_filepath) # load categories dataset
    df = pd.merge(messages, categories, left_on='id', right_on='id', how='inner')  # merge datasets

    # clean data
    categories = df["categories"].str.split(';', expand=True) # Split the 'categories' column into separate columns
    categories.columns = categories.iloc[0].str[:-2].values # Extract new column names directly from the first row
    categories = categories[1:] # Drop the first row since it was used for column names

    for column in categories:
        categories[column] = categories[column].str[-1] # set each value to be the last character of the string
        categories[column] = pd.to_numeric(categories[column]) # convert column from string to numeric

    df.drop(['categories'], axis=1, inplace = True) # drop the original categories column from `df`
    df = pd.concat([df, categories], axis=1) # concatenate the original dataframe with the new `categories` dataframe

    df = df.drop_duplicates() # Remove duplicates
    df.fillna(0, inplace=True) # Replace all NaN values with 0

    # load to database / Save the clean dataset into an sqlite database.
    engine = create_engine('sqlite:///DisasterResponseProject.db')
    df.to_sql('DisasterResponses', engine, index=False, if_exists='replace')

    # define features and label arrays
    X = df['message']
    y = df.iloc[:,4:]
    print("load data")
    return X, y



def tokenize(text):    
    
    """
    Tokenizes and lemmatizes the input text.

    Parameters:
    text (str): The input text to be tokenized.

    Returns:
    list: A list of cleaned tokens.
    """
    
    # text processing: tokenization function to process data

    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' # Define a regex pattern to detect URLs
    text = re.sub(url_regex, "urlplaceholder", text) # Replace URLs with a placeholder
    tokens = word_tokenize(text.lower()) # Normalize and tokenize text
    tokens = [w for w in tokens if w not in stopwords.words("english") and w.isalpha()] # Remove stopwords
        
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    clean_tokens = []
    for tok in tokens: # # Iterate through each token
        # Lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)
    print("tokenize")
    return clean_tokens    

    
    
def build_model():
    
    """
    Builds a machine learning pipeline for multi-output classification.

    This function creates a pipeline that processes text data using a 
    CountVectorizer to convert text into a matrix of token counts, 
    followed by a TfidfTransformer to transform the count matrix to 
    a normalized term-frequency or TF-IDF representation. Finally, 
    it applies a MultiOutputClassifier with a RandomForestClassifier 
    as the base estimator to handle multi-label classification tasks.

    Returns:
        Pipeline: A scikit-learn Pipeline object that encapsulates the 
        text processing and classification steps.
    """
    # Build a machine learning pipeline
    machine_learning_pipeline = Pipeline([
        ('cvect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    print("build ml model")
    return machine_learning_pipeline


def train(X, y, model):
    
    """
    Trains the given machine learning model on the provided features and labels.

    Parameters:
    X (pd.Series): The input features for training.
    y (pd.DataFrame): The target labels for training.
    model: The machine learning model to be trained.

    Returns:
    model: The trained machine learning model.
    """
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Specify test_size and random_state
    model.fit(X_train, y_train)  # Fit the model to the training data
    print("train ml model")
    return model



def export_model(model):
    
    """
    Exports the trained machine learning model as a pickle file.

    Parameters:
    model: The trained machine learning model to be exported.
    """
    
    # Export model as a pickle file
    # pickle.dump(machine_learning_pipeline_optimized, open('model.pkl', 'wb'))

    # Export model as a pickle file
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)  # Use the model parameter instead of an undefined variable
    print("export model")
    


def run_pipeline(data_file):
    X, y = load_data(data_file)  # run ETL pipeline
    model = build_model()  # build model pipeline
    model = train(X, y, model)  # train model pipeline
    export_model(model)  # save model
    print("run pipeline build, train, export model")
    
#original
#if __name__ == '__main__':
#    data_file = sys.argv[1]  # get filename of dataset
#    run_pipeline(data_file)  # run data pipeline
    
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python script.py <data_file>")
        sys.exit(1)  # Exit if the argument is not provided

    data_file = sys.argv[1]  # get filename of dataset
    run_pipeline(data_file)  # run data pipeline    



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Usage: python script.py <data_file>


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [30]:
# Specify the file paths to your CSV files
messages_filepath = 'messages.csv'
categories_filepath = 'categories.csv'

# Call the load_data function
df, X = load_data(messages_filepath, categories_filepath)

# Print the contents of df and X
print("DataFrame (df):")
print(df.head())  # Print the first few rows of the DataFrame

print("\nFeatures (X):")
print(X.head())  # Print the first few rows of the 'message' column

DataFrame (df):
0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

Features (X):
   related  request  offer  aid_related  medical_help  medical_products  \
0      NaN      NaN    NaN          NaN           NaN               NaN   
1      1.0      0.0    0.0          1.0           0.0               0.0   
2      1.0      0.0    0.0          0.0           0.0               0.0   
3      1.0      1.0    0.0          1.0           0.0               1.0   
4      1.0      0.0    0.0          0.0           0.0               0.0   

   search_and_rescue  security  military  child_alone      ...        \
0                NaN       NaN       NaN          NaN      ...         
1                0.0       0.0       0.0          0.0      ...        

In [16]:
messages_filepath = 'messages.csv'  # Pfad zur Nachrichten-CSV-Datei
categories_filepath = 'categories.csv'  # Pfad zur Kategorien-CSV-Datei

X, y = load_data(messages_filepath, categories_filepath)  # Daten laden
model = build_model()  # Modell erstellen
model = train(X, y, model)  # Modell trainieren