In [5]:
import sys
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle

from sqlalchemy import create_engine

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier


def load_data(database_filepath):
   
    table_name = "disaster_respone"
    
    engine = create_engine('sqlite:///' + database_filepath)
    
    df = pd.read_sql_table(table_name, con=engine)
    
    df = df.iloc[:200]
    
    X = df["message"].values
    
    y = df.iloc[:,5:]
    
    category_names =  y.columns

    return X, y, category_names

class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            try:
                first_word, first_tag = pos_tags[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return True
            except:
                return False
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)


def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    #remove all special characters
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def build_model():
    '''
    build the GridSearchCV Model via usage of a pipeline and defines the parameters for the moden 
            Parameters:
                    None
            Returns:
                    cv(obj): GridSearchCV model 
    '''
    rfc = RandomForestClassifier()
    classifier = MultiOutputClassifier(rfc)
    
    # define Pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier)])

    # define parameters
    parameters = {
        'clf__estimator__min_samples_leaf': [50],
        'clf__estimator__min_samples_leaf': [2],
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=4)

    return cv

    


def evaluate_model(model, X_test, Y_test, category_names):
    y_pred = model.predict(X_test)
    print(classification_report(Y_test, y_pred, target_names=category_names))


def save_model(model, model_filepath):
    pickle.dump(model, open(model_filepath, 'wb'))


database_filepath, model_filepath =  "../Data/DisasterResponse.db", "classifier.pkl"
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# print('Building model...')
# model = build_model()

# print('Training model...')
# model.fit(X_train, Y_train)

# print('Evaluating model...')
# evaluate_model(model, X_test, Y_test, category_names)

# print('Saving model...\n    MODEL: {}'.format(model_filepath))
# save_model(model, model_filepath)

print('Trained model saved!')





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stefa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stefa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\stefa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Loading data...
    DATABASE: ../Data/DisasterResponse.db
Trained model saved!


In [2]:
 pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier())
    ])

In [9]:
try:
    a = 0
except:
    True

In [18]:
# evaluate_model(model, X_test, Y_test, category_names)

In [20]:
Y_test.shape

(40, 36)

In [48]:
table_name = "disaster_respone"
    
engine = create_engine('sqlite:///' + database_filepath)

df = pd.read_sql_table(table_name, con=engine)

In [43]:
["message_len"] = df["message"].str.len()

In [54]:
df = df[df.message.str.len() >  10]

Unnamed: 0,index,id,message,original,genre,related,request,offer,aid_related,medical_help,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report


In [50]:
df

Unnamed: 0,index,id,message,original,genre,related,request,offer,aid_related,medical_help,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
2,2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26023,26381,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26024,26382,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26025,26383,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26026,26384,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
