# ML Pipeline Preparation

**1. Import libraries and load data from database.**

In [1]:
# Import NLTK and fetch the 'punkt' and 'wordnet' data packages.
# NOTE(review): presumably these back word_tokenize and WordNetLemmatizer,
# which are imported in the next cell — confirm no other packages are needed.

import nltk
nltk.download(['punkt', 'wordnet'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\izzit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\izzit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Import more libraries

import re
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, precision_score, recall_score
#from sklearn.metrics import precision_score
#from sklearn.metrics import recall_score
#from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Read the whole message table out of the SQLite database file
engine = create_engine('sqlite:///disasterMessage.db')
df = pd.read_sql("SELECT * FROM myMessage", engine)

# Features: the message column as an array
X = df["message"].values

# Targets: every column that remains once the non-label columns are removed
y = df.drop(columns=['id', 'message', 'genre', 'categories']).values
y


array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [4]:
# Names of the label columns, in the same order as the columns of `y`.
# They are derived by excluding the same non-label columns that are dropped
# to build `y`, rather than by slicing at a hard-coded position, so the names
# stay aligned with the targets even if the table's column order changes.
y_columns = list(df.columns.drop(['id', 'message', 'genre', 'categories']))
y_columns

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

**2. Write a tokenization function to process your text data**

In [5]:
def tokenizes(phrase):
    """Split a message into normalized, lemmatized word tokens.

    Each token produced by ``word_tokenize`` is lower-cased and stripped of
    surrounding whitespace, and only then lemmatized.

    Args:
        phrase: The text to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    tidytoken = []
    for raw_token in word_tokenize(phrase):
        # Normalize BEFORE lemmatizing: the WordNet lookup is case-sensitive,
        # so a capitalised token such as "Houses" would otherwise come back
        # unchanged and only be lower-cased to "houses" instead of "house".
        normalized = raw_token.lower().strip()
        tidytoken.append(lemmatizer.lemmatize(normalized))

    return tidytoken

# Smoke-test the tokenizer on one sample message (index 5 of X)
tokenizes(X[5])

['information', 'about', 'the', 'national', 'palace-']

**3. Build a machine learning pipeline**

In [6]:
# Text-classification pipeline: token counts -> TF-IDF weighting ->
# a k-nearest-neighbours classifier wrapped for multi-output targets.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer(tokenizer=tokenizes)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=KNeighborsClassifier())),
    ]
)



**4. Train pipeline**

In [7]:
# Split data into train and test sets.
# A fixed random_state makes the shuffle deterministic, so the same rows land
# in train/test (and the same scores come out) on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train pipeline
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenizes at 0x0000020A65CFC790>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier()))])

**5. Test your model**

In [None]:
# Predict the label columns for the held-out test messages with the fitted pipeline
pred = pipeline.predict(X_test)