In [None]:
import pandas as pd

from sqlalchemy import create_engine

def load_data_from_db(database_name, table_name):
    engine = create_engine(f'sqlite:///{database_name}')
    df = pd.read_sql_table(table_name, engine)

    return df

messages_merged_trans = load_data_from_db('disaster.db', 'message_category')

messages_merged_trans.head()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# from nltk
def tokenize(text):
    stop_words = set(stopwords.words('english'))

    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]

    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


tokenize("Weather update - a cold front from Cuba that could pass over Haiti")

In [None]:
import nltk

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression


class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

def build_model():
    pipeline = Pipeline([
        # ('features', FeatureUnion([
        #
        #     ('text_pipeline', Pipeline([
        #         ('vect', CountVectorizer(tokenizer=tokenize)),
        #         ('tfidf', TfidfTransformer())
        #     ])),
        #
        #     ('starting_verb', StartingVerbExtractor())
        # ])),

        ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
        ('moc', MultiOutputClassifier(RandomForestClassifier()))
    ])

    parameters = {
        # 'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        # 'clf__n_estimators': [50, 100, 200],
        # 'clf__min_samples_split': [2, 3, 4]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv


In [None]:
def split_data(df_message_cat):
    X_train, X_test, y_train, y_test = train_test_split(df_message_cat['message'].values, df_message_cat.drop(labels=['id', 'message', 'original', 'genre'], axis=1).to_numpy())

    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_tes = split_data(messages_merged_trans)

In [None]:
model = build_model()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# model.transform(X_test)

In [None]:

def test_pipeline():
    pipeline = Pipeline([
        ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
        # ('moc', MultiOutputClassifier(RandomForestClassifier()))
    ])

    return pipeline

pipeline = test_pipeline()
pipeline.fit(X_train)

print(pipeline.transform(X_train))

In [None]:
X_train.to_numpy()
y_train.to_numpy().shape

In [66]:
from sklearn.metrics import classification_report

for i in range(y_tes.shape[1]):
    cr_y1 = classification_report(y_tes[:,i],y_pred[:,i])
    print(cr_y1)


              precision    recall  f1-score   support

           0       0.72      0.34      0.46      1536
           1       0.82      0.96      0.88      4970
           2       0.50      0.12      0.20        48

    accuracy                           0.81      6554
   macro avg       0.68      0.47      0.52      6554
weighted avg       0.79      0.81      0.78      6554

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5467
           1       0.83      0.45      0.59      1087

    accuracy                           0.89      6554
   macro avg       0.87      0.72      0.76      6554
weighted avg       0.89      0.89      0.88      6554



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6526
           1       0.00      0.00      0.00        28

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      3875
           1       0.78      0.69      0.73      2679

    accuracy                           0.79      6554
   macro avg       0.79      0.78      0.78      6554
weighted avg       0.79      0.79      0.79      6554

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      6048
           1       0.62      0.07      0.12       506

    accuracy                           0.92      6554
   macro avg       0.78      0.53      0.54      6554
weighted avg       0.90      0.92      0.90      6554

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6470
           1       0.00      0.00      0.00        84

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.97      0.99      0.98      6554



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6471
           1       0.00      0.00      0.00        83

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.97      0.99      0.98      6554

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      6251
           1       0.50      0.00      0.01       303

    accuracy                           0.95      6554
   macro avg       0.73      0.50      0.49      6554
weighted avg       0.93      0.95      0.93      6554

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6478
           1       0.00      0.00      0.00        76

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5936
           1       0.89      0.76      0.82       618

    accuracy                           0.97      6554
   macro avg       0.93      0.88      0.90      6554
weighted avg       0.97      0.97      0.97      6554

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6421
           1       0.80      0.06      0.11       133

    accuracy                           0.98      6554
   macro avg       0.89      0.53      0.55      6554
weighted avg       0.98      0.98      0.97      6554

              preci

In [67]:
import pickle

def save_model(model, file_path):
    with open(file_path, 'wb') as f:
                pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)


In [68]:
save_model(model, "./models/trained_model")