<a href="https://colab.research.google.com/github/JohnnySunkel/BlueSky/blob/master/NLP%20disaster%20tweets%20classification%20logistic%20regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')

# English stop words; offered to the grid search as one `stop_words` candidate.
stop = stopwords.words('english')

df = pd.read_csv('nlp_disaster_tweets.csv')

# Number of leading rows used for training; every remaining row is held out.
N_TRAIN = 6090

# Split by position with half-open `.iloc` slices so the two partitions are
# disjoint. Label-based `.loc[:N]` / `.loc[N:]` slices include BOTH endpoints,
# which puts row N in the training set and the test set at the same time.
X_train = df['text'].iloc[:N_TRAIN].values
y_train = df['target'].iloc[:N_TRAIN].values
X_test = df['text'].iloc[N_TRAIN:].values
y_test = df['target'].iloc[N_TRAIN:].values

# Every row must land in exactly one partition.
assert len(X_train) + len(X_test) == len(df)
assert len(y_train) == len(X_train) and len(y_test) == len(X_test)

# Shared stemmer instance used by `tokenizer_porter`.
porter = PorterStemmer()

def tokenizer(text):
  """Split ``text`` on runs of whitespace and return the pieces as a list."""
  tokens = text.split()
  return tokens

def tokenizer_porter(text):
  """Split ``text`` on whitespace and return each token's Porter stem.

  Stemming is delegated to the module-level ``porter`` instance.
  """
  stems = []
  for token in text.split():
    stems.append(porter.stem(token))
  return stems

# Accent stripping and lower-casing are disabled and no custom preprocessor is
# supplied; the tokenizer itself is chosen by the grid search below.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Two grids over the same search space: unigrams only, with or without
# stop-word removal, plain vs. Porter-stemming tokenizer, L1 or L2 penalty and
# three values of C. They differ only in the extra vectorizer overrides merged
# in at the end of each dict.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
        **vectorizer_overrides,
    }
    for vectorizer_overrides in (
        # Grid 1: use_idf / norm are left as configured on `tfidf`.
        {},
        # Grid 2: IDF weighting and normalisation switched off.
        {'vect__use_idf': [False], 'vect__norm': [None]},
    )
]

# Vectorizer followed by a logistic-regression classifier. The step names
# ('vect', 'clf') are the prefixes used by the keys in `param_grid`.
#
# The solver is pinned to 'liblinear' because the grid searches both the 'l1'
# and 'l2' penalties: scikit-learn's current default solver ('lbfgs') rejects
# 'l1', which would make every l1 candidate fail to fit.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf',
                      LogisticRegression(random_state = 0,
                                         solver = 'liblinear'))])

# Exhaustive search over `param_grid`: 5-fold cross-validation scored by
# accuracy, with n_jobs=-1 to run the fits in parallel.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train, y_train)

# Winning hyper-parameters and the cross-validated accuracy they achieved.
print('Best parameter set: %s' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

# Score the best pipeline found by the search on the held-out tweets.
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  3.2min finished


Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x7f0bfcf09c80>}
CV Accuracy: 0.711
Test Accuracy: 0.804
