# NLP on disaster tweets - Kaggle Competition

## Imports organized in one cell

In [2]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, LeaveOneOut, KFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression, PassiveAggressiveRegressor, LinearRegression, RidgeClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

## Loading data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Data structure

In [4]:
print('Train has {0} rows and {1} columns'.format(train.shape[0], train.shape[1]))
print('Test has {0} rows and {1} columns'.format(test.shape[0], test.shape[1]))

Train has 7613 rows and 5 columns
Test has 3263 rows and 4 columns


In [5]:
positive_prcnt = train['target'].value_counts()[0]/train['target'].count()*100
negative_prcnt = train['target'].value_counts()[1]/train['target'].count()*100
print('Percentage of positive cases: {0}% '.format(round(positive_prcnt, 2)))
print('Percentage of negative cases: {0}% '.format(round(negative_prcnt, 2)))

Percentage of positive cases: 57.03% 
Percentage of negative cases: 42.97% 


## Loading english stopwords 

In [6]:
stopwords = set(nltk.corpus.stopwords.words('english'))

## Adding punctuation on stopwords

In [7]:
for punct in punctuation:
    stopwords.add(punct)

## Creating TD-IDF object

In [8]:
SEED = 123123
np.random.seed(SEED)
vect = TfidfVectorizer(lowercase=True, max_features=100, ngram_range=(1,2))
raw_tfidf = vect.fit_transform(train.text)
# treino, teste, classe_treino, classe_teste = train_test_split(tfidf_bruto, train.target)
classes = train.target

In [9]:
def validador_de_modelos(model, train, test, class_train, class_test, cross_validation=0.2):
    model.fit(train, class_train)
    
    predict = model.predict(test)
    
    print("O modelo passado teve acuracia de {0}".format(model.score(test,class_test)))

In [10]:
def cross_validation(model, x, y, validation_splitter):
    print(model)
    print('\n')
    print(validation_splitter)
    cv = cross_validate(model, x, y, cv=validation_splitter)
    st_deviation = cv['test_score'].std()
    mean  = cv['test_score'].mean()
    conf_interval = [(mean-2 * st_deviation)*100, (mean+2 * st_deviation)*100]

    print('Confident interval: [{0}% , {1}%]'.format(round(conf_interval[0],2), round(conf_interval[1], 2)))
    print('\n')

## Instantiating models

In [11]:
rc = RidgeClassifier()
lor = LogisticRegression()
par = PassiveAggressiveRegressor()
dtr = DecisionTreeRegressor()
dtc = DecisionTreeClassifier()

classifiers_list= [rc, lor, par, dtr, dtc]

In [12]:
kf = KFold(shuffle=True, n_splits=100)
ss = ShuffleSplit(n_splits=10)

split_list = [kf, ss]

## Testing models with cross_validate

In [13]:
for classifier in classifiers_list:
    for splitter in split_list:
        cross_validation(classifier, raw_tfidf, classes, splitter)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)


KFold(n_splits=100, random_state=None, shuffle=True)
Confident interval: [61.97% , 80.57%]


RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)


ShuffleSplit(n_splits=10, random_state=None, test_size=None, train_size=None)
Confident interval: [69.26% , 74.71%]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


KFold(n_splits=100, random_state=None, shuffle=True)
Confident interval: [60.73% ,