# NLP on disaster tweets - Kaggle Competition

## Imports organized in one cell

In [2]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, LeaveOneOut, KFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression, PassiveAggressiveRegressor, LinearRegression, RidgeClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

KeyboardInterrupt: 

## Loading data

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## Data structure

In [None]:
# Report the (rows, columns) dimensions of both data frames.
print('Train has {0} rows and {1} columns'.format(*train.shape))
print('Test has {0} rows and {1} columns'.format(*test.shape))

In [None]:
# Class balance of the training labels, expressed as percentages.
# target == 1 is the positive class and target == 0 the negative class
# (NOTE(review): per the competition's data description, 1 marks a real
# disaster tweet -- confirm against the dataset page).
# `.get(label, 0.0)` keeps the cell working even if one class is absent.
class_share = train['target'].value_counts(normalize=True) * 100
positive_prcnt = class_share.get(1, 0.0)
negative_prcnt = class_share.get(0, 0.0)
print('Percentage of positive cases: {0}% '.format(round(positive_prcnt, 2)))
print('Percentage of negative cases: {0}% '.format(round(negative_prcnt, 2)))

## Loading English stopwords

In [None]:
# Collect NLTK's English stopword list into a set for fast membership tests.
stopwords = {*nltk.corpus.stopwords.words('english')}

## Adding punctuation to the stopwords

In [None]:
# Treat every ASCII punctuation character as a stopword as well.
# `set.update` iterates the string character by character.
stopwords.update(punctuation)

## Creating TF-IDF object

In [None]:
# Seed NumPy's global RNG so that the shuffling splitters (which fall back to
# the global RNG when no random_state is given) are reproducible.
SEED = 123123
np.random.seed(SEED)

# TF-IDF over unigrams and bigrams, capped at the 100 most frequent terms.
# The stopword set built above is passed in so that the small vocabulary is
# not filled with function words; it is sorted because scikit-learn expects a
# list and a sorted list gives a deterministic ordering.
vect = TfidfVectorizer(
    lowercase=True,
    max_features=100,
    ngram_range=(1, 2),
    stop_words=sorted(stopwords),
)

# Sparse document-term matrix for the training tweets and their labels.
raw_tfidf = vect.fit_transform(train.text)
classes = train.target

In [None]:
def validador_de_modelos(model, train, test, class_train, class_test, cross_validation=0.2):
    """Fit ``model`` on the training split and report its score on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        train: Training features.
        test: Held-out features.
        class_train: Targets matching ``train``.
        class_test: Targets matching ``test``.
        cross_validation: Unused; kept only so existing callers keep working.

    Returns:
        The value returned by ``model.score(test, class_test)``. What that
        number means (accuracy, R^2, ...) depends on the estimator.
    """
    model.fit(train, class_train)

    # `score` already runs the prediction internally, so no separate
    # `predict` pass is needed.
    score = model.score(test, class_test)

    print("O modelo passado teve acuracia de {0}".format(score))
    return score

In [None]:
def cross_validation(model, x, y, validation_splitter):
    """Cross-validate ``model`` and print an interval around its mean score.

    The interval is ``mean ± 2 * std`` of the per-fold test scores, expressed
    as percentages (roughly a 95% interval if the fold scores are close to
    normally distributed).

    Args:
        model: Estimator passed to ``cross_validate``.
        x: Feature matrix.
        y: Target values.
        validation_splitter: Value forwarded as ``cv`` to ``cross_validate``.

    Returns:
        Tuple ``(lower, upper)`` with the interval bounds in percent.
    """
    print(model)
    print('\n')
    print(validation_splitter)

    scores = cross_validate(model, x, y, cv=validation_splitter)['test_score']
    mean = scores.mean()
    st_deviation = scores.std()

    # Lower and upper bounds must differ: subtract / add two standard
    # deviations instead of multiplying the mean by the deviation.
    conf_interval = (
        (mean - 2 * st_deviation) * 100,
        (mean + 2 * st_deviation) * 100,
    )

    print('Confidence interval: [{0}% , {1}%]'.format(round(conf_interval[0], 2), round(conf_interval[1], 2)))
    print('\n')
    return conf_interval

## Instantiating models

In [None]:
# Candidate estimators, all with default hyper-parameters.
# NOTE(review): `par` and `dtr` are regressors; their default score is not
# accuracy, so their results are not directly comparable with the classifiers.
classifiers_list = [
    RidgeClassifier(),
    LogisticRegression(),
    PassiveAggressiveRegressor(),
    DecisionTreeRegressor(),
    DecisionTreeClassifier(),
]

# Short aliases for the individual estimators, in list order.
rc, lor, par, dtr, dtc = classifiers_list

In [14]:
# Resampling strategies to evaluate with: a shuffled 100-fold split and
# 10 random shuffle-splits.
split_list = [
    KFold(n_splits=100, shuffle=True),
    ShuffleSplit(n_splits=10),
]

# Short aliases for the individual splitters, in list order.
kf, ss = split_list

NameError: name 'KFold' is not defined

## Testing models with cross_validate

In [None]:
# Evaluate every estimator under every splitting strategy
# (estimators in the outer position, splitters in the inner one).
evaluation_grid = [
    (estimator, strategy)
    for estimator in classifiers_list
    for strategy in split_list
]
for estimator, strategy in evaluation_grid:
    cross_validation(estimator, raw_tfidf, classes, strategy)