# Separando em Treino e Teste

## Caminhos

In [1]:
import os

#Folder Inicial
path = os.getcwd()

#Subpastas
pathin = path + '\\Entrada\\'
pathfixo = path + '\\Fixo\\'
pathout = path + '\\Saida\\'
pathparcial = path + '\\Parcial\\'
pathaux = path + '\\Auxiliar\\'

## Pacotes

In [2]:
import dill
import pickle
import pandas as pd
pd.set_option('max_colwidth', 3000)

import numpy as np

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

from time import gmtime, strftime

import re

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

import unidecode
from unicodedata import normalize

import pygtrie

## Funcoes

In [3]:
with open(pathaux + 'Functions.pickle', 'rb') as f:
    rem_acentos, stem, limpa_nomes, PreProcess, dicMeanSd, Scale, NomesEPalavras = dill.load(f)

## Lendo a Base

In [4]:
data = pd.read_csv(pathin + 'Tweets.csv')

#DataFrame com quantidade de Linhas
sizes = pd.DataFrame({'descricao': ['Original'], 'linhas': [data.shape[0]]})
sizes

Unnamed: 0,descricao,linhas
0,Original,14640


In [5]:
#data['tweet_id']
#data['airline_sentiment'].unique()
#data['negativereason'].unique()
#data['user_timezone'].unique()
#data['retweet_count'].unique()
#data['text']

# Ajustando Variavel Resposta

In [6]:
#Ajustando o nome das variaveis
data = data.rename(columns = {'tweet_id': 'ido', 'airline_sentiment': 'resposta', 'text': 'texto'})
data['resposta'] = data['resposta'].astype(str)
data['respostaoriginal'] = data['resposta']
data['resposta'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [7]:
#Removendo '\n' e '\t',  ficando apenas com letras e numeros

vec = [r'\n', r'\t', r'[^a-zA-Z0-9]']

for i in vec:
    data['resposta'] = data['resposta'].apply(lambda x: re.sub(i, ' ', x))

    
#Removendo Espacos Desnecessarios (duplos, inicio e final de vetor)

data['resposta'] = data['resposta'].apply(lambda x: re.sub(r' +', ' ', x))
data['resposta'] = data['resposta'].apply(lambda x: x.strip())
data['resposta'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [8]:
###############################################################################
#Adicionando Identificador Além do Número de Ocorrência
###############################################################################

data['id'] = list(range(1, data.shape[0] + 1))

data = data[['id', 'ido', 'texto', 'resposta', 'respostaoriginal']]

In [9]:
###############################################################################
#Eliminando as linhas que nao tem variavel resposta
###############################################################################

data = data.dropna(subset = ['resposta'])

sizes = sizes.append({'descricao': 'Excluindo resposta NA', 'linhas': data.shape[0]}, ignore_index = True)

#NA para ''
data = data.fillna('')
data.head(1)

Unnamed: 0,id,ido,texto,resposta,respostaoriginal
0,1,570306133677760513,@VirginAmerica What @dhepburn said.,neutral,neutral


In [10]:
###############################################################################
#Eliminando linhas com menos de x caracteres
###############################################################################

minchar = 10

desc = 'Excluindo Linhas de Textos com Menos de ' + str(minchar) + ' caracteres'

data['nchar'] = data['texto'].apply(len)
data = data[data['nchar'] >= minchar]
sizes = sizes.append({'descricao': desc, 'linhas': data.shape[0]}, ignore_index = True)

In [11]:
###############################################################################
#Eliminando Textos Duplicados
###############################################################################

#Eliminando uma das linhas quando existe duplicacao de ido, texto e resposta

data = data.sort_values(by = ['ido'], ascending = False, na_position = 'first')
data.drop_duplicates(subset = ['ido', 'texto', 'resposta'], keep = 'last', inplace = True)

sizes = sizes.append({'descricao': 'Eliminando Ultima Linha Quando ido, texto e resposta sao repetidos', 'linhas': data.shape[0]}, ignore_index = True)

sizes

Unnamed: 0,descricao,linhas
0,Original,14640
1,Excluindo resposta NA,14640
2,Excluindo Linhas de Textos com Menos de 10 caracteres,14640
3,"Eliminando Ultima Linha Quando ido, texto e resposta sao repetidos",14503


# Parte nao usada no teste: contagem

In [12]:
freq = data.groupby(['resposta']).size().reset_index(name = 'tabfreq')
freq = freq.sort_values(by = ['tabfreq'], ascending = False)
freq['tabfreqrelativa'] = freq['tabfreq'] / data.shape[0]
freq = freq[['resposta', 'tabfreq', 'tabfreqrelativa']]
freq.to_excel(pathparcial + 'Frequencia Respostas Desagrupadas.xlsx', encoding = 'latin1', index = False)
freq

Unnamed: 0,resposta,tabfreq,tabfreqrelativa
0,negative,9089,0.626698
1,neutral,3074,0.211956
2,positive,2340,0.161346


In [13]:
freq = freq.reset_index(drop = True)
freq['respcod'] = 'L' + pd.Series(freq.index).astype(str).str.zfill(3)
freq.to_excel(pathparcial + 'Frequencia Respostas Desagrupadas com Codigo.xlsx', 
              encoding = 'latin1', index = False)
freq

Unnamed: 0,resposta,tabfreq,tabfreqrelativa,respcod
0,negative,9089,0.626698,L000
1,neutral,3074,0.211956,L001
2,positive,2340,0.161346,L002


In [14]:
#Merge da tabela de dados com a de frequencias
data = pd.merge(data, freq, how = 'left', left_on = 'resposta', right_on = 'resposta')

In [15]:
#Equivalencia de Respostas
respequiv = data[['respostaoriginal', 'resposta', 'respcod']]

#Eliminando uma das linhas quando existe duplicacao
respequiv.drop_duplicates(subset = ['respostaoriginal', 'resposta'], keep = 'first', inplace = True)

respequiv.sort_values(by = ['respcod'], ascending = True, na_position = 'last', inplace = True)

freq.to_excel(pathparcial + 'Equivalencia de Respostas Completo.xlsx', encoding = 'latin1', index = False)

respequiv.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


(3, 3)

# Criando novo nome para variavel resposta (se quiser reduzir categorias)

In [16]:
#Minimo de Caracteres
minobs = 50
#Frequencia Relativa Minima
minfreq = 1/100

In [17]:
#Marcando Outros
data['respostafinal'] = np.where((data['tabfreq'] >= minobs) & (data['tabfreqrelativa'] > minfreq), data['respcod'], 'AgrupOutros')

#Equivalencia de respostas agrupadas

#
respequiv = data[['respostaoriginal', 'resposta', 'respostafinal']]

respequiv = respequiv[respequiv['respostafinal'] != 'AgrupOutros']

#Eliminando uma das linhas quando existe duplicacao
respequiv.drop_duplicates(subset = ['respostaoriginal', 'resposta', 'respostafinal'], keep = 'first', inplace = True)

respequiv.sort_values(by = ['respostafinal', 'resposta', 'respostaoriginal'], ascending = True, na_position = 'last', inplace = True)

respequiv.to_excel(pathparcial + 'Linhas com Codigo.xlsx', encoding = 'latin1', index = False)
respequiv.to_pickle(pathaux + 'Linhas com Codigo.pkl')

respequiv.shape

(3, 3)

In [18]:
data = data[['ido', 'id', 'respostafinal', 'resposta', 'nchar', 'texto']]

sizes = sizes.append({'descricao': 'Full', 'linhas': data.shape[0]}, ignore_index = True)

sizes

Unnamed: 0,descricao,linhas
0,Original,14640
1,Excluindo resposta NA,14640
2,Excluindo Linhas de Textos com Menos de 10 caracteres,14640
3,"Eliminando Ultima Linha Quando ido, texto e resposta sao repetidos",14503
4,Full,14503


# Separando em Treino e Teste

In [19]:
y = data['respostafinal']
X = data.drop(columns = ['respostafinal'])

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.25, random_state = 123)

sizes = sizes.append({'descricao': 'Treino', 'linhas': X_train.shape[0]}, ignore_index = True)
sizes = sizes.append({'descricao': 'Teste', 'linhas': X_test.shape[0]}, ignore_index = True)

train = pd.merge(X_train, pd.DataFrame(y_train), left_index = True, right_index = True)
teste = pd.merge(X_test, pd.DataFrame(y_test), left_index = True, right_index = True)

## Salvando Bases

In [20]:
train.to_pickle(pathparcial + 'Arquivo0 Treino.pkl')
teste.to_pickle(pathparcial + 'Arquivo0 Teste.pkl')

data.to_excel(pathparcial + 'Arquivo0.xlsx', encoding = 'latin1', index = False)
sizes.to_excel(pathout + 'Tamanhos.xlsx', encoding = 'latin1', index = False)