# Carga y transformación

Usare REGEX para sacar información:

- contiene hora del día
- etc

In [1]:
import os
import pandas as pd
import numpy as np
import constants as ct

pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
pd.options.display.width = 1000
pd.options.display.max_colwidth = 200

In [2]:
data_list = os.listdir(ct.DATA_FOLDER)
data_list

['sample_submission.csv',
 'train.csv.zip',
 'train.csv',
 'test.csv',
 'test.csv.zip',
 'sample_submission.csv.zip',
 'store.h5']

In [3]:
csv_list = [name for name in data_list if '.csv' in name and '.zip' not in name]
csv_list

['sample_submission.csv', 'train.csv', 'test.csv']

## Train

In [4]:
train = pd.read_csv(os.path.join(ct.DATA_FOLDER, 'train.csv'), index_col='id')

In [5]:
train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remo...",0,0,0,0,0,0
000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tid...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 159571 entries, 0000997932d777bf to fff46fc426af1f9a
Data columns (total 7 columns):
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(1)
memory usage: 97.3 MB


In [7]:
train.shape

(159571, 7)

In [8]:
train.comment_text[1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

In [9]:
X_train = train['comment_text'].to_frame()

In [10]:
X_train.head()

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remo..."
000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)"
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about..."
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tid..."
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?"


In [11]:
train.columns

Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], dtype='object')

In [12]:
ct.OBJECTIVE_COLS

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [13]:
Y_train = train[ct.OBJECTIVE_COLS]

In [14]:
Y_train.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000997932d777bf,0,0,0,0,0,0
000103f0d9cfb60f,0,0,0,0,0,0
000113f07ec002fd,0,0,0,0,0,0
0001b41b1c6bb37e,0,0,0,0,0,0
0001d958c54c6e35,0,0,0,0,0,0


El dataset tiene clases muy desequilibradas, especialmente **threat** y **identity_hate**

In [15]:
Y_train.sum().div(Y_train.shape[0] / 100).sort_values(ascending=False)

toxic            9.584448
obscene          5.294822
insult           4.936361
severe_toxic     0.999555
identity_hate    0.880486
threat           0.299553
dtype: float64

## Test

In [16]:
test = pd.read_csv(os.path.join(ct.DATA_FOLDER, 'test.csv'), index_col='id')

In [17]:
test.head()

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja...
0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """
00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your..."
00017695ad8997eb,I don't anonymously edit articles at all.


In [18]:
test.shape

(153164, 1)

In [19]:
X_test = test

## Full

In [20]:
X_full = pd.concat([X_train, X_test])

In [21]:
X_train.shape[0] + X_test.shape[0]

312735

In [22]:
X_full.shape

(312735, 1)

## Guardamos

In [23]:
X_train.to_hdf(ct.STORE_PATH, 'X_train')
Y_train.to_hdf(ct.STORE_PATH, 'Y_train')
X_test.to_hdf(ct.STORE_PATH, 'X_test')
X_full.to_hdf(ct.STORE_PATH, 'X_full')

In [24]:
hdf_store = pd.HDFStore(ct.STORE_PATH)
hdf_store.keys()

['/X_full', '/X_test', '/X_train', '/Y_train', '/flame_example']

# Ingeniería de atributos


### Ideas:

- Enriquecer con:
    - Lista con palabras tóxicas en inglés
    - Lista con palabras obscenas en inglés
    - Lista con palabras de amenaza en inglés
    - Lista con palabras de odio en inglés
    
- Proceso de clustering incluyendo el test http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
- Crear atributos estadísticos:
    - Contar número de tabulaciones y cambios de línea ('\n', '\t' )
    - Extraer la hora en caso de estar disponible
    - Longitud de mensaje
    - Número de mayúsculas y minúsculas
    - Número de palabras completas en mayúsculas
    - Repetición de palabras
    - Palabras en listas sacadas
    
- Filtrar stopwords `nltk.corpus import stopwords`
- Stemmer de palabras `from nltk.stem import SnowballStemmer`

### Kaggles con ideas:

- https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings

# Exploración de comentarios

In [25]:
some_toxicity_index = Y_train.loc[Y_train.sum(axis=1) >= 1].index.to_series()

index_sample = some_toxicity_index.sample(200, random_state=0)

for row_index in index_sample:
    
    print('*' * 120)
    print(Y_train.loc[row_index])
    print('\n')
    print(X_train.loc[row_index].comment_text)
    print('-' * 120)
    print('\n \n')

************************************************************************************************************************
toxic            1
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: ada49191ec6f498e, dtype: int64


Nitpicking
If someone is complaining about something stupid, just ignore them. -
------------------------------------------------------------------------------------------------------------------------

 

************************************************************************************************************************
toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 73ee076f8b003d51, dtype: int64


What the fuck is wrong with you? You say I am a fucking vandal and I havn't done anything. Your bot is fucked up.
------------------------------------------------------------------------------------------------------------------------

 

*******

In [26]:
flame_example = X_train.loc[index_sample]
flame_example.shape

(200, 1)

In [27]:
flame_example.head()

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
ada49191ec6f498e,"Nitpicking\nIf someone is complaining about something stupid, just ignore them. -"
73ee076f8b003d51,What the fuck is wrong with you? You say I am a fucking vandal and I havn't done anything. Your bot is fucked up.
9f73d50326172b7d,screw you\ni do what i want
d4f197244abaef7a,"This culture allows people to hold their wives hostage for their dowry, burn them to death, and then remarry more young women from the same family so they can do it all over again. These people c..."
40465e543368eaf4,a virgin. Nighty night


In [28]:
flame_example.to_hdf(ct.STORE_PATH, 'flame_example')