In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
 #   for filename in filenames:
  #      print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Load the Jigsaw toxic-comment training data from the zipped CSV in the input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
)

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [7]:
# Peek at the first five rows to see the columns and label layout.
df.head(n=5)

In [8]:
# Count missing values per column.
df.isnull().sum()

In [9]:
# Print the column sum for each label (for 0/1 flags this is the number of positive rows).
for label in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    print(f'{label}: {df[label].sum()}')

In [249]:
# Distinct values present in the severe_toxic column.
df['severe_toxic'].unique()

In [173]:
# Comments whose only positive label is `toxic` (every other label column is 0).
toxic_only = df[
    df['toxic'].eq(1)
    & df[['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].eq(0).all(axis=1)
]
toxic_only

In [41]:
# Comments whose only positive label is `severe_toxic` (every other label column is 0).
severe_toxic_only = df[
    df['severe_toxic'].eq(1)
    & df[['toxic', 'obscene', 'threat', 'insult', 'identity_hate']].eq(0).all(axis=1)
]
severe_toxic_only.shape

In [34]:
# Comments whose only positive label is `obscene` (every other label column is 0).
obscene_only = df[
    df['obscene'].eq(1)
    & df[['toxic', 'severe_toxic', 'threat', 'insult', 'identity_hate']].eq(0).all(axis=1)
]
obscene_only.shape

In [35]:
# Comments whose only positive label is `threat` (every other label column is 0).
threat_only = df[
    df['threat'].eq(1)
    & df[['toxic', 'severe_toxic', 'obscene', 'insult', 'identity_hate']].eq(0).all(axis=1)
]
threat_only.shape

In [36]:
# Comments whose only positive label is `insult` (every other label column is 0).
insult_only = df[
    df['insult'].eq(1)
    & df[['toxic', 'severe_toxic', 'obscene', 'threat', 'identity_hate']].eq(0).all(axis=1)
]
insult_only.shape

In [37]:
# Comments whose only positive label is `identity_hate` (every other label column is 0).
identity_hate_only = df[
    df['identity_hate'].eq(1)
    & df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult']].eq(0).all(axis=1)
]
identity_hate_only.shape

In [38]:
# Comments with no positive label at all: all six label columns are 0.
none_toxic = df[
    df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].eq(0).all(axis=1)
]
none_toxic.shape

In [40]:
# Percentage of all comments that carry no label.
len(none_toxic) / len(df) * 100

### About 89% of the data is non-toxic, so the dataset is highly imbalanced

### The training subset will use 6,000 non-toxic, 5,600 toxic-only and 300 insult-only comments

In [196]:
# Take the leading rows of each subset (positional slices) to build the training pool.
df_non_toxic = none_toxic.iloc[:6000]
df_toxic = toxic_only.iloc[:5600]
df_insult = insult_only.iloc[:300]

In [197]:
# Stack the three subsets into one frame with a fresh 0..n-1 index, then preview it.
train_df = pd.concat(
    objs=[df_non_toxic, df_toxic, df_insult],
    ignore_index=True,
)
train_df.head(n=10)

In [203]:
# Shuffle the rows so the classes are interleaved rather than stacked in blocks.
# A fixed random_state makes the permutation reproducible across kernel restarts;
# without it every run trains on a differently ordered frame.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [204]:
# Display the training frame.
train_df

# Preprocessing with spaCy

In [12]:
import spacy

In [13]:
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is called by preprocessing().
nlp = spacy.load("en_core_web_sm")

In [14]:
def preprocessing(sentence):
    """Run `sentence` through the module-level spaCy pipeline and return lemmas.

    Punctuation tokens and stop-word tokens are skipped; every remaining token
    contributes its `lemma_` string, in document order.
    """
    lemmas = []
    for tok in nlp(sentence):
        if tok.is_punct or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [205]:
# Raw text of one example comment (index label 42) before any preprocessing.
train_df['comment_text'][42]

In [206]:
# Lemmatize the example comment at index label 42 and show the resulting tokens.
prep_sentence = preprocessing(sentence=train_df['comment_text'][42])
prep_sentence

In [210]:
def clean_sentence(text):
    """Normalize a list of token strings and drop the ones that end up empty.

    Despite the parameter name, `text` is an iterable of token strings (for
    example the output of `preprocessing`), not a single string. Each token
    has its spaces removed, leading/trailing newlines and slashes stripped,
    and is lower-cased. Tokens that are empty afterwards are discarded.

    Returns a new list; the input is not modified.
    """
    texts = []
    for token in text:
        # Remove spaces first so whitespace tokens such as ' \n ' collapse to bare
        # newlines, then strip newlines and slashes in one pass so mixed edges
        # like '/\n' are removed whatever their order. Stripping them one after
        # the other left such residue behind.
        token = token.replace(' ', '').strip('\n/').lower()
        if token:
            texts.append(token)
    return texts

In [211]:
# Run the cleaner on the sample tokens and print the resulting list.
clean = clean_sentence(text=prep_sentence)
print(clean)

In [243]:
# Dry run: push the first ten comments through lemmatization and cleaning.
comment_texts = [
    clean_sentence(preprocessing(raw_comment))
    for raw_comment in train_df.comment_text[0:10]
]
print(comment_texts)
    

### preprocess and clean the data

In [244]:
def prep_text(df):
    """Lemmatize and clean every comment in `df.comment_text`.

    Each comment goes through `preprocessing` and then `clean_sentence`; the
    per-comment token lists are returned in the column's iteration order.
    """
    return [clean_sentence(preprocessing(comment)) for comment in df.comment_text]

In [245]:
# Tokenize and clean every comment in the training frame.
cleaned_comments = prep_text(df=train_df)

In [248]:
#cleaned_comments

#### get pretrained word embeddings

In [229]:
!wget --no-check-certificate \
    http://nlp.stanford.edu/data/glove.6B.zip \
    -O /tmp/glove.6B.zip

In [None]:
# `zipfile` is not imported in the visible cells, so import it here to keep this
# cell runnable on a fresh kernel.
import zipfile

# Unpack the downloaded GloVe archive into /tmp/glove.
with zipfile.ZipFile('/tmp/glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('/tmp/glove')

In [None]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
embeddings_index = {}
# A context manager closes the handle even if parsing raises. The encoding is
# given explicitly because GloVe files are UTF-8 text and the platform default
# encoding may not be.
with open('/tmp/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [223]:
# Width of the pretrained vectors, read from the loaded embeddings themselves.
# The matrix must use this width rather than max_length (which the Embedding
# layer uses as its input length): a row can only hold a GloVe vector when the
# second dimension equals the vector size.
embedding_dim = len(next(iter(embeddings_index.values())))

# Row i holds the pretrained vector for the word whose index is i.
# NOTE(review): `word_index` is not defined in the visible cells — presumably a
# tokenizer's {word: index} map with indices starting at 1 (hence the +1); confirm.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
# Frozen embedding layer initialised from the pretrained matrix.
# The output dimension is taken from the matrix itself so it always matches the
# supplied weights; max_length is the input sequence length, not the vector size.
# NOTE(review): `Embedding` is not imported in the visible cells — presumably
# keras.layers.Embedding; confirm and import it before running this cell.
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

### create model

#### train model

In [None]:
# Features: the cleaned token lists produced from train_df.
x_train = cleaned_comments
# Labels: every column except the id and the raw text. They are taken from
# train_df, the frame cleaned_comments was built from, so rows stay aligned with
# x_train. The previously referenced `clean_df_train` is not defined in the
# visible cells.
y_train = train_df.drop(columns=['id', 'comment_text'])

In [None]:
# Configure training with binary cross-entropy loss, the Adam optimizer and accuracy tracking.
# NOTE(review): `model` is not defined in the visible cells — confirm it is built before this cell runs.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [166]:
import torch
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook