In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import string
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
# Module-level WordNet lemmatizer instance; no later cell visible in this notebook references it.
lemma = WordNetLemmatizer()

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

In [2]:
# Load the pre-processed toxic-comment training set and preview its first rows.
train_csv_path = '../input/processedcsv-toxic-comments/final_data.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,comments,targets
0,understand sentence several authors criticised...,0
1,thanks dont really mind attacks neighbor small...,0
2,29 october 2007 utc spinout article therefore ...,0
3,2010 formula one season,0
4,welcome hello wikipedia im one thousands edito...,0


In [3]:
# Show the (rows, columns) shape of the freshly loaded frame, before any rows are dropped.
train_df.shape

(45694, 2)

In [4]:
# Discard every row that contains a missing value, then show the resulting shape.
train_df = train_df.dropna()
train_df.shape

(45688, 2)

In [5]:
# Parse the pre-trained GloVe file into a {token: vector} lookup table.
# Every non-empty line is read as "<token> <v1> <v2> ..." split on whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing raises part-way through,
# and the explicit encoding avoids depending on the platform's default codec.
with open('../input/glove-embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Done!!')

Done!!


In [6]:
# `nltk` is already imported in the first cell; this repeated import is redundant but harmless.
import nltk
# English stop-word list from the NLTK corpus.
# NOTE(review): this rebinds the name `stopwords`, which the first cell imported from
# `nltk.corpus` as the corpus reader. After this cell the bare name refers to the word
# list instead, so `stopwords.words(...)` would no longer work.
stopwords = nltk.corpus.stopwords.words('english')

In [7]:
# Longest training comment, counted in whitespace-separated tokens.
# Every encoded sequence is padded out to this length.
token_counts = train_df.comments.map(lambda comment: len(comment.split()))
max_len_tweet = token_counts.max()

# Learn the word -> integer-id vocabulary from the training comments.
tok = Tokenizer()
tok.fit_on_texts(train_df.comments)

# One more than the number of known words, leaving room for index 0 (used as padding).
vocab_size = len(tok.word_index) + 1

# Turn each comment into a list of word ids, then right-pad to a common length.
encoded_tweet = tok.texts_to_sequences(train_df.comments)
padded_tweet = pad_sequences(encoded_tweet, maxlen=max_len_tweet, padding='post')

# Row `row` holds the GloVe vector of the word whose id is `row`;
# words that have no GloVe entry keep an all-zero row.
tweet_embedding_matrix = np.zeros((vocab_size, 300)) #for 300 dimensions
for token, row in tok.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    tweet_embedding_matrix[row] = glove_vector

In [8]:
# Stacked-LSTM network on top of GloVe-initialised embeddings that stay trainable.
model = Sequential([
    # Embedding rows are seeded from tweet_embedding_matrix and fine-tuned during training.
    Embedding(vocab_size, 300, input_length=max_len_tweet,
              weights=[tweet_embedding_matrix], trainable=True),

    LSTM(256, return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    LSTM(128, return_sequences=True),
    Dropout(0.2),

    LSTM(64, return_sequences=True),
    Dropout(0.2),

    # Final recurrent layer returns only its last output rather than the full sequence.
    LSTM(32, return_sequences=False),
    Dropout(0.2),

    Dense(16, activation='relu'),
    Dropout(0.2),

    # Single output unit with no activation.
    Dense(1),
])

model.summary()

2021-12-29 06:39:17.696875: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 06:39:17.698622: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 06:39:17.699711: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 06:39:17.700912: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1250, 300)         27639900  
_________________________________________________________________
lstm (LSTM)                  (None, 1250, 256)         570368    
_________________________________________________________________
dropout (Dropout)            (None, 1250, 256)         0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1250, 256)         1024      
_________________________________________________________________
lstm_1 (LSTM)                (None, 1250, 128)         197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1250, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 1250, 64)          4

In [9]:
# Train the network as a regressor (MSE loss) and track accuracy / MAE alongside.
model.compile(loss='mse',
              optimizer='adam',
              metrics=['accuracy', 'mae']
             )

model_callbacks = [
    # Stop once val_loss has failed to improve for 3 consecutive epochs.
    keras.callbacks.EarlyStopping(patience=3),
    # Shrink the learning rate 10x when val_loss stalls.
    # min_lr must sit BELOW the optimizer's starting rate (Adam defaults to 1e-3);
    # the previous floor of 1e-1 was above it, so the rate could never be reduced.
    # patience is kept shorter than EarlyStopping's so the lower rate gets a chance
    # to help before training is stopped.
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, min_lr=1e-06),
    # Save a checkpoint after every epoch, tagged with the epoch number and val_loss.
    keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss'),
]

# validation_split holds out 20% of the rows for the val_* metrics used by the callbacks.
model.fit(padded_tweet, train_df.targets,
          epochs=1,
          batch_size=128,
          callbacks=model_callbacks,
          validation_split=0.2,
          shuffle=True,
          verbose=1
         )

2021-12-29 06:39:28.767125: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-29 06:39:35.553210: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005




<keras.callbacks.History at 0x7f8d23031bd0>

In [10]:
# Read the comments that must be scored and seed the submission frame with their ids.
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
results = test_df[['comment_id']].copy()
test_df.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [11]:
def remove_URL(text):
    """Return ``text`` with ``http(s)://`` links and ``www.``-prefixed addresses deleted.

    A match runs up to the next whitespace character; surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_emoji(text):
    """Return ``text`` with every character inside the listed code-point ranges deleted.

    NOTE(review): the last range runs from U+24C2 all the way to U+1F251, so it
    covers much more than emoji (CJK ideographs, for example, fall inside it and
    are stripped as well). Confirm this is intended before narrowing it.
    """
    code_point_ranges = (
        u'\U0001F600-\U0001F64F',  # emoticons
        u'\U0001F300-\U0001F5FF',  # symbols & pictographs
        u'\U0001F680-\U0001F6FF',  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF',  # flags (iOS)
        u'\U00002702-\U000027B0',
        u'\U000024C2-\U0001F251',
    )
    # One character class holding all ranges, matching runs of one or more characters.
    matcher = re.compile('[' + ''.join(code_point_ranges) + ']+', flags=re.UNICODE)
    return matcher.sub('', text)


def remove_html(text):
    """Return ``text`` with ``<...>`` tags and ``&name;`` / ``&#NNN;`` / ``&#xHH;`` entities deleted.

    Entity names and hex digits are matched in lower case only.
    """
    tag_or_entity = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', text)


def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [12]:
# Clean the raw test comments: strip URLs, emoji, HTML and punctuation, tokenise,
# lower-case, drop English stop words, then re-join the tokens into one string.

# Character-level cleaners, applied in this order.
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    test_df['text'] = test_df['text'].apply(cleaner)

test_df['text'] = test_df['text'].apply(word_tokenize)

test_df['text'] = test_df['text'].apply(lambda tokens: [token.lower() for token in tokens])

# Build the stop-word set ONCE. The previous version re-read the corpus and rebuilt
# the set for every single token, which made this step needlessly slow.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
test_df['text'] = test_df['text'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

# Back to one space-separated string per comment.
test_df['text'] = [' '.join(map(str, tokens)) for tokens in test_df['text']]

print('Done')

Done


In [13]:
# Encode the test comments with the SAME tokenizer and sequence length used for training.
#
# Fitting a fresh Tokenizer on the test text would hand out different integer ids to
# the same words, so the trained embedding rows would be looked up for the wrong words
# (and ids could exceed the embedding's vocab_size). Likewise the model was built with
# input_length=max_len_tweet, so the test sequences must be padded to that length.
# Words never seen during training are simply dropped by texts_to_sequences.
max_len_test = max_len_tweet

tok_test = tok
vocab_size_test = vocab_size
encoded_test = tok_test.texts_to_sequences(test_df.text)
# Sequences longer than max_len_test are truncated by pad_sequences (its default
# truncating mode is used, as in training).
padded_test = pad_sequences(encoded_test, maxlen=max_len_test, padding='post')

In [14]:
# Score the padded test sequences and write the submission file.
preds = model.predict(padded_test)
# The model ends in Dense(1), so predictions come back with one value per row;
# flatten them into a plain 1-D array for the 'score' column.
results['score'] = preds.ravel()
results.to_csv('/kaggle/working/submission.csv', index=False)
print('Done')

Done
