**Import Statements**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import seaborn as sns
from tqdm import tqdm,tqdm_notebook
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from transformers import *
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import Data**

In [None]:
# Read the tab-separated PCL dataset. The first three lines of the file are skipped and
# the column names are supplied explicitly instead of being read from the file.
training_csv = pd.read_csv(
    '/kaggle/input/dontpatronizemepcl/dontpatronizeme_pcl.tsv',
    sep='\t',
    skiprows=3,
    names=["paragraph_id", "keyword", "country_code", "paragraph", "label"],
)

# Work on a copy without the identifier and keyword columns.
df = training_csv.drop(columns=["paragraph_id", "keyword"])
df

**Exploratory Data Analysis**

In [None]:
hist = df.hist(bins=5)

**Data Processing**

In [None]:
# English stopword list from NLTK; clean_text() below filters these words out.
stop = stopwords.words('english')

# Use English stemmer.
# NOTE(review): only referenced from commented-out code in the visible notebook.
stemmer = SnowballStemmer("english")

# Define Lemmatizer
# NOTE(review): only referenced from commented-out code in the visible notebook.
lemmatizer = WordNetLemmatizer()

#df = training_csv

def clean_text(col):
    """Write a cleaned copy of a text column into the module-level ``df``.

    The cleaned text is stored in ``df[col + "_cleaned"]`` after, in order:
    lowercasing, removing URLs / bare domain names, removing digit runs and
    dropping the English stopwords held in the module-level ``stop`` list.
    Punctuation is left untouched.

    Args:
        col: Name of the text column of ``df`` to clean.

    Returns:
        None. ``df`` is modified in place.
    """
    cleaned_col = col + "_cleaned"

    # A set makes the per-word stopword test O(1) instead of a list scan.
    stop_words = set(stop)

    # force text to lowercase
    text = df[col].str.lower()

    # remove url from text
    text = (
        text.replace(r'http\S+', '', regex=True)
        .replace(r'www\S+', '', regex=True)
        .replace(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?', '', regex=True)
    )

    # remove numeric characters from text
    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would leave every digit in place.
    text = text.str.replace(r'\d+', '', regex=True)

    # remove stopwords
    df[cleaned_col] = text.apply(
        lambda x: ' '.join(word for word in str(x).split() if word not in stop_words)
    )
    
clean_text("paragraph")

df

In [None]:
# Binary target: rows whose label is greater than 1 become 1, all others 0.
df["response"] = (df["label"] > 1).astype("int64")

# The raw paragraph and the original label are not needed past this point.
df = df.drop(columns=["paragraph", "label"])

In [None]:
# Toggle for random oversampling; it is False here, so only the else-branch runs.
UpSample=False

if UpSample:

    # NOTE(review): data_train and data_test are not defined anywhere in the visible
    # notebook, so this branch would presumably raise NameError if UpSample were set
    # to True — confirm where they are expected to come from.
    X,Y=RandomOverSampler(random_state=42).fit_resample(data_train.iloc[:,[0,1]],data_train.iloc[:,2])
    data_train=pd.concat((pd.DataFrame(X),pd.DataFrame(Y)),axis=1)
    data_train.columns=['country_code','paragraph_cleaned','response']
    #data_val.columns=['country_code','paragraph_cleaned','response']
    data_test.columns=['country_code','paragraph_cleaned','response']
    data_train

else:
    # Assign df's column names positionally to the names the later cells read.
    df.columns=['country_code','paragraph_cleaned','response']
    #data_val.columns=['country_code','paragraph_cleaned','response'] 
    # NOTE(review): this repeats the assignment two lines above and has no further effect.
    df.columns=['country_code','paragraph_cleaned','response']
    df

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split repeatable
# across kernel restarts.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Undersample the majority class: drop a (seeded) random 20% of the training rows
# whose response is 0.
train_data = train_data.drop(
    train_data[train_data['response'] == 0].sample(frac=.2, random_state=42).index
)

In [None]:
# Tokenizing Train Data
max_words = 10000  # vocabulary size cap passed to the Tokenizer and the Embedding layers
maxlen = 100       # every sequence is padded/truncated to this many tokens

texts = train_data["paragraph_cleaned"]
labels = np.asarray(train_data["response"])

# Use a string OOV token rather than a bool: the token is stored as a key of
# word_index (at index 1) and every unknown word is encoded as that index.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)  # Fit tokenizer on Train data only
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle rows and labels with the same permutation.
indices = np.random.permutation(data.shape[0])
x_train = data[indices]
y_train = labels[indices]

# Tokenizing Test Data (reuses the tokenizer fitted on the training split)
texts = test_data["paragraph_cleaned"]
labels = np.asarray(test_data["response"])

sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)
indices = np.random.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
x_test = data
y_test = labels

In [None]:
# Fraction of test rows whose response is 0 (y_test holds the 0/1 response values).
1-y_test.sum()/len(y_test)

In [None]:
# Manual hold-out split: fold 0 of a k=4 partition of df (rows are taken in order).
texts = df["paragraph_cleaned"]
labels = df["response"]

k = 4
fold = 0
num_validation_samples = len(texts) // k
val_start = num_validation_samples * fold
val_stop = num_validation_samples * (fold + 1)

validation_data = texts.iloc[val_start:val_stop]
validation_label = labels.iloc[val_start:val_stop]
training_data = pd.concat([texts.iloc[:val_start], texts.iloc[val_stop:]])
training_label = pd.concat([labels.iloc[:val_start], labels.iloc[val_stop:]])

max_words = 10000
maxlen = 100

# String OOV token: unknown words are encoded as index 1.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(training_data)  # Fit tokenizer on Train data only
sequences = tokenizer.texts_to_sequences(training_data)

data = pad_sequences(sequences, maxlen=maxlen)
Y = np.asarray(training_label)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
x_train = data[indices]
y_train = Y[indices]
labels = y_train

# This cell replaces the tokenizer and x_train / y_train built by the earlier
# tokenizing cell. Rebuild x_test / y_test from this fold's held-out rows with the
# same tokenizer; otherwise the models below would be validated on sequences that
# were indexed with a different vocabulary and that may overlap the training rows.
x_test = pad_sequences(tokenizer.texts_to_sequences(validation_data), maxlen=maxlen)
y_test = np.asarray(validation_label)


In [None]:
def kfold(k, df, model, random_state=None):
    """Build train/validation tensors for each of ``k`` folds and print their shapes.

    The rows of ``df`` are shuffled, then split into ``k`` contiguous validation
    blocks of ``len(df) // k`` rows. Rows left over when the length is not divisible
    by ``k`` are never used for validation and stay in every training set. For each
    fold a new Tokenizer is fitted on that fold's training texts only.

    Args:
        k: Number of folds.
        df: DataFrame with ``paragraph_cleaned`` (text) and ``response`` (label) columns.
        model: Currently unused; kept for the planned train/evaluate step.
        random_state: Optional seed for the row shuffle and per-fold permutations.
            ``None`` keeps using NumPy's global random state.

    Returns:
        None. Only the tensor shapes are printed.
    """
    # Shuffle the rows (default axis=0); axis=1 would permute the columns and leave
    # the row order untouched.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    texts = df["paragraph_cleaned"]
    labels = df["response"]

    # Fall back to the module-level generator when no seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    max_words = 10000
    maxlen = 100
    num_validation_samples = len(texts) // k

    for fold in range(k):
        # Selection of Train and Test data
        start = num_validation_samples * fold
        stop = num_validation_samples * (fold + 1)

        validation_data = texts.iloc[start:stop]
        validation_label = labels.iloc[start:stop]
        training_data = pd.concat([texts.iloc[:start], texts.iloc[stop:]])
        training_label = pd.concat([labels.iloc[:start], labels.iloc[stop:]])

        # Tokenizing Train Data; unknown words are encoded as index 1 (the OOV token).
        tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
        tokenizer.fit_on_texts(training_data)  # Fit tokenizer on Train data only

        data = pad_sequences(tokenizer.texts_to_sequences(training_data), maxlen=maxlen)
        order = rng.permutation(data.shape[0])
        x_train = data[order]
        y_train = np.asarray(training_label)[order]

        # Tokenizing Test Data with the tokenizer fitted on this fold's training texts
        data = pad_sequences(tokenizer.texts_to_sequences(validation_data), maxlen=maxlen)
        order = rng.permutation(data.shape[0])
        x_test = data[order]
        y_test = np.asarray(validation_label)[order]

        print('Shape of train tensor:', x_train.shape)
        print('Shape of test tensor:', x_test.shape)

        # TODO: fit `model` on (x_train, y_train), evaluate it on (x_test, y_test)
        # and collect the per-fold validation scores.
         

In [None]:
# Run kfold() with 3 folds; the empty list is only a placeholder for the model argument.
model = []
k = 3
kfold(k,df, model)

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the given tensors: true positives / actual positives.

    Both the products and the targets are clipped to [0, 1] and rounded before being
    summed; ``K.epsilon()`` guards against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: true positives / predicted positives.

    Both the products and the predictions are clipped to [0, 1] and rounded before
    being summed; ``K.epsilon()`` guards against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m`` (epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

**(1) Basic machine-learning approach**

In [None]:
# Baseline: embedding -> flatten -> one 16-unit sigmoid layer -> sigmoid output.
embedding_dim = 16
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(16, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

history1 = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
)



In [None]:
# Per-epoch validation accuracy recorded while fitting the basic model (history1).
history_dict = history1.history
val_acc_values = history_dict['val_accuracy']
val_acc_values

In [None]:
# Learning curves of the basic model: dots = training, solid line = validation.
history_dict = history1.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

**(1.1) Adding dropout layer**

In [None]:
# Same layout as the baseline, with a ReLU hidden layer followed by 50% dropout.
embedding_dim = 32
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

history2 = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
)


In [None]:
# Learning curves of the dropout model: dots = training, solid line = validation.
history_dict = history2.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Compare the loss curves of the basic model (history1) and the dropout model (history2).
history_dict = history1.history
loss_values1 = history_dict['loss']
val_loss_values1 = history_dict['val_loss']

history_dict = history2.history
loss_values2 = history_dict['loss']
val_loss_values2 = history_dict['val_loss']

# Each curve gets an x-axis derived from its own length, so the plot does not depend
# on a loss_values variable left over from another cell.
epochs = range(1, len(loss_values1) + 1)
epochs2 = range(1, len(loss_values2) + 1)

plt.plot(epochs, loss_values1, 'bo', label='Basic')
# Marker-only format plus color=: 'bo' together with color='green' would specify the
# colour twice and make matplotlib warn.
plt.plot(epochs2, loss_values2, 'o', color='green', label='Basic+dropout')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, val_loss_values1, 'bo', label='Basic loss')
plt.plot(epochs2, val_loss_values2, 'o', color='green', label='Basic+dropout')
plt.title('Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

**(1.2) Weight Regularization**

In [None]:
from keras import regularizers

# L1-regularised hidden layer, trained for 9 epochs and tracked with the custom
# F1 / precision / recall metrics in addition to accuracy.
embedding_dim = 32
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l1(0.001)),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy', f1_m, precision_m, recall_m],
)

history2 = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=9,
)

In [None]:
from keras import regularizers

# L1-regularised hidden layer, trained for 20 epochs.
embedding_dim = 32
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32,kernel_regularizer=regularizers.l1(0.001), activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Track accuracy and recall as well as F1 and precision: the cells below unpack five
# values from model.evaluate() and read 'accuracy' / 'val_accuracy' from
# history2.history, both of which fail when only two metrics are compiled in.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy', f1_m, precision_m, recall_m])

history2 = model.fit(x_train,
                    y_train,
                    epochs=20,
                    validation_data=(x_test,y_test))

In [None]:
# F1 curves of the L1-regularised model: dots = training, solid line = validation.
history_dict = history2.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

f1_values = history_dict['f1_m']
f1_val_values = history_dict['val_f1_m']

# Derive the x-axis from the plotted series instead of an `epochs` left by another cell.
epochs = range(1, len(f1_values) + 1)

# The plotted series are F1 scores, so label them as such.
plt.plot(epochs, f1_values, 'bo', label='Training F1')
plt.plot(epochs, f1_val_values, 'b', label='Validation F1')
plt.title('Training and validation F1')
plt.xlabel('Epochs')
plt.ylabel('F1')
plt.legend()
plt.show()

In [None]:
# return_dict=True keys every value by metric name, so this cell no longer depends on
# how many metrics were passed to compile() or on their order, and it does not rebind
# the name f1_score (which other cells use for the scikit-learn function).
eval_results = model.evaluate(x_test, y_test, verbose = 0, return_dict=True)

# Printed label for each compiled metric; metrics that were not compiled are skipped.
metric_labels = {
    'accuracy': 'accuracy',
    'f1_m': 'f1_score',
    'precision_m': 'precision',
    'recall_m': 'recall',
}
for metric_key, label in metric_labels.items():
    if metric_key in eval_results:
        print(label + ':', eval_results[metric_key]*100)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Take the first output column and threshold it at 0.5 to obtain hard 0/1 labels.
predictions = model.predict(x_test)
seq_predictions = np.where(predictions[:, 0] < 0.5, 0, 1).tolist()

for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_test, seq_predictions))

In [None]:
# Use a dedicated name for the export frame: rebinding `df` here would replace the
# cleaned dataset that the split / k-fold cells read, breaking any re-run of them.
predictions_df = pd.DataFrame(seq_predictions).T

predictions_df.to_csv("toutput.csv")

In [None]:
# Learning curves stored in history2: dots = training, solid line = validation.
history_dict = history2.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# These keys exist only if the model was compiled with the 'accuracy' metric.
acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


In [None]:
# Compare the loss curves of the basic model (history1) with the L1-regularised
# model, which is what history2 holds at this point in the notebook.
history_dict = history1.history
loss_values1 = history_dict['loss']
val_loss_values1 = history_dict['val_loss']

history_dict = history2.history
loss_values2 = history_dict['loss']
val_loss_values2 = history_dict['val_loss']

# Each curve gets an x-axis derived from its own length.
epochs = range(1, len(loss_values1) + 1)
epochs2 = range(1, len(loss_values2) + 1)

plt.plot(epochs, loss_values1, 'bo', label='Basic loss')
# Marker-only format plus color=: 'bo' with color='green' would give the colour twice.
plt.plot(epochs2, loss_values2, 'o', color='green', label='Basic+L1 regularization loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, val_loss_values1, 'bo', label='Basic loss')
plt.plot(epochs2, val_loss_values2, 'o', color='green', label='Basic+L1 regularization loss')
plt.title('Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

**(1.3) Add layer**

In [None]:
from keras import regularizers

# Wider network: 64-dim embedding followed by two 64-unit ReLU layers.
embedding_dim = 64
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

history3 = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
)

In [None]:
from keras import regularizers

# Two hidden layers: an L1-regularised 128-unit layer with 20% dropout, then 64 units.
embedding_dim = 128
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l1(0.0005)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

history3 = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
)

In [None]:
#history_dict = history3.history
#val_acc_values = history_dict['val_accuracy']
#val_acc_values

In [None]:
# Learning curves of the two-layer model: dots = training, solid line = validation.
history_dict = history3.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Compare the loss curves of the basic model (history1) and the two-layer model (history3).
history_dict = history1.history
loss_values1 = history_dict['loss']
val_loss_values1 = history_dict['val_loss']

history_dict = history3.history
loss_values2 = history_dict['loss']
val_loss_values2 = history_dict['val_loss']

# Each curve gets an x-axis derived from its own length.
epochs = range(1, len(loss_values1) + 1)
epochs2 = range(1, len(loss_values2) + 1)

plt.plot(epochs, loss_values1, 'bo', label='Basic loss')
# Marker-only format plus color=: 'bo' with color='green' would give the colour twice.
plt.plot(epochs2, loss_values2, 'o', color='green', label='Two layers loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, val_loss_values1, 'bo', label='Basic loss')
plt.plot(epochs2, val_loss_values2, 'o', color='green', label='Two layers loss')
plt.title('Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
history_dict = history2.history

# Predicted probabilities for the test set.
y_pred=model.predict(x_test)

# NumPy arrays have no .map(); threshold element-wise instead (values below 0.5 -> 0,
# everything else -> 1) and display the resulting labels.
np.where(np.ravel(y_pred) < 0.5, 0, 1)

In [None]:
# Take the first output column and threshold it at 0.5 to obtain hard 0/1 labels.
predictions = model.predict(x_test)
seq_predictions = np.where(predictions[:, 0] < 0.5, 0, 1).tolist()

for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_test, seq_predictions))

In [None]:
# Compare the loss curves of the regularised model (history2) and the two-layer
# model (history3).
history_dict = history2.history
loss_values1 = history_dict['loss']
val_loss_values1 = history_dict['val_loss']

history_dict = history3.history
loss_values2 = history_dict['loss']
val_loss_values2 = history_dict['val_loss']

# Each curve gets an x-axis derived from its own length.
epochs = range(1, len(loss_values1) + 1)
epochs2 = range(1, len(loss_values2) + 1)

plt.plot(epochs, loss_values1, 'bo', label='Basic with Regularization loss')
# Marker-only format plus color=: 'bo' with color='green' would give the colour twice.
plt.plot(epochs2, loss_values2, 'o', color='green', label='Two layers loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(epochs, val_loss_values1, 'bo', label='Basic with Regularization  loss')
plt.plot(epochs2, val_loss_values2, 'o', color='green', label='Two layers loss')
plt.title('Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Compare the accuracy curves of the regularised model (history2) and the two-layer
# model (history3). Both models must have been compiled with the 'accuracy' metric,
# otherwise the lookups below raise KeyError.
history_dict = history2.history
acc_values1 = history_dict['accuracy']
val_acc_values1 = history_dict['val_accuracy']

history_dict = history3.history
acc_values2 = history_dict['accuracy']
val_acc_values2 = history_dict['val_accuracy']

# Each curve gets an x-axis derived from its own length.
epochs = range(1, len(acc_values1) + 1)
epochs2 = range(1, len(acc_values2) + 1)

plt.plot(epochs, acc_values1, 'bo', label='Basic with Regularization')
# Marker-only format plus color=: 'bo' with color='green' would give the colour twice.
plt.plot(epochs2, acc_values2, 'o', color='green', label='Two layers')
plt.title('Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.plot(epochs, val_acc_values1, 'bo', label='Basic with Regularization')
plt.plot(epochs2, val_acc_values2, 'o', color='green', label='Two layers')
plt.title('Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# y_pred1: predicted probabilities, one column per sample.
y_pred1 = model.predict(x_test)

# The model ends in a single sigmoid unit, so argmax over axis 1 would always be 0.
# Threshold the probability at 0.5 to get the predicted class.
y_pred = np.where(np.ravel(y_pred1) < 0.5, 0, 1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay

# Build the actual 2x2 confusion matrix (rows = true class, columns = predicted class)
# rather than a (y_test, y_pred) tuple; labels=[0, 1] fixes the shape even when one
# class is never predicted.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
#print(classification_report(y_test, y_pred))
print(cm)


In [None]:
# Plot the confusion matrix for the two response classes. The matrix is computed here
# from y_test / y_pred so the display always receives a real 2x2 array, and the class
# labels are given explicitly because no fitted scikit-learn classifier is in scope.
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred, labels=[0, 1]),
                              display_labels=[0, 1])
disp.plot()
plt.show()

In [None]:
y_pred1

In [None]:
# Print f1, precision, and recall scores
# scikit-learn's classification metrics need hard class labels, not probabilities,
# so threshold the predicted probabilities at 0.5 first.
y_pred_labels = np.where(np.ravel(y_pred1) < 0.5, 0, 1)

print(precision_score(y_test, y_pred_labels))
print(recall_score(y_test, y_pred_labels))
print(f1_score(y_test, y_pred_labels))

**(2) Conv1D**

In [None]:
# Vocabulary size used by the Embedding layers below. The notebook defines this value
# as max_words; no max_features variable exists in the visible cells.
max_words

In [None]:
from keras.models import Sequential
from keras import layers
from keras import optimizers

# Single Conv1D block followed by global max pooling.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=maxlen))
model.add(layers.Conv1D(16, 7,activation='relu'))
model.add(layers.GlobalMaxPooling1D())
# Sigmoid output: 'binary_crossentropy' and the 'accuracy' metric (as used here) expect
# probabilities in [0, 1], not the raw logits a linear Dense(1) would emit.
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

history4 = model.fit(x_train,
                    y_train,
                    epochs=20,
                    validation_data=(x_test,y_test))

In [None]:
# Learning curves of the Conv1D model: dots = training, solid line = validation.
history_dict = history4.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
from keras.models import Sequential
from keras import layers

# Two Conv1D blocks with dropout and max pooling in between.
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(layers.Conv1D(32, 7,activation='relu'))
model.add(Dropout(0.5))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7,activation='relu'))
model.add(layers.GlobalMaxPooling1D())
# Sigmoid output: 'binary_crossentropy' and the 'accuracy' metric (as used here) expect
# probabilities in [0, 1], not the raw logits a linear Dense(1) would emit.
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history4 = model.fit(x_train,
                    y_train,
                    epochs=20,
                    validation_data=(x_test,y_test))

In [None]:
# Learning curves of the deeper Conv1D model: dots = training, solid line = validation.
history_dict = history4.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

**Evaluation Metric (F1)**

In [None]:
##Define evaluation metric
def f1(y_true, y_pred):
    """F1 metric computed from rounded predictions.

    Predictions are rounded to 0/1, then true positives, false positives and false
    negatives are summed along axis 0. Precision and recall are stabilised with
    ``K.epsilon()``; any NaN in the per-column F1 is replaced by 0 before averaging.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Tensor holding the mean F1 value.
    """
    # Imported here because the notebook never binds the name `tf` at module level.
    import tensorflow as tf

    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

In [None]:
#########################

**Ignore from here down**

In [None]:
#########################

**Model Building**

In [None]:
# Longest text in the second column of data_train, counted in whitespace-separated
# tokens, plus 2.
# NOTE(review): data_train is not defined in the visible part of the notebook — confirm
# where it comes from; the +2 is presumably headroom for special tokens.
token_counts = (len(text.split()) for text in tqdm(data_train.iloc[:, 1]))
max_seq_length = max(token_counts, default=0) + 2
print(max_seq_length)

# output : 
# 960

In [None]:
def convert_examples_to_features(sentences, label_list, max_seq_length, tokenizer):
    """Encode sentences into fixed-length id / mask / segment arrays.

    Args:
        sentences: Sequence of strings, indexable by ``0 .. len(sentences) - 1``.
        label_list: Labels indexable by the same positions as ``sentences``.
        max_seq_length: Length passed to the tokenizer as ``max_length``.
        tokenizer: Object exposing ``encode_plus(text, max_length=..., pad_to_max_length=...)``
            and returning a mapping with ``input_ids`` and ``attention_mask``.

    Returns:
        Tuple ``(input_ids, input_masks, segment_ids, labels)`` of NumPy arrays.
    """
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for index in tqdm_notebook(range(len(sentences)),desc="Converting examples to features"):
        sentence = sentences[index]
        if sentence=='':
            # An empty string is replaced by a single space before encoding.
            sentence=" "

        # Encode once and read all fields from the same result, instead of
        # re-tokenizing the sentence for every field.
        encoded = tokenizer.encode_plus(sentence,max_length=max_seq_length,pad_to_max_length=True)
        input_id = encoded['input_ids']

        input_ids.append(input_id)
        input_masks.append(encoded['attention_mask'])
        # NOTE(review): some tokenizers may not return token_type_ids; fall back to
        # all-zero segment ids in that case — confirm this suits the target model.
        if 'token_type_ids' in encoded:
            segment_ids.append(encoded['token_type_ids'])
        else:
            segment_ids.append([0] * len(input_id))
        labels.append(label_list[index])
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels)
    )

In [None]:
# tokenizer=RobertaTokenizer.from_pretrained('roberta-base')
# #tokenizer=AlbertTokenizer.from_pretrained("albert-base-v2")

# data_train=data_train.reset_index(drop=True)
# #data_val=data_val.reset_index(drop=True)

# (train_input_ids, train_input_masks, train_segment_ids, train_labels 
# ) = convert_examples_to_features(data_train['paragraph_cleaned'],data_train['response'],max_seq_length,tokenizer)

# # (val_input_ids, val_input_masks, val_segment_ids, val_labels 
# # ) = convert_examples_to_features(data_val['text'],data_val['class'],max_seq_length,tokenizer)

# (test_input_ids, test_input_masks, test_segment_ids, test_labels 
# ) = convert_examples_to_features(data_test['paragraph_cleaned'],np.ones(data_test.shape[0]),max_seq_length,tokenizer)