In [None]:
#IMPORTS
import os
import re
import nltk
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import collections, functools, operator
from tensorflow import keras
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
from tensorflow.keras.layers import Dense, Dropout, Input
from transformers import BertTokenizer, TFBertModel, BertConfig, TFDistilBertModel, DistilBertTokenizer, DistilBertConfig

In [None]:
#TEXT CORPUS CREATION
# Folders scanned for license JSON files. Texts loaded from COMM_DIRECTORY are
# labelled 1 and texts from NONCOMM_DIRECTORY are labelled 0 when the training
# DataFrame is built further down.
COMM_DIRECTORY = '/kaggle/input/licenses/Comm'
NONCOMM_DIRECTORY = '/kaggle/input/licenses/NonC'

# NLTK English stop-word list, stored as a set; tokkenizer() reads it to build
# the stop-word-filtered token list.
stop_words = set(stopwords.words('english'))

def tokkenizer(directory):
    """Load every license JSON file in ``directory`` and tokenize the combined text.

    Each file must be a JSON document with a ``'licenseText'`` entry. Line
    breaks, tabs and a fixed set of separator/decoration characters are
    replaced by spaces, and runs of spaces are collapsed, before the text is
    used.

    Args:
        directory: Path of the folder that holds the license JSON files.

    Returns:
        A 4-tuple ``(token_text, token_text_stop, sentences, text_corpus)``:
        ``token_text`` is the ``word_tokenize`` output for the whole corpus,
        ``token_text_stop`` is the same list without tokens whose lower-cased
        form is in the module-level ``stop_words``, ``sentences`` holds one
        cleaned string per file, and ``text_corpus`` is those strings joined
        with a single space.
    """
    filt = r"[\n\-\=\\\/\t_`~¤•#\xa0–—]"
    sentences = []
    # Sorted so the document order does not depend on the file system.
    for file in sorted(os.listdir(directory)):
        # The texts contain non-ASCII punctuation; read them as UTF-8 instead
        # of relying on the platform default encoding.
        with open(os.path.join(directory, file), encoding='utf-8') as json_file:
            json_corpus = json.load(json_file)
        temp_corpus = re.sub(filt, ' ', json_corpus['licenseText'])
        temp_corpus = re.sub(r" +", ' ', temp_corpus)
        sentences.append(temp_corpus)

    # Join with a separator: plain concatenation fused the last word of one
    # license with the first word of the next one.
    text_corpus = ' '.join(sentences)

    token_text = word_tokenize(text_corpus)
    token_text_stop = [w for w in token_text if w.lower() not in stop_words]
    return token_text, token_text_stop, sentences, text_corpus

# Run the loader once per class. The misspelled name `noncomm_senteces` is the
# one the DataFrame cell reads, so it must not be renamed in this cell alone.
# NOTE(review): comm_tokens_stop / noncomm_tokens_stop are not used by any cell
# visible here.
comm_tokens, comm_tokens_stop, comm_sentences, comm_corpus = tokkenizer(COMM_DIRECTORY)
noncomm_tokens, noncomm_tokens_stop, noncomm_senteces, noncomm_corpus = tokkenizer(NONCOMM_DIRECTORY)

In [None]:
#SENTIMENT ANALYSIS
def sentiment_analizer(corpus, text):
    """Print and return the mean VADER polarity scores of a corpus.

    The corpus is split on ``'.'``; every non-blank fragment is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and each score key is
    averaged over the scored fragments.

    Args:
        corpus: Text to analyse.
        text: Heading printed above the result.

    Returns:
        A dict mapping each score key to its mean value, or an empty dict when
        the corpus contains no non-blank fragment.
    """
    sia = SentimentIntensityAnalyzer()
    # Blank fragments (e.g. after a trailing '.') are not sentences; scoring
    # them would only dilute the averages.
    fragments = [part for part in corpus.split('.') if part.strip()]
    scores = [sia.polarity_scores(part) for part in fragments]

    # Sum key by key with plain arithmetic. Adding collections.Counter objects
    # silently discards every key whose running total is <= 0, which corrupted
    # (or removed) the possibly negative 'compound' score.
    totals = {}
    for score in scores:
        for key, value in score.items():
            totals[key] = totals.get(key, 0.0) + value

    # With no fragments `totals` is empty, so no division by zero can occur.
    result = {key: value / len(scores) for key, value in totals.items()}
    print(text)
    print(result)
    return result

# Report the averaged polarity scores of each class; the second argument is
# printed verbatim as the heading above the scores.
sentiment_analizer(comm_corpus, 'COMMERCIONAL')
sentiment_analizer(noncomm_corpus, 'NONCOMMERCIONAL')

In [None]:
#CREATE WORD CLOUD
def create_word_cloud(text):
    """Render a word cloud for a sequence of tokens and show it with matplotlib.

    Args:
        text: Iterable of string tokens; they are joined with single spaces
            before being handed to ``WordCloud.generate``.
    """
    joined_tokens = ' '.join(text)

    # A set holding only the empty string is passed as `stopwords`, so
    # presumably no real word is filtered out by WordCloud itself.
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords={''},
        min_font_size=10,
    )
    cloud_image = cloud_builder.generate(joined_tokens)

    # Square figure showing only the image: axes hidden, zero layout padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()

# One cloud per class, built from the unfiltered token lists.
# NOTE(review): the stop-word-filtered lists (comm_tokens_stop /
# noncomm_tokens_stop) are not used in the cells visible here — confirm that
# plotting the raw tokens is intended.
create_word_cloud(comm_tokens)
create_word_cloud(noncomm_tokens)

In [None]:
# Hand-picked high-frequency words and punctuation tokens.
# NOTE(review): not referenced by any cell visible here.
stop_words_modi = {
    # words
    'the', 'of', 'to', 'this', 'in', 'that', 'a',
    # punctuation tokens
    ',', '.', '-', '(', ')',
}

In [None]:
#MOST COMMON WORDS AND DIFFERENCES BETWEEN THEM
# Case-insensitive token frequencies of each class and their 100 most frequent
# entries as (word, count) pairs.
comm_most_common = nltk.FreqDist(w.lower() for w in comm_tokens)
noncomm_most_common = nltk.FreqDist(w.lower() for w in noncomm_tokens)
comm_top = comm_most_common.most_common(100)
noncomm_top = noncomm_most_common.most_common(100)

# Take the word straight out of each (word, count) pair. Regex-stripping the
# tuple's string form mangled every token that contains digits, quotes, commas
# or parentheses, so those tokens could no longer be looked up afterwards.
comm_only_words = [word for word, _ in comm_top]
noncomm_only_words = [word for word, _ in noncomm_top]

# Words in the commercial top 100 that are missing from the non-commercial
# top 100, kept in frequency-rank order so the output is deterministic.
noncomm_top_words = set(noncomm_only_words)
diff = [word for word in comm_only_words if word not in noncomm_top_words]

# freq() is count / total number of tokens, which is comparable between
# corpora of different sizes; len(FreqDist) is only the vocabulary size.
diff_comparision = [[w, comm_most_common.freq(w), noncomm_most_common.freq(w)] for w in diff]

print(diff)
print()
print(diff_comparision)

In [None]:
#CREATING DATAFRAME
# One row per license file. Label convention: 1 = text loaded from the
# commercial folder, 0 = text loaded from the non-commercial folder.
comm_df = pd.DataFrame({'text': comm_sentences, 'label': 1})
noncomm_df = pd.DataFrame({'text': noncomm_senteces, 'label': 0})

# reset_index() (without drop) keeps each row's position inside its class
# frame in an 'index' column, exactly as before, but without in-place mutation.
data = pd.concat([comm_df, noncomm_df]).reset_index()

# Fixed seed so the row order is the same after every "Restart & Run All".
SHUFFLE_SEED = 42
data = shuffle(data, random_state=SHUFFLE_SEED)
print(data.head(10))

In [None]:
#INITIALIZE DISTILBERT MODEL
# Tokenizer and TensorFlow encoder are loaded from the same pretrained
# checkpoint so that vocabulary and weights match.
_DBERT_CHECKPOINT = 'distilbert-base-uncased'
dbert_tokenizer = DistilBertTokenizer.from_pretrained(_DBERT_CHECKPOINT)
dbert_model = TFDistilBertModel.from_pretrained(_DBERT_CHECKPOINT)

In [None]:
#MODEL DATA
# Model inputs are the cleaned license texts, targets are the 0/1 class labels.
sentences, labels = data['text'], data['label']

In [None]:
#CHOOSING CORRECT TOKENS LENGTH
# Number of tokenizer tokens in every document.
tokens_lenght = [len(dbert_tokenizer.tokenize(document)) for document in sentences]

# Median and upper quartile of the length distribution, truncated to int.
percentil_50, percentil_75 = (int(np.percentile(tokens_lenght, q)) for q in (50, 75))

# TODO increase length: beyond 512 the model crashes with an out-of-range index
# error (https://stackoverflow.com/questions/60551906/tensorflow-huggingface-invalid-argument-indices0-624-624-is-not-in-0),
# presumably because the checkpoint only has 512 position embeddings.
max_len = 512

In [None]:
#MODEL CREATION
def create_model():
    """Build the DistilBERT-based two-class classifier.

    Reads the module-level ``max_len`` (input sequence length) and
    ``dbert_model`` (pretrained encoder). The encoder's hidden state at
    sequence position 0 feeds Dense(512, relu) -> Dropout(0.5) ->
    Dense(2, softmax).

    Returns:
        An uncompiled ``tf.keras.Model`` that takes ``[input_ids,
        attention_mask]`` and outputs two class probabilities per example.
    """
    inpt = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')
    # Element [0] of the encoder output is the per-token hidden state; only
    # sequence position 0 is kept as the summary vector of the whole input.
    dbert_layer = dbert_model(inpt, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu')(dbert_layer)
    dropout = Dropout(0.5)(dense)
    # softmax, not sigmoid: the two units describe mutually exclusive classes
    # and the model is trained with sparse categorical cross-entropy on
    # probabilities, which expects each output row to sum to 1.
    pred = Dense(2, activation='softmax')(dropout)
    model = tf.keras.Model(inputs=[inpt, masks], outputs=pred)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()
    return model

# Build the classifier once; it is compiled and fitted in the cells below.
model=create_model()

In [None]:
#CREATING INPUT DATA
# Encode every document to exactly max_len token ids plus an attention mask
# (1 = real token, 0 = padding).
input_ids=[]
attention_masks=[]

for sent in sentences:
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        # padding='max_length' is the current spelling of the deprecated
        # pad_to_max_length=True; it is accepted by every transformers release
        # that also accepts truncation=True.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)

# Every id row needs a mask row of the same length.
assert input_ids.shape == attention_masks.shape, "input ids and attention masks are misaligned"

In [None]:
# 80/20 split. train_test_split returns a (train, test) pair per input array,
# in argument order: ids, labels, masks. The seed keeps the membership of both
# parts identical between runs.
SPLIT_SEED = 42
train_input, test_input, train_label, test_label, train_mask, test_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.2, random_state=SPLIT_SEED)

#log_dir='dbert_model'
#model_save_path='./dbert_model.h5'

#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]

# Labels are integer class ids, hence the "sparse" loss and metric;
# from_logits=False means the model output is expected to be probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Train for 50 epochs in batches of 16; the held-out split is passed as
# validation data, so it is evaluated after every epoch.
history = model.fit(
    x=[train_input, train_mask],
    y=train_label,
    batch_size=16,
    epochs=50,
    validation_data=([test_input, test_mask], test_label),
)