In [15]:
#IMPORTS
import os
import re
import nltk
import json
import time
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import collections, functools, operator
from tensorflow import keras
from datasets import Dataset
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizer, BigBirdTokenizer, DistilBertTokenizer, TFBertModel, BigBirdForSequenceClassification, TFDistilBertModel, \
DistilBertForSequenceClassification, BertConfig, DistilBertConfig, pipeline, Trainer, TrainingArguments, EvalPrediction, AutoTokenizer

In [2]:
# TEXT CORPUS CREATION
# Kaggle input folders read by tokkenizer(), one folder per license class.
COMM_DIRECTORY, NONCOMM_DIRECTORY = (
    '/kaggle/input/licenses/Comm',
    '/kaggle/input/licenses/NonC',
)

# NLTK's English stop-word list as a set, for fast membership tests.
stop_words = {*stopwords.words('english')}

def tokkenizer(directory):
    """Read every license JSON file in ``directory`` and build a text corpus.

    Each file is parsed as JSON and its ``'licenseText'`` value is cleaned:
    newlines, tabs, dashes and a handful of other symbol characters are
    replaced by spaces, then runs of spaces are collapsed to one.

    Args:
        directory: Path of the folder whose files are read.

    Returns:
        A 4-tuple ``(token_text, token_text_stop, sentences, text_corpus)``:
        the word tokens of the whole corpus, the same tokens without the
        entries of the module-level ``stop_words`` set (compared in lower
        case), the list of cleaned texts (one per file) and all cleaned
        texts joined into a single string.
    """
    noise = re.compile(r"[\n\-\=\\\/\t_`~¤•#\xa0–—]")
    space_runs = re.compile(r" +")
    sentences = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # corpus (and everything derived from it) reproducible between runs.
    for file in sorted(os.listdir(directory)):
        # Explicit encoding: the texts contain non-ASCII characters, so do not
        # depend on the platform's default locale encoding.
        with open(os.path.join(directory, file), encoding='utf-8') as json_file:
            json_corpus = json.load(json_file)
        temp_corpus = noise.sub(' ', json_corpus['licenseText'])
        sentences.append(space_runs.sub(' ', temp_corpus))

    # Join with a space so the last word of one document is not fused with the
    # first word of the next one (plain concatenation did exactly that).
    text_corpus = ' '.join(sentences)

    token_text = word_tokenize(text_corpus)
    token_text_stop = [w for w in token_text if w.lower() not in stop_words]
    return token_text, token_text_stop, sentences, text_corpus

# Build tokens, stop-word-filtered tokens, per-file texts and the joined
# corpus for each of the two license folders.
(comm_tokens,
 comm_tokens_stop,
 comm_sentences,
 comm_corpus) = tokkenizer(COMM_DIRECTORY)
(noncomm_tokens,
 noncomm_tokens_stop,
 noncomm_senteces,
 noncomm_corpus) = tokkenizer(NONCOMM_DIRECTORY)

In [None]:
#SENTIMENT ANALYSIS
def sentiment_analizer(corpus, text):
    """Print the mean NLTK polarity scores over the sentences of ``corpus``.

    The corpus is split on ``'. '`` and fragments of three words or fewer are
    ignored. Every remaining fragment is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and the scores are averaged
    key by key.

    Args:
        corpus: Text to analyse.
        text: Label printed on the line above the result.

    Returns:
        None. The label and the dict of mean scores are printed; the dict is
        empty when no fragment is long enough to be scored.
    """
    sia = SentimentIntensityAnalyzer()
    scores = [
        sia.polarity_scores(sent)
        for sent in corpus.split('. ')
        if len(sent.split()) > 3
    ]

    # Sum with a plain dict. collections.Counter addition silently discards
    # every key whose running total is not positive, which corrupts the mean
    # of scores that can be zero or negative.
    totals = {}
    for score in scores:
        for key, value in score.items():
            totals[key] = totals.get(key, 0.0) + value

    # `totals` is empty when `scores` is, so no division by zero can occur.
    result = {key: total / len(scores) for key, total in totals.items()}
    print(text)
    print(result)

# Report the mean sentiment of each corpus under its label.
for corpus, label in ((comm_corpus, 'COMMERCIONAL'), (noncomm_corpus, 'NONCOMMERCIONAL')):
    sentiment_analizer(corpus, label)

In [None]:
#CREATE WORD CLOUD
def create_word_cloud(text):
    """Draw an 800x800 word cloud of ``text`` in an 8x8 inch matplotlib figure.

    Args:
        text: Iterable of string tokens; they are joined with single spaces
            before being handed to ``WordCloud.generate``.
    """
    cloud_options = dict(
        width=800,
        height=800,
        background_color='white',
        # NOTE(review): presumably passed so the library's default stop-word
        # list is not applied -- confirm against the wordcloud documentation.
        stopwords={''},
        min_font_size=10,
    )
    cloud = WordCloud(**cloud_options).generate(' '.join(text))

    # Show the rendered image without axes or padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

# One word cloud per license class, built from the unfiltered tokens.
for token_list in (comm_tokens, noncomm_tokens):
    create_word_cloud(token_list)

In [None]:
# Small hand-picked stop list: frequent function words and punctuation tokens.
# It is not referenced by any other cell visible in this notebook.
stop_words_modi = set("the , of . - to this in that a ( )".split())

In [None]:
#MOST COMMON WORDS AND DIFFERENCES BETWEEN THEM
# Case-insensitive token counts for each class and their 100 most frequent entries.
comm_most_common = nltk.FreqDist(w.lower() for w in comm_tokens)
noncomm_most_common = nltk.FreqDist(w.lower() for w in noncomm_tokens)
comm_top = comm_most_common.most_common(100)
noncomm_top = noncomm_most_common.most_common(100)

# Take the word straight out of each (word, count) pair. Stripping characters
# from str(pair) with a regex mangled every token that contains digits,
# quotes, commas, parentheses or spaces (e.g. "(" and "," both became "").
comm_only_words = [word for word, _ in comm_top]
noncomm_only_words = [word for word, _ in noncomm_top]

# Words that are in the commercial top 100 but not in the non-commercial one.
diff = list(set(comm_only_words) - set(noncomm_only_words))

# Relative frequency = count / total number of tokens (FreqDist.freq).
# Dividing by len(FreqDist) used the number of *distinct* tokens instead,
# which does not yield a frequency comparable between the two corpora.
diff_comparision = [
    [w, comm_most_common.freq(w), noncomm_most_common.freq(w)]
    for w in diff
]

print(diff)
print()
print(diff_comparision)

In [3]:
#CREATING DATAFRAME
# One row per license text; label 1 = commercial folder, 0 = non-commercial folder.
comm_df = pd.DataFrame({'text': comm_sentences, 'label': 1})
noncomm_df = pd.DataFrame({'text': noncomm_senteces, 'label': 0})

# reset_index() without drop=True keeps each row's position inside its class
# frame as an extra 'index' column (same columns as before this edit).
data = pd.concat([comm_df, noncomm_df]).reset_index()

# Fixed seed so the row order -- and every split derived from it -- is the same
# after Restart & Run All.
data = shuffle(data, random_state=42)

In [None]:
# INITIALIZE DISTILBERT MODEL (TENSORFLOW)
# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
dbert_tokenizer, dbert_model = (
    DistilBertTokenizer.from_pretrained('distilbert-base-uncased'),
    TFDistilBertModel.from_pretrained('distilbert-base-uncased'),
)



#MODEL CREATION
def create_model(return_features=False):
    """Build the DistilBERT-based two-class classifier.

    The model takes token ids and attention masks of length ``max_len``
    (module-level variable read when the function is called), runs them
    through ``dbert_model``, and feeds the vector at sequence position 0 into
    Dense(512, relu) -> Dropout(0.5) -> Dense(2, softmax).

    Args:
        return_features: When False (default) the model has a single output,
            the class probabilities, so the one loss and metric passed to
            ``compile`` apply to the prediction only. When True the model also
            outputs the dropout activations and the DistilBERT vector, as the
            earlier three-output version did; such a model needs per-output
            losses at compile time.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs ``[input_ids, masks]``.
    """
    inpt = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')

    # [0] is the first element of the encoder output; [:, 0, :] keeps the
    # vector at sequence position 0 (presumably the [CLS] token -- confirm).
    dbert_layer = dbert_model(inpt, attention_mask=masks)[0][:,0,:]
    dense = Dense(512, activation='relu')(dbert_layer)
    dropout = Dropout(0.5)(dense)
    # softmax: SparseCategoricalCrossentropy(from_logits=False) expects a
    # probability distribution over the two classes, not independent sigmoids.
    pred = Dense(2, activation='softmax')(dropout)

    # Hidden activations are only exposed on request; a single loss cannot be
    # applied meaningfully to them.
    outputs = [pred, dropout, dbert_layer] if return_features else pred
    model = tf.keras.Model(inputs=[inpt, masks], outputs=outputs)
    return model

model = create_model()


# CREATING INPUT DATA
# NOTE(review): `sentences`, `labels` and `max_len` are only assigned in cells
# that appear further down in this notebook; those cells must be run first.
sia = SentimentIntensityAnalyzer()
input_ids = []
attention_masks = []
input_semantic = []

for sent in sentences:
    # padding='max_length' replaces the deprecated pad_to_max_length=True flag;
    # every text is padded or truncated to exactly max_len tokens.
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])
    # Polarity scores per text; collected here but not fed to the model below.
    input_semantic.append(list(sia.polarity_scores(sent).values()))

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
input_sent = np.array(input_semantic)
labels = np.array(labels)


#DATA SPLIT
# 80/20 split applied consistently to ids, labels and masks. The fixed
# random_state makes the split identical on every run.
train_input, test_input, train_label, test_label, train_mask, test_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.2, random_state=42
)

#log_dir='dbert_model'
#model_save_path='./dbert_model.h5'

#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]


# MODEL CONFIGURATION
# Integer class labels are compared against probability outputs (from_logits=False).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(3e-5)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train for 20 epochs and validate on the held-out split after each one.
history = model.fit(
    x=[train_input, train_mask],
    y=train_label,
    validation_data=([test_input, test_mask], test_label),
    epochs=20,
    batch_size=16,
)

In [4]:
# MODEL DATA
# Text and label columns of the shuffled frame, used by the modelling cells.
sentences, labels = data['text'], data['label']

In [None]:
# LOADING PYTORCH DISTILBERT MODEL --- IN PROGRESS
# `tokenizer` and `model` are rebound by the BigBird cell further down.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# One encoding per text, returned as PyTorch tensors, with truncation off.
# (original note: MAX 128 tokens)
tokens = [
    tokenizer(sentence, return_tensors="pt", truncation=False, max_length=None)
    for sentence in sentences
]

In [5]:
# LOADING BIGBIRD MODEL
# Sequence-classification head with two labels on top of the pretrained
# BigBird checkpoint; downloaded files are cached under `cache_dir`.
model = BigBirdForSequenceClassification.from_pretrained(
    'google/bigbird-roberta-base',
    num_labels=2,
    return_dict=True,
    gradient_checkpointing=False,
    cache_dir='/media/data_files/github/website_tutorials/data',
)

# Matching tokenizer from the same checkpoint.
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [6]:
# CHOOSING THE TOKEN LENGTH
# Number of tokenizer tokens per text, without padding or truncation.
tokens_lenght = [len(tokenizer.tokenize(text)) for text in sentences]

# Median and upper quartile of the lengths; the upper quartile becomes the
# padded/truncated sequence length used by the tokenization cells.
percentil_50, percentil_75 = (int(q) for q in np.percentile(tokens_lenght, [50, 75]))
max_len = percentil_75

In [13]:
#DATA SPLIT
# 80/20 split of texts and labels. The fixed random_state keeps the
# train/test membership stable across kernel restarts.
train_data_sent, test_data_sent, train_label_sent, test_label_sent = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

In [16]:
# CREATING DATASET
# Wrap each split in a DataFrame with a fresh 0..n-1 index, then convert it
# to a `datasets.Dataset`.
df_train = pd.DataFrame(
    {'text': train_data_sent, 'labels': train_label_sent}
).reset_index(drop=True)
train_dataset = Dataset.from_pandas(df_train)

df_test = pd.DataFrame(
    {'text': test_data_sent, 'labels': test_label_sent}
).reset_index(drop=True)
test_dataset = Dataset.from_pandas(df_test)

In [17]:
# Tokenize a batch of examples and return the encoded inputs for the model.
def tokenization(batched_text):
    """Encode the ``'text'`` entries of a batch with the module-level tokenizer.

    Every text is truncated and padded to ``max_len`` tokens.

    Args:
        batched_text: Mapping with a ``'text'`` entry holding the raw texts.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    texts = batched_text['text']
    return tokenizer(texts, truncation=True, max_length=max_len, padding='max_length')

# Tokenize each split in a single batch. The batch size of the test split now
# comes from the test split itself; the train length was a copy-paste slip.
train_data = train_dataset.map(tokenization, batched=True, batch_size=len(train_dataset))
test_data = test_dataset.map(tokenization, batched=True, batch_size=len(test_dataset))

# Evaluation metrics reported by the Trainer.
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
# define the training arguments
# Weights & Biases has to be disabled BEFORE TrainingArguments/Trainer are
# created: the reporting integrations are resolved when these objects are
# built, so setting the variable in a later cell comes too late.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir = '/kaggle/working/',
    num_train_epochs = 4,
    # Effective batch size is 1 * 32 through gradient accumulation.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 32,
    per_device_eval_batch_size= 16,
    evaluation_strategy = "epoch",
    disable_tqdm = False,
    # NOTE(review): with 32 accumulation steps, 160 warm-up steps may exceed
    # the total number of optimizer steps on a small dataset -- confirm.
    warmup_steps=160,
    weight_decay=0.01,
    learning_rate = 1e-5,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead of
    # failing when the arguments are validated.
    fp16 = torch.cuda.is_available(),
    dataloader_num_workers = 0
)
# Build the Trainer from the model, arguments, metrics and tokenized splits,
# then show which device PyTorch can use.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.90 GiB total capacity; 430.46 MiB already allocated; 37.75 MiB free; 462.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Ask the Weights & Biases integration to stay off.
# NOTE(review): this cell sits after the one that builds TrainingArguments and
# Trainer -- confirm the variable is still honoured when set this late.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Start fine-tuning with the Trainer built in the earlier cell.
trainer.train()