# Classification des brevets anglais

In [None]:
import pandas as pd
# Load the cleaned English patent publications from the shared drive.
df_copie = pd.read_csv(
    "/content/drive/Shareddrives/ING3 IA: Use Case 1 (NLP Patent)/data_clean/35_en_join_publications.csv"
)

# Keep an untouched copy of the raw "classification" strings in `s`.
s = df_copie["classification"].copy()

# Turn each raw string into a list: drop the single quotes, trim the
# surrounding square brackets, then split on the ", " separator.
df_copie["classification"] = list(
    map(lambda raw: raw.replace("'", "").strip('][').split(', '), s)
)

In [None]:
import xml.etree.ElementTree as ET
def clear_tags(text_xml):
    """Return the plain text of an XML fragment without references and headings.

    The fragment is wrapped in a synthetic ``<root>`` element so that several
    sibling elements can be parsed at once.  Inside every top-level ``<p>``,
    direct ``<patcit>``, ``<figref>`` and ``<ul>`` children are removed; every
    top-level ``<heading>`` is removed as well.  What remains is serialized as
    text.

    Args:
        text_xml: XML fragment as a string.

    Returns:
        The remaining text content as a ``str``.

    Raises:
        xml.etree.ElementTree.ParseError: if the fragment is not well-formed.
    """
    unwanted_tags = {"patcit", "figref", "ul"}
    tree = ET.fromstring("<root>" + text_xml + "</root>")
    for p in tree.findall("p"):
        previous = None
        # Iterate over a snapshot: removing from `p` while iterating `p` itself
        # skips the sibling that follows each removed element.
        for child in list(p):
            if child.tag not in unwanted_tags:
                previous = child
                continue
            # ElementTree stores the text that follows an element in its
            # `tail`; re-attach it so that only the element itself disappears,
            # not the sentence continuing after it.
            if child.tail:
                if previous is None:
                    p.text = (p.text or "") + child.tail
                else:
                    previous.tail = (previous.tail or "") + child.tail
            p.remove(child)
    # findall() returns a list, so removing from `tree` here is safe.
    for h in tree.findall("heading"):
        tree.remove(h)
    return ET.tostring(tree, encoding="utf-8", method="text").decode("utf-8")

In [None]:
# Replace every description by its text with the markup stripped by clear_tags.
df_copie["description"] = list(map(clear_tags, df_copie["description"].values))

In [None]:
#df_copie["claim"].values[0]
from bs4 import BeautifulSoup
# Reduce every claim to its plain text.  The parser is named explicitly:
# without it BeautifulSoup picks whichever parser happens to be installed, so
# the result can differ between environments and a warning is emitted.
df_copie["claim"] = [
    BeautifulSoup(claim_markup, "html.parser").get_text()
    for claim_markup in df_copie['claim']
]

In [None]:
# Drop the trailing newline(s) from every title.
df_copie['title'] = df_copie['title'].map(lambda x: x.rstrip('\n'))

# Reduce every abstract to its plain text.  The parser is named explicitly:
# without it BeautifulSoup picks whichever parser happens to be installed, so
# the result can differ between environments and a warning is emitted.
df_copie['abstr'] = [
    BeautifulSoup(abstract_markup, "html.parser").get_text()
    for abstract_markup in df_copie['abstr']
]

In [None]:
# Keep only the first four characters of each code and de-duplicate the
# resulting prefixes within each row.
df_copie["classification"] = [
    list({code[:4] for code in codes})
    for codes in df_copie["classification"].values
]

In [None]:
# Multi-hot label matrix: one row per patent, one "classification_<code>"
# column per class.  The lists are spread into one code per row (indexed by
# patent, then position), one-hot encoded, and summed back per patent.
# groupby(level=0).sum() replaces sum(level=0), which was deprecated and then
# removed in pandas 2.0; sort=False keeps the rows in their order of appearance.
df_labels_sc = (
    pd.get_dummies(
        df_copie.classification.apply(pd.Series).stack(),
        prefix="classification",
    )
    .groupby(level=0, sort=False)
    .sum()
)

In [None]:
# Copy of the publications without the text and metadata columns, indexed by
# the "number" column.
df_labels = df_copie.drop(
    columns=[
        "kind",
        "date",
        "information",
        "classification",
        "claim",
        "description",
        "title",
        "abstr",
    ]
).set_index("number")

In [None]:
import plotly.graph_objects as go
# Per-class totals (column sums of the label matrix) and the matching names.
Ylist = df_labels_sc.sum(axis=0).values
Xlist = df_labels_sc.sum(axis=0).index.values

# Rank the classes from most to least frequent and keep the first 50.
inds = Ylist.argsort()[::-1]
sortedX = Xlist[inds[:50]]
sortedY = Ylist[inds[:50]]

# Line plot of the counts, with the class names as a categorical x axis and
# both grids hidden.
fig = go.Figure()
fig.add_trace(go.Scatter(x=sortedX, y=sortedY))
fig.update_layout(
    xaxis=dict(showgrid=False, type='category'),
    yaxis=dict(showgrid=False),
)

fig.show()

In [None]:
# Publications joined with their label columns; rows with any missing value
# are discarded, then the columns not used for modelling are dropped.
df_classification_h = (
    df_copie.join(df_labels_sc)
    .dropna()
    .drop(columns=["kind", "date", "information", "classification", "abstr"])
)

## Utilisation d'un modèle avec LSTM

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

In [None]:
def preprocess_text(sen):
    """Normalize a sentence to letters separated by single spaces.

    Three substitutions are applied in order:
      1. every character that is not an ASCII letter becomes a space;
      2. a single letter surrounded by whitespace is replaced by one space;
      3. every run of whitespace collapses to one space.

    Leading/trailing spaces are not stripped and case is left unchanged.
    """
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )
    cleaned = sen
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# Cleaned titles of the rows that survived the dropna() filter.
sentences = list(df_classification_h["title"])
X = [preprocess_text(sen) for sen in sentences]

# Take the labels from the same filtered frame so that X and y always have
# the same length and row order.  df_labels_sc itself still contains the rows
# that dropna() removed, so using it directly can misalign titles and labels.
y = df_classification_h[df_labels_sc.columns].values

In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Every title is padded (at the end) or cut to this many token ids.
maxlen = 200

# Fit the word index on the training titles only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# One extra slot on top of the entries of the fitted word index.
vocab_size = len(tokenizer.word_index) + 1

# Convert both splits to integer sequences and pad them to `maxlen`.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=maxlen)
    for texts in (X_train, X_test)
)

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Map each word of the GloVe file to its vector.
embeddings_dictionary = dict()

# The context manager closes the file even if a line fails to parse.
with open('/content/drive/Shareddrives/ING3 IA: Use Case 1 (NLP Patent)/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # A blank line carries no word; skip it instead of failing on records[0].
            continue
        # First token is the word, the remaining tokens are its coordinates.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the vector of the word whose tokenizer index is i; words absent
# from the GloVe file keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Padded token ids -> frozen pre-computed embeddings -> LSTM -> one sigmoid
# unit per label.
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
# The output width follows the label matrix instead of a hard-coded 22, so the
# model still matches the targets when the number of classes changes.
dense_layer_1 = Dense(y_train.shape[1], activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
from keras.utils import plot_model
# Draw the model graph, annotated with layer names and tensor shapes.
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Train for 20 epochs with batches of 128, keeping 20% of the training samples
# aside for validation; `history` records the per-epoch metrics.
history = model.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=0.2)

In [None]:
# Evaluate on the held-out test split; the first two entries of `score` are
# reported as the test score and the test accuracy.
score = model.evaluate(X_test, y_test, verbose=1)

test_score, test_accuracy = score[0], score[1]
print("Test Score:", test_score)
print("Test Accuracy:", test_accuracy)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric, each showing the training curve and the curve
# measured on the validation split.  The second curve comes from the
# `val_*` history entries (the validation_split of fit), not from the test
# set, so the legend now says "validation" instead of "test".
for train_key, val_key, title, ylabel in (
    ('acc', 'val_acc', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

## Utilisation de BERT pour le second modèle

In [None]:
!pip install transformers

In [None]:
import torch
import transformers
from sklearn import metrics
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
# Use the GPU when torch reports one as available, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Two-column frame for the BERT model: the title and its label vector.
new_df = df_classification_h[["title"]].copy()
# Read the labels from the same dropna()-filtered frame as the titles.
# df_labels_sc still contains the rows that were filtered out, so its values
# can differ in length (assignment error) or row order from the titles.
new_df["list"] = list(df_classification_h[df_labels_sc.columns].values)
new_df.columns = ["comment_text", "list"]
new_df

In [None]:
# Configuration of the BERT fine-tuning run.

# Constants read by the dataset, data loader, optimizer and training cells.
MAX_LEN = 200  # token length each encoded title is truncated/padded to
TRAIN_BATCH_SIZE = 8  # batch size of the training loader
VALID_BATCH_SIZE = 4  # batch size of the evaluation loader
EPOCHS = 4  # number of passes over the training loader
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer
# NOTE: this rebinds `tokenizer`, which earlier in the file referred to the
# Keras Tokenizer used by the LSTM model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=False)

In [None]:
class CustomDataset(Dataset):
    """Dataset yielding one tokenized text and its label vector per item.

    Args:
        dataframe: pandas frame with a ``comment_text`` column (the texts) and
            a ``list`` column (one label vector per row).
        tokenizer: object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length each encoded text is truncated or padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        # Bracket access: "list" is a column name here, not the builtin.
        self.targets = self.data["list"]
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # .iloc makes the lookup positional, so the dataset also works when
        # the frame's index is not exactly 0..n-1 (e.g. after filtering or
        # sampling without reset_index).
        comment_text = str(self.comment_text.iloc[index])
        # Collapse every run of whitespace into a single space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            # padding='max_length' is the supported spelling of the
            # deprecated pad_to_max_length=True.
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }


In [None]:
# Creating the dataset and dataloader for the neural network

# Random 80/20 split of new_df; both parts get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shape of the full frame and of each part.
for template, frame in (
    ("FULL Dataset: {}", new_df),
    ("TRAIN Dataset: {}", train_dataset),
    ("TEST Dataset: {}", test_dataset),
):
    print(template.format(frame.shape))

# Wrap each part in a tokenizing dataset.
training_set, testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_dataset, test_dataset)
)

In [None]:
# Loader settings: both loaders shuffle and load in the main process
# (num_workers=0); only the batch size differs.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Number of labels, read off the length of the first label vector.
dimensions = len(new_df["list"].iloc[0])

In [None]:
# Custom classifier: a dropout and a linear layer stacked on top of the
# pre-trained 'bert-base-uncased' model.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer of `dimensions` outputs."""

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, dimensions)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) outputs of the linear layer."""
        # The encoder returns a pair; only its second element is used.
        _unused, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        return self.l3(self.l2(pooled))

# Build the classifier and move its parameters to the selected device.
# NOTE: this rebinds `model`, which earlier in the file referred to the Keras
# LSTM model.
model = BERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Return the BCE-with-logits loss between raw model outputs and targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Adam over all parameters of the model, using the configured learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one pass over `training_loader`, updating `model` in place.

    Args:
        epoch: epoch number, used only in the progress message.

    The loss of every 100th batch (starting with the first) is printed.
    """
    model.train()
    # A named counter replaces `_`, which conventionally marks an unused value.
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 100 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Gradients are cleared once per step; the second, redundant
        # zero_grad() call of the original was removed.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Fine-tune for the configured number of epochs.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def validation(epoch):
    """Run `model` over `testing_loader` without gradient tracking.

    Args:
        epoch: accepted for symmetry with `train`; not used.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per sample,
        holding the sigmoid of the model outputs and the target vectors.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            # Bring the batch back to the CPU and store it as plain lists.
            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [None]:
# Predictions on the evaluation loader, binarized at a 0.5 threshold.
outputs, targets = validation(0)
outputs = np.array(outputs) >= 0.5

# Accuracy plus the micro- and macro-averaged F1.
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Weighted- and sample-averaged F1.
f1_score_weighted = metrics.f1_score(targets, outputs, average='weighted')
print(f"F1 Score (Weighted) = {f1_score_weighted}")

f1_score_samples = metrics.f1_score(targets, outputs, average='samples')
print(f"F1 Score (Samples) = {f1_score_samples}")

# Un-averaged F1: one score per label, printed next to the label column name.
f1_score = metrics.f1_score(targets, outputs, average=None)
for category, score in zip(df_labels_sc.columns, f1_score):
    print(f"F1 Score ({category}) = {score}")