# **Notebook C**: Patent Classification with CNN
----



# C.1. Load Packages
---

In [5]:
# General Packages #
import os
import pandas as pd
import numpy as np

# Load TQDM to Show Progress Bars #
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix

# Keras Packages #

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Dropout, Activation, GlobalMaxPooling1D, LSTM, Bidirectional, BatchNormalization

In [6]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Mount Personal Google Drive on own Machine -- You have to follow the link to log in #
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# C.2. Load Training Data ##
----------------

We are going to use the data on the Google drive. This is in a csv file, and so we are going to load the data as a dataframe, and then convert the main data (Text Ids, Indicator for Tech / Non-Tech, Text Abstract) from a Pandas DataFrame to a list (which is more easily used in later sections).

In [8]:
# Change to Working Directory with Training Data #
os.chdir("/content/drive/MyDrive/Power-data-main/")

# Load Training Data #
file_path = 'Training Data/power_data.tsv'

# Read the TSV file
df = pd.read_csv(file_path, sep='\t')

# Extract the required columns: id, cleaned_abstract, and actual
IDs = np.array(df['id'].values.tolist())
Abstract_Text = df['abstract'].values.tolist()
Classes = df['actual'].values.tolist()

In [9]:
import jieba

def text_cleaner(text):
    words = jieba.lcut(text)
    return ' '.join(words)

# Clean the texts
cleaned_texts = [text_cleaner(text) for text in Abstract_Text]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.688 seconds.
DEBUG:jieba:Loading model cost 0.688 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [10]:
# Tokenizer

n_words = 2000

# Define Tokenizer Function #

tokenizer = Tokenizer(num_words=n_words,
                      lower = True,
                      filters='',  # 不需要过滤任何符号
                      char_level=False)
tokenizer.fit_on_texts(cleaned_texts)
sequences = tokenizer.texts_to_sequences(cleaned_texts)
vocab_size = len(tokenizer.word_index) + 1  # 计算词汇表大小

Once we have the list of words that occur in our corpus of abstracts (i.e. word index), then we can try to map those words to embedding vectors. Below we define the functions that will go through each of the words in our word index and extract the coresponding embedding vector and save it to an embedding matrix that will be used as a layer in a subsequent convolutional neural network (CNN) model.

In [11]:
# Create Embedding Matrix by Loading Embedding File and Mapping it to Word Index #
maxlen = 200
X = pad_sequences(sequences, padding='post', maxlen=maxlen)
y = np.array(Classes)

# Function to create embedding matrix
def create_embedding_matrix(filepath, word_index, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [12]:
# Define Different Models with Embeddings #

CLASSIFIERS = [
               ['No Embeddings', 'NONE', 50],
               ['Chinese Word2Vec', './Embeddings/cn.word2vec.300.vec.iter5', 300],
               ['FastText', './Embeddings/FastText.cc.zh.300.vec', 300],
               ['Power data Word2Vec','./Embeddings/Powerdata.word2vec.300.vec.txt', 300]
               ]

In [13]:
# Parameters
batch_size = 50
epochs = 20
NUM_OF_SPLITS = 5

# Results storage
RESULTS = []
Classified_Values =[]
Classified_Values_p =[]

for name, path, embedding_dim in tqdm_notebook(CLASSIFIERS, desc='Loop Through Embeddings'):

    # Load Embedding Matrix
    if path != "NONE":
        embedding_matrix = create_embedding_matrix(path, tokenizer.word_index, embedding_dim)
    else:
        embedding_matrix = np.random.random((vocab_size, embedding_dim))

    y_actual, y_predicted, id_s = [], [], []

    for train, test in tqdm_notebook(StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True).split(X, y), desc='Cross-Validating'):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        # Define the model
        model = Sequential()
        if path == "NONE":
            model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, trainable=True))
        else:
            model.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False))

        model.add(Dropout(0.2))
        model.add(layers.Conv1D(64, 2, activation='relu'))
        model.add(layers.MaxPooling1D(4))
        model.add(layers.LSTM(100))
        model.add(BatchNormalization())
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

        # Train the model
        model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=False)

        y_pred_p = model.predict(X_test)
        y_pred = (y_pred_p > 0.5).astype(int)

        # Store results
        y_actual += list(y_test)
        y_predicted += list(y_pred)
        id_s += list(np.array(IDs)[test])

    # Compuate the Share of Tech Texts #
    Share = np.round(np.mean(y_predicted), 3)
    # Evaluate the model
    Accuracy = accuracy_score(y_actual, y_predicted)
    ROC = roc_auc_score(y_actual, y_predicted)
    Precision = precision_score(y_actual, y_predicted)
    Recall = recall_score(y_actual, y_predicted)
    F1 = f1_score(y_actual, y_predicted)
    CM = confusion_matrix(y_actual, y_predicted)

    FN = np.round(CM[0][0] / (CM[0][0] + CM[1][0]), 3)
    FP = np.round(CM[0][1] / (CM[0][1] + CM[1][1]), 3)
    TN = np.round(CM[1][0] / (CM[0][0] + CM[1][0]), 3)
    TP = np.round(CM[1][1] / (CM[0][1] + CM[1][1]), 3)

    # Add Classification Performance Metrics to List#
    RESULTS.append([name, Share, TP, FN, FP, TN,
                                          np.round(Accuracy, 3),
                                          np.round(ROC, 3),
                                          np.round(Precision, 3),
                                          np.round(Recall, 3),
                                          np.round(F1, 3)])

    # Store classification results
    Classified_Values.append(list(zip(len(id_s)*[name], id_s, y_actual, y_predicted)))
##%%
# Convert List to Dataframe #
RESULTS_TABLE = pd.DataFrame(RESULTS, columns = ["Name", "Share", "True-Positives",
                                                 "False-Negatives", "False-Positives",
                                                 "True-Negatives","Accuracy", "AUC",
                                                 "Precision", "Recall", "F1"] )

RESULTS_TABLE["Type"] = "CNN"
RESULTS_TABLE = RESULTS_TABLE[["Name", "Type", "Share", "True-Positives",
                               "False-Negatives", "False-Positives",
                               "True-Negatives","Accuracy", "AUC",
                               "Precision", "Recall", "F1"]]

# Output Results #
RESULTS_TABLE.sort_values("Accuracy", ascending = False ).to_csv("./Output/Model Performance/CNN Model Classification Performance.csv")

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy #
RESULTS_TABLE.sort_values("Accuracy", ascending = False )


Loop Through Embeddings:   0%|          | 0/4 [00:00<?, ?it/s]

Cross-Validating: 0it [00:00, ?it/s]

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


Cross-Validating: 0it [00:00, ?it/s]

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


Cross-Validating: 0it [00:00, ?it/s]

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


Cross-Validating: 0it [00:00, ?it/s]

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


Unnamed: 0,Name,Type,Share,True-Positives,False-Negatives,False-Positives,True-Negatives,Accuracy,AUC,Precision,Recall,F1
3,Power data Word2Vec,CNN,0.46,0.861,0.94,0.139,0.06,0.904,0.906,0.861,0.925,0.892
1,Chinese Word2Vec,CNN,0.435,0.871,0.911,0.129,0.089,0.894,0.892,0.871,0.883,0.877
2,FastText,CNN,0.418,0.886,0.899,0.114,0.101,0.894,0.89,0.886,0.864,0.875
0,No Embeddings,CNN,0.482,0.788,0.905,0.212,0.095,0.849,0.854,0.788,0.886,0.834


In [14]:
# Convert List to Dataframe #
RESULTS_TABLE = pd.DataFrame(RESULTS, columns = ["Name", "Share", "True-Positives",
                                                 "False-Negatives", "False-Positives",
                                                 "True-Negatives","Accuracy", "AUC",
                                                 "Precision", "Recall", "F1"] )

RESULTS_TABLE["Type"] = "CNN"
RESULTS_TABLE = RESULTS_TABLE[["Name", "Type", "Share", "True-Positives",
                               "False-Negatives", "False-Positives",
                               "True-Negatives","Accuracy", "AUC",
                               "Precision", "Recall", "F1"]]

# Output Results #
RESULTS_TABLE.sort_values("Accuracy", ascending = False ).to_csv("./Output/Model Performance/CNN Model Classification Performance.csv")

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy #
RESULTS_TABLE.sort_values("Accuracy", ascending = False )

Unnamed: 0,Name,Type,Share,True-Positives,False-Negatives,False-Positives,True-Negatives,Accuracy,AUC,Precision,Recall,F1
3,Power data Word2Vec,CNN,0.46,0.861,0.94,0.139,0.06,0.904,0.906,0.861,0.925,0.892
1,Chinese Word2Vec,CNN,0.435,0.871,0.911,0.129,0.089,0.894,0.892,0.871,0.883,0.877
2,FastText,CNN,0.418,0.886,0.899,0.114,0.101,0.894,0.89,0.886,0.864,0.875
0,No Embeddings,CNN,0.482,0.788,0.905,0.212,0.095,0.849,0.854,0.788,0.886,0.834


In [None]:
# Output Classification Results for Training Dataset -- PREDICTED VALUES -- Out Of Sample (Holdout) Prediction #

for i in range(0,len(Classified_Values), 1):

  Temp = pd.DataFrame(  Classified_Values[i],
                        columns = ['Model', 'id', 'Actual', 'Predicted'] )

  if i == 0:
    name = Temp.head(1)['Model'][0]
    Temp = Temp[['id', 'Actual', 'Predicted']]
    Temp.columns = ['id', 'Actual', name]
    Final = Temp

  else:

    name = Temp.head(1)['Model'][0]
    Temp = Temp[['id', 'Predicted']]
    Temp.columns = ['id', name]

    Final = Final.merge(Temp, on = ['id'])

# Save Data Frame #
Final.to_csv("./Output/Classification Output/CNN Classification Results.csv")