In [1]:
import random
import numpy as np
import pandas as pd

random.seed(3311791)
np.random.seed(3311791)

### Loading Data and preprocessing

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/pilot_s.csv
/kaggle/input/english_li_ion_patents.csv


In [4]:
file_path = '/kaggle/input/english_li_ion_patents.csv'

data = pd.read_csv(file_path)

In [5]:
df = data.copy(deep=True)
pattern = r'(?s)\[EN\](.*?)(?:\[[A-Z]{2}\]|$)' # anything between [EN] and ([any capital two letter combination] or end of string)

df['Claim_EN'] = df['Claims'].str.extract(pattern)
df['Abstract_EN'] = df['Abstract'].str.extract(pattern)
df['Title_EN'] = df['Title'].str.extract(pattern)

df['Application.Date'] = pd.to_datetime(df['Application.Date'])
df['Year'] = df['Application.Date'].dt.year

In [6]:
df['Text'] = df['Title_EN'].str.cat(df['Abstract_EN'], sep=' ').str.cat(df['Claim_EN'], sep=' ')

In [7]:
# Split the 'Cooperative.Patent.Class' column into lists
df['CPC'] = df['Cooperative.Patent.Class'].str.split('\r\n')


In [8]:
df['CPC_short'] = df['CPC'].apply(lambda x: [i[:1] for i in x] if isinstance(x, list) else x)
df['CPC_short']

0                        [H, H, H, H, H, H, H, H, H, Y, Y]
1                        [H, H, H, H, H, H, H, H, H, Y, Y]
2        [H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, ...
3        [H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, ...
4                        [H, H, H, H, H, H, H, H, H, Y, Y]
                               ...                        
57047                          [H, H, H, H, H, H, H, Y, Y]
57048                          [H, H, H, H, H, H, H, Y, Y]
57049                          [H, H, H, H, H, Y, Y, Y, Y]
57050                          [H, H, H, H, H, H, Y, Y, Y]
57051                       [H, H, H, H, H, H, Y, Y, Y, Y]
Name: CPC_short, Length: 57052, dtype: object

In [9]:
df['CPC_unique'] = df['CPC_short'].apply(lambda x: list(set(x)) if isinstance(x, list) else x)
df.copy().explode('CPC_unique').shape

(138549, 23)

All patents have or should have H (Electricity class) in their CPC codes, as this is what was filtered for in the patent data search.

In [10]:
df['CPC_unique']

0        [Y, H]
1        [Y, H]
2        [Y, H]
3        [Y, H]
4        [Y, H]
          ...  
57047    [Y, H]
57048    [Y, H]
57049    [Y, H]
57050    [Y, H]
57051    [Y, H]
Name: CPC_unique, Length: 57052, dtype: object

In [11]:
one_hot_encoded = pd.get_dummies(df['CPC_unique'].apply(pd.Series).stack()).groupby(level=0).sum()

df = pd.concat([df, one_hot_encoded], axis=1)
df

Unnamed: 0.1,Unnamed: 0,Application.Date,Country.Code,Probable.Patent.Assignee,Family.Number,Title,Priority.Dates,Abstract,Claims,Number.of.Forward.Citations,...,CPC_unique,A,B,C,D,E,F,G,H,Y
0,1,2014-02-06,AU,ENCELL TECHNOLOGY INC,57136128,[EN] BATTERY COMPRISING A COATED IRON ANODE AN...,2013-02-06\r\n2014-01-14\r\n2014-01-14\r\n2014...,[EN] The present invention provides one with a...,[EN] What is claimed is: 1. A battery comprisi...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2,2014-02-06,AU,ENCELL TECHNOLOGY INC,57136128,[EN] BATTERY COMPRISING A COATED IRON ANODE AN...,2013-02-06\r\n2014-01-14\r\n2014-01-14\r\n2014...,[EN] The present invention provides one with a...,[EN] - 11 The Claims Defining the Invention ar...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,3,2014-02-06,AU,ENCELL TECHNOLOGY INC,57136128,[EN] PROCESS FOR FORMING A BATTERY CONTAINING ...,2013-02-06\r\n2013-11-08\r\n2014-02-06,[EN] Provided is a process for activating a ba...,[EN] What is claimed is: 1. A process for acti...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,4,2014-02-06,AU,ENCELL TECHNOLOGY INC,57136128,[EN] PROCESS FOR FORMING A BATTERY CONTAINING ...,2013-02-06\r\n2013-11-08\r\n2014-02-06,[EN] Provided is a process for activating a ba...,[EN] - 11 The Claims Defining The Invention Ar...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,5,2014-02-06,CA,ENCELL TECHNOLOGY INC,57136128,[EN] BATTERY COMPRISING A COATED IRON ANODE AN...,2013-02-06\r\n2014-01-14\r\n2014-01-14\r\n2014...,[EN] Iron electrodes have been used in energy ...,[EN] What is claimed is:1. A battery comprisin...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57047,226103,2020-08-18,EP,FUJIFILM HOLDINGS CORP,83545109,[EN] METHOD FOR PRODUCING MOLDED BODY FOR ELEC...,2019-08-19\r\n2020-08-18,[EN] A method of manufacturing a formed body f...,[EN] 1. A method of manufacturing a formed bod...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
57048,226107,2021-12-28,US,FUJIFILM HOLDINGS CORP,83545109,[EN] METHOD OF MANUFACTURING FORMED BODY FOR E...,2019-08-19\r\n2020-08-18\r\n2021-12-28,[EN] A method of manufacturing a formed body f...,[EN] 1. A method of manufacturing a formed bod...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
57049,226114,2017-01-11,US,FUJIFILM HOLDINGS CORP,61604174,"[EN] ALL SOLID-STATE SECONDARY BATTERY, INORGA...",2014-07-31\r\n2015-07-29\r\n2017-01-11,[EN] Provided are an all solid-state secondary...,[EN] 1. An all solid-state secondary battery c...,0,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
57050,226115,2017-01-11,US,FUJIFILM HOLDINGS CORP,61604174,"[EN] ALL SOLID-STATE SECONDARY BATTERY, INORGA...",2014-07-31\r\n2015-07-29,[EN] Provided are an all solid-state secondary...,[EN] 1. An all solid-state secondary battery c...,12,...,"[Y, H]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [12]:
!pip install miceforest


Collecting miceforest
  Downloading miceforest-5.7.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting blosc (from miceforest)
  Downloading blosc-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading miceforest-5.7.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.2/58.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blosc-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: blosc, miceforest
Successfully installed blosc-1.11.1 miceforest-5.7.0


In [13]:
variables_mf = ['Country.Code', 'Probable.Patent.Assignee', 'Family.Number', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'Y'] 

print(df.isna().sum())
df.dropna(subset=['Application.Date'], inplace=True)


Unnamed: 0                        0
Application.Date                  1
Country.Code                      0
Probable.Patent.Assignee          2
Family.Number                     0
Title                             0
Priority.Dates                    5
Abstract                          0
Claims                            0
Number.of.Forward.Citations       0
Cooperative.Patent.Class       2155
Grant                             0
Abstract.Languages                0
Claims.Languages                  0
Title.Languages                   0
Claim_EN                          0
Abstract_EN                       0
Title_EN                          0
Year                              1
Text                              0
CPC                            2155
CPC_short                      2155
CPC_unique                     2155
A                              2155
B                              2155
C                              2155
D                              2155
E                           

In [14]:
df['Probable.Patent.Assignee'].value_counts()

Probable.Patent.Assignee
LG ENERGY SOLUTION LTD                           7358
SAMSUNG SDI CO LTD                               4125
TOYOTA JIDOSHA KK                                1449
ROBERT BOSCH GMBH                                1395
SANYO ELECTRIC CO LTD                             978
                                                 ... 
YANG JERRY                                          1
COVESTRO INTELLECTUAL PROPERTY GMBH AND CO KG       1
ORCA SCIENCES LLC                                   1
L TEC BATTERY GMBH                                  1
SEBITCHEM                                           1
Name: count, Length: 3130, dtype: int64

In [15]:
import numpy as np

counts = df['Probable.Patent.Assignee'].value_counts()
df['Probable.Patent.Assignee'] = df['Probable.Patent.Assignee'].map(lambda x: 'single_patent_in_dataset' if x in counts and counts[x] == 1 and x is not np.nan else x)


In [16]:
columns = ['Country.Code', 'Probable.Patent.Assignee', 'Family.Number']
df_impute = df.copy()
for col in columns:
    counts = df_impute[col].value_counts()
    df_impute[col] = df_impute[col].map(lambda x: 'rare_occurrence' if pd.notnull(x) and counts[x] <= 100 else x)
    df_impute[col] = df_impute[col].map(lambda x: str(x) if pd.notnull(x) else x)


In [17]:
import miceforest as mf
variables_mf = ['Country.Code', 'Probable.Patent.Assignee', 'Family.Number', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'Y'] 

df_impute[variables_mf] = df_impute[variables_mf].astype('category')

kernel = mf.ImputationKernel(
  data=df_impute[variables_mf],
  save_all_iterations=True,
  random_state=0
)
kernel.mice(5, verbose=True, min_data_in_leaf=20, num_iterations=5)
df_impute[variables_mf] = kernel.complete_data()

df.loc[df['Probable.Patent.Assignee'].isna(), 'Probable.Patent.Assignee'] = df_impute.loc[df['Probable.Patent.Assignee'].isna(), 'Probable.Patent.Assignee']
for col in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Y']:
    df[col] = df_impute[col]

  warn(


Initialized logger with name mice 1-5
Dataset 0
1  | Probable.Patent.Assignee | A | B | C | D | E | F | G | Y
2  | Probable.Patent.Assignee | A | B | C | D | E | F | G | Y
3  | Probable.Patent.Assignee | A | B | C | D | E | F | G | Y
4  | Probable.Patent.Assignee | A | B | C | D | E | F | G | Y
5  | Probable.Patent.Assignee | A | B | C | D | E | F | G | Y


In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import words
import re
nltk.download('stopwords')
nltk.download('punkt')
import spacy
nltk.download('words')

english_words = set(words.words())
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Lemmatize, remove stopwords and punctuation

In [19]:
import multiprocessing as mp
from multiprocessing import Pool

custom_stopwords = {'lithium', 'ion', 'battery', 'cell', 'said', 'wherein', 'herein', 'one', 'claim', 'claims', 'comprise', 'comprising', 'say', 'first', 'second', 'third'}
stop_words = set(stopwords.words('english')).union(custom_stopwords)

def preprocess(text):
    doc = nlp(text)
    text = " ".join([token.lemma_.lower() for token in doc if token.is_alpha and token.pos_ in ['NOUN', 'PROPN'] and token.lemma_.lower() not in stop_words]) # lemmatize if alphanumeric and is a noun or proper noun and not in stopwords
    return [word for word in text.split() if len(word) >= 2 and word in english_words]

def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def apply_preprocess(df):
    df['Text_Lemma'] = df['Text'].apply(preprocess)
    return df

df = parallelize_dataframe(df, apply_preprocess)
df['Text_Lemma']
df['Text_Lemma_unlist'] = df['Text_Lemma'].apply(' '.join)
df.to_csv('df_lemma.csv', index=False)


  return bound(*args, **kwds)


In [20]:
df['Text_Lemma']

0        [iron, anode, performance, invention, iron, an...
1        [iron, anode, performance, invention, iron, an...
2        [process, iron, process, iron, electrode, proc...
3        [process, iron, process, iron, electrode, proc...
4        [iron, anode, performance, iron, electrode, en...
                               ...                        
57047    [method, body, method, body, electrode, step, ...
57048    [method, formed, body, method, body, electrode...
57049    [solid, state, secondary, inorganic, solid, el...
57050    [solid, state, secondary, inorganic, solid, el...
57051    [solid, state, secondary, inorganic, solid, el...
Name: Text_Lemma, Length: 57051, dtype: object

In [21]:
df['Text_Lemma_unlist'] = df['Text_Lemma'].apply(' '.join)

In [22]:
df.to_csv('df_lemma_english_words.csv', index=False)


In [24]:
import pandas as pd

df = pd.read_csv('/kaggle/working/df_lemma.csv')

df = df.drop(['Priority.Dates', 'Unnamed: 0', 'Abstract', 'Claims', 'Cooperative.Patent.Class', 'Abstract.Languages', 'Claims.Languages', 'Title.Languages', 'Claim_EN',
       'Abstract_EN', 'Title_EN', 'CPC', 'CPC_short',
       'CPC_unique'], axis=1)

df.to_csv('/kaggle/working/df_lemma_dropped.csv', index=False)
