In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import missingno as msno
import plotly.graph_objects as go
from datetime import date
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# topic modeling packages 
import nltk
#nltk.download('stopwords')
import re
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel# spaCy for preprocessing
from gensim import similarities
import spacy# Plotting tools
import pyLDAvis

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)

## 1. Tech Dataset

In [99]:
df_tech = pd.DataFrame(pd.read_csv("Tech_cleaned.csv"))
print("Technology Shape:", df_tech.shape)

Technology Shape: (3909, 86)


### 1.1 Feature Engineering

In [100]:
# Filter to only patent IP_Type
df_tech = df_tech.loc[df_tech['IP_Type']=='Patent'].reset_index()
df_tech.shape

(2227, 87)

In [101]:
# Only keeping major divisions with sufficient number of patents; in case of multiple divisions, search through the
# list of divisions in the order of the following dict
division_mapping = {'PSD':'PSD', 
                    'BSD':'BSD', 
                    'PME':'PME', 
                    'Argonne National Laboratory':'ANL', 
                    'Marine Biological Laboratory':'MBL', 
                    'Booth':'Booth', 
                    'University of Chicago Hospital':'UCH', 
                    'SSD':'SSD', 
                    'Comprehensive Cancer Center':'CCC', 
                    'University of Chicago':'UC',
                    'Toyota Technological Institute':'TTI', 
                    'Humanities':'Humanities', 
                    'Harris':'Harris',
                    'Institute of Politics':'Politics'}


In [102]:
df_tech.loc[df_tech['Division_Department'].isnull()] = 'NA'

In [103]:
df_tech['Primary_Division'] = 'Others'
df_tech.loc[df_tech['Division_Department']=='NA', 'Primary_Division'] = 'NA'

In [104]:
for i in range(len(df_tech)):
    for key in division_mapping:
        if key in df_tech['Division_Department'][i]:
            df_tech['Primary_Division'][i] = division_mapping.get(key)
            break;

In [105]:
df_tech[['Division_Department','Primary_Division']].head(15)

Unnamed: 0,Division_Department,Primary_Division
0,"Physical Sciences Division (PSD), Physics",PSD
1,"Physical Sciences Division (PSD), Physics",PSD
2,"Physical Sciences Division (PSD), Physics",PSD
3,"Physical Sciences Division (PSD), Physics",PSD
4,"Physical Sciences Division (PSD), Physics",PSD
5,"Physical Sciences Division (PSD), Physics",PSD
6,"Biological Sciences Division (BSD), Pediatrics",BSD
7,"Biological Sciences Division (BSD), Comprehens...",BSD
8,"Biological Sciences Division (BSD), Pathology",BSD
9,"Biological Sciences Division (BSD), Comprehens...",BSD


In [106]:
df_tech['Primary_Division'].value_counts()/len(df_tech)*100

BSD           64.077234
PSD           18.679838
NA             9.160305
PME            3.502470
ANL            2.065559
MBL            0.898069
Humanities     0.449035
SSD            0.269421
UCH            0.224517
Booth          0.179614
UC             0.134710
Harris         0.134710
TTI            0.089807
CCC            0.089807
Others         0.044903
Name: Primary_Division, dtype: float64

In [107]:
# Any Divisions with less than 0.5% of total records will be consolidated into 'Other' category 
other = ('Humanities', 'SSD', 'UCH', 'Booth', 'UC', 'Harris', 'TTI', 'CCC', 'Others')

df_tech.loc[df_tech['Primary_Division'].isin(other), 'Primary_Division'] = 'Other'

# Confrim changes, % breakout
df_tech['Primary_Division'].value_counts()/len(df_tech)*100

BSD      64.077234
PSD      18.679838
NA        9.160305
PME       3.502470
ANL       2.065559
Other     1.616524
MBL       0.898069
Name: Primary_Division, dtype: float64

In [108]:
# Ensure all description columns are listed as null and not blank strings (this will impact column merging)
df_tech['Brief_Technology_Description'] = df_tech.Brief_Technology_Description.replace(r'^\s*$', np.nan, regex=True)
df_tech['Assessment_Description'] = df_tech.Assessment_Description.replace(r'^\s*$', np.nan, regex=True)
df_tech['Abstract'] = df_tech.Abstract.replace(r'^\s*$', np.nan, regex=True)

# Some of the descriptions include "See ...". We want to remove these, as they do not provide any insight
df_tech['Brief_Technology_Description'] = df_tech.Brief_Technology_Description.replace(r'^SEE .*', np.nan, regex=True)
df_tech['Brief_Technology_Description'] = df_tech.Brief_Technology_Description.replace(r'^See .*', np.nan, regex=True)

df_tech['Assessment_Description'] = df_tech.Assessment_Description.replace(r'^SEE .*', np.nan, regex=True)
df_tech['Assessment_Description'] = df_tech.Assessment_Description.replace(r'^See .*', np.nan, regex=True)

df_tech['Abstract'] = df_tech.Abstract.replace(r'^SEE .*', np.nan, regex=True)
df_tech['Abstract'] = df_tech.Abstract.replace(r'^See .*', np.nan, regex=True)


In [109]:
# Merge Abstract, Assessment_Description, and Brief_Technology_Description together to populate null values in each
# Brief_Technology_Description has the most information, so we will use this as the base column 
df_tech.Brief_Technology_Description = df_tech.Brief_Technology_Description.fillna(df_tech.Assessment_Description)
df_tech.Brief_Technology_Description = df_tech.Brief_Technology_Description.fillna(df_tech.Abstract)
df_tech.Brief_Technology_Description = df_tech.Brief_Technology_Description.fillna(df_tech.Title)

del df_tech['Assessment_Description']
del df_tech['Abstract']

In [110]:
# Confrirm we are not seeing any common issues in Breif_Technology_Description field (NA will be removed later)
df_tech.Brief_Technology_Description.value_counts().sort_values(ascending=False).nlargest(10)

NA                                                                                    204
B:HB                                                                                   14
Use and/or embodiment of SlipChip.                                                     12
A New Monofunctional Phosphonic Acid Ion Exchange Resin                                 2
Serotonin receptor antagonists as highly selective and fast-acting antidepressants      2
Scripps is lead                                                                         2
Surveillance of Industrial Processes with Correlated Parameters                         2
A compact bellows-driven diamond anvil cell for high-pressure, low-temperature          2
B:SP                                                                                    2
System for Monitoring an Industrial Process and Determining Sensor Status               2
Name: Brief_Technology_Description, dtype: int64

### 1.2 Filtering

In [111]:
#Filter to patents that we are the leading institution
df_tech = df_tech.loc[df_tech['We_are_not_the_lead_institution']=='No']

### Create License_Status column based on Guidance from Polsky Center:

In [112]:
# Generate lists for the will never be licensed, could be licensed, igone, and licensed statuses
licensed = ('Non-Exclusively Licensed', 'Exclusively Licensed', 'Optioned','Seeking Licensees', 'Post Election Hold', 'IP Authorized', 'Pending Title Election Decision')
never_licensed = ('Closed/Inactive', 'Waived Rights to Inventor', 'Awaiting Expiration','Licenses at Potential', 'IIA - Other Party Leads', 'Jointly Owned - UoC Leads', 'Combined with other Tech', 'Jointly Owned - Other Party Leads', 'Awaiting Info from Inventors', 'Negotiating License')

df_tech.loc[df_tech['Status'].isin(licensed), 'License_Status'] = 'license'
df_tech.loc[df_tech['Status'].isin(never_licensed), 'License_Status'] = 'no_license'

df_tech.License_Status.value_counts()

no_license    1360
license        544
Name: License_Status, dtype: int64

### 1.3 Drop Columns

In [113]:
columns_keep = ['Tech ID',
'Title',
'Lead_Inventor',
'Disclosure_Date',
'Division_Department',
'Owners',
'Ability_of_investigator_to_continue_research',
'Ability_to_advance_the_project_outside_the_lab',
'Abstract',
'Assessment_Description',
'Brief_Technology_Description',
'Compelling_nature_of_data',
'Detectability_of_infringement_and_enforceability',
'Development_and_regulatory_path_for_the_product',
'Freedom-to-operate_FTO_issues',
'Historical_cooperation_or_not_of_investigator',
'Identity_of_the_eventual_product',
'Impact_of_patent_on_adoption_of_technology',
'Industrial_startup_co-ownership_of_the_IP',
'Institution',
'Licensing_interest_by_a_specific_company',
'Market_feedback',
'Market_Size',
'Nature_of_improvement_over_existing_art',
'Patentability_questions',
'Risk_cost_sharing_w_other_institution',
'Size_of_Market',
'Stage_of_research',
'License_Status']

In [114]:
df_tech_keep = df_tech[['Tech_ID',
'Title',
'Lead_Inventor',
'Disclosure_Date',
'Division_Department',
'Primary_Division',
'Owners',
'Ability_of_investigator_to_continue_research',
'Ability_to_advance_the_project_outside_the_lab',
'Brief_Technology_Description',
'Compelling_nature_of_data',
'Detectability_of_infringement_and_enforceability',
'Development_and_regulatory_path_for_the_product',
'Freedom-to-operate_FTO_issues',
'Historical_cooperation_or_not_of_investigator',
'Identity_of_the_eventual_product',
'Impact_of_patent_on_adoption_of_technology',
'Industrial_startup_co-ownership_of_the_IP',
'Institution',
'Licensing_interest_by_a_specific_company',
'Market_feedback',
'Market_Size',
'Nature_of_improvement_over_existing_art',
'Patentability_questions',
'Risk_cost_sharing_w_other_institution',
'Size_of_Market',
'Stage_of_research',
'License_Status']]

In [115]:
df_tech_keep.rename(columns = {'Title':'Tech_Title'}, inplace = True)

In [116]:
df_tech_keep.shape

(1904, 28)

In [117]:
tech_missing = df_tech_keep.isnull().sum()

print(tech_missing)

Tech_ID                                                0
Tech_Title                                             0
Lead_Inventor                                          2
Disclosure_Date                                       35
Division_Department                                    0
Primary_Division                                       0
Owners                                              1125
Ability_of_investigator_to_continue_research           1
Ability_to_advance_the_project_outside_the_lab         2
Brief_Technology_Description                           0
Compelling_nature_of_data                              2
Detectability_of_infringement_and_enforceability       2
Development_and_regulatory_path_for_the_product        2
Freedom-to-operate_FTO_issues                          2
Historical_cooperation_or_not_of_investigator          2
Identity_of_the_eventual_product                       2
Impact_of_patent_on_adoption_of_technology             2
Industrial_startup_co-ownership

## 2. Patent Dataset

In [118]:
df_pat = pd.DataFrame(pd.read_csv("patentData_Cleaned.csv"))
print("Patent Shape:", df_pat.shape)

Patent Shape: (6843, 33)


In [119]:
df_pat.columns.to_list()

['Unnamed: 0',
 'Tech_ID',
 'Title',
 'Serial_Number',
 'Patent_Number',
 'Country',
 'Country_WIPO_ID',
 'File_Date',
 'Issue_Date',
 'Publication_Number',
 'Date_Actually_Filed',
 'Internal_ID',
 'Entity_Size',
 'Priority_Date',
 'Status_Date',
 'Is_Priority',
 'Last_Related_Update',
 'Status',
 'Lawfirm',
 'Attorney',
 'Legal_Reference_Number',
 'Inventors',
 'Application_Type',
 'Created_Date',
 'Docket_No',
 'Expire_Date',
 'IP_Manager',
 'Lead_Inventor',
 'Licensing_Manager',
 'Primary_Key',
 'Publication_Date',
 'Technology_Key',
 'Licensed']

### 2.1 Feature Engineering

### Consolidate the "File_Date" and "Date_Actually_Filed" into a new column named "Actually_File_Date"

In [120]:
# We take "Date_Actually_Filed" as main column and combine "File_Date" which generate a new column named "Actually_File_Date"

df_pat["Actually_File_Date"] = df_pat["Date_Actually_Filed"].combine_first(df_pat["File_Date"])
df_pat.drop(["Date_Actually_Filed", "File_Date"], 1, inplace=True)

In [121]:
# The original columns "File_Date" has 36 missing values and "Date_Actually_Filed" has 1470 missing values

miss_num = df_pat["Actually_File_Date"].isnull().sum()
print("Actually_File_Date is missing:", miss_num)

Actually_File_Date is missing: 35


In [122]:
# Remove the 35 null records in "Actually_File_Date"

df_pat = df_pat[~df_pat['Actually_File_Date'].isna()]

In [123]:
# Convert "object" data type to "datetime"

df_pat['Actually_File_Date'] = pd.to_datetime(df_pat['Actually_File_Date'].astype(str),format='%m/%d/%Y')

In [124]:
# Confirm null records have been removed 

df_pat["Actually_File_Date"].isnull().sum()

0

### Split "Inventors" names and count the Number of Inventors for each patent

In [125]:
# N is the number of inventors for each patent, the range of N is [1,19] and the average of N is 3. 
# For modeling purpose, we keep the first 5 inventors and split into multi-columns

n = 5  
inventor_names = [f'Inventors_{i}' for i in range(n)]
df_new = df_pat['Inventors'].map(lambda x:(str(x).split(','),len(str(x).split(',')))).apply(pd.Series)
df_inventor = df_new[0].apply(lambda x:x[:n]).apply(pd.Series)
df_inventor.columns=inventor_names
df_inventor.head(10)

Unnamed: 0,Inventors_0,Inventors_1,Inventors_2,Inventors_3,Inventors_4
0,Roland Winston,,,,
1,Roland Winston,,,,
2,Roland Winston,,,,
3,Roland Winston,,,,
4,Roland Winston,,,,
5,Roland Winston,,,,
6,Roland Winston,,,,
7,Seongeun Julia Cho,Glyn Dawson,,,
8,Seongeun Julia Cho,Glyn Dawson,,,
9,Samuel Armato,Maryellen Giger,Heber MacMahon,,


In [126]:
# Create a column for counting the total number of inventors for each patent

df_cnt = pd.DataFrame(df_new[1])
df_cnt.columns=['Number_of_Inventors']
df_cnt.head(10)

Unnamed: 0,Number_of_Inventors
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,2
8,2
9,3


In [127]:
# Combining 'Number_of_Inventors' to the patent dataset

df_pat = pd.concat([df_pat, df_cnt],axis=1).drop("Inventors", axis = 1)
df_pat.head(2)

Unnamed: 0.1,Unnamed: 0,Tech_ID,Title,Serial_Number,Patent_Number,Country,Country_WIPO_ID,Issue_Date,Publication_Number,Internal_ID,...,Expire_Date,IP_Manager,Lead_Inventor,Licensing_Manager,Primary_Key,Publication_Date,Technology_Key,Licensed,Actually_File_Date,Number_of_Inventors
0,0,00-T-001,Light Collectors in Cylinderical Geometry,"05/581,613",3957031,*United States,US,5/18/1976,,0002.1-001,...,5/29/1995,"Ginsburg, Eric",Roland Winston,"Juggernauth, Anne",612078,,580932,Yes,1975-05-29,1
1,1,00-T-002,Radiant Energy Collector,"05/492,074",4002499,*United States,US,1/11/1977,,0002.2-001,...,7/26/1994,"Ginsburg, Eric",Roland Winston,"Juggernauth, Anne",614263,,580933,Yes,1974-07-26,1


### Create 'Patent_Status' column based on Guidance from Polsky Center: 

1 = Patented: Everything with a valid issue date + Status='Allowed', 'Issued', 'Granted', 'Authorized' 

2 = Pending: Status = 'Opposition', 'In Appeal', 'In Interference', 'Pending'

3 = Abandoned: Status= 'Abandoned', 'Expired - Inactive', 'Expired Prov - Filed PCT', 'Prosecution by Other Party', 'Released to Inventors', 'Released to Government'

4 = Ignore: Status= 'Expired PCT - Nationalized', 'Expired Prov - Filed in US', 'Expired PCT - Filed in US only', 'Registered (TM)', 'Registered (Copyright)'

In [128]:
# Comment out for now to focus on licensing status 
# # Generate lists for the patented, pending, and abandonded statuses
# patented = ('Allowed', 'Issued', 'Granted', 'Authorized')
# pending = ('Opposition', 'In Appeal', 'In Interference', 'Pending')
# abandoned = ('Abandoned', 'Expired - Inactive', 'Expired Prov - Filed PCT', 'Prosecution by Other Party', 'Released to Inventors', 'Released to Government')
# ignore = ('Expired PCT - Nationalized', 'Expired Prov - Filed in US', 'Expired PCT - Filed in US only', 'Registered (TM)', 'Registered (Copyright)')

# df_pat.loc[(df_pat['Issue_Date'].notnull()) | (df_pat['Status'].isin(patented)), 'Patent_Status'] = 'Patented'
# df_pat.loc[df_pat['Status'].isin(pending), 'Patent_Status'] = 'Pending'
# df_pat.loc[df_pat['Status'].isin(abandoned), 'Patent_Status'] = 'Abandoned'
# df_pat.loc[df_pat['Status'].isin(ignore), 'Patent_Status'] = 'Ignore'

In [129]:
# df_pat.Patent_Status.value_counts()

In [130]:
# #Drop 'Ignore' class
# #Combine 'Patented' and 'Pending' into 'fund' class
# #Rename 'Abandoned' to 'no_fund'
# keep = ('Patented', 'Pending', 'Abandoned')
# fund = ('Patented','Pending')

# df_pat = df_pat.loc[df_pat['Patent_Status'].isin(keep)]
# df_pat.loc[df_pat['Patent_Status'].isin(fund), 'Patent_Status'] = 'fund'
# df_pat.loc[df_pat['Patent_Status']=='Abandoned', 'Patent_Status'] = 'no_fund'

In [131]:
# df_pat.Patent_Status.value_counts()

In [132]:
# df_pat.loc[df_pat['Issue_Date'].notnull()]['Status'].value_counts()

In [133]:
# #The majority of inventions have a status of abandonded. Approximately 1,000 inventions have been successfully patented, 800 are pending, and 200 are not being reviewed for the purposes of this analysis

# df_pat['Patent_Status'].value_counts()

In [134]:
# # Here are the null values in the new 'Patent_Status' column matches the null values in the original 'Status' column, confirming we have classified all viable records
# # There are 6b records don't have a 'Patent_Status'

# print('Null values in Patent_Status column:', df_pat.Patent_Status.isnull().sum())
# print('Null values in Status column:', df_pat.Status.isnull().sum())

In [135]:
# Remove the null records in "Patent_Status"

df_pat = df_pat[~df_pat['Status'].isna()]

# Confirm removal 
df_pat.Status.isnull().sum()

0

In [136]:
# Final column list of Patent data before feature selection

df_pat.columns.to_list()

['Unnamed: 0',
 'Tech_ID',
 'Title',
 'Serial_Number',
 'Patent_Number',
 'Country',
 'Country_WIPO_ID',
 'Issue_Date',
 'Publication_Number',
 'Internal_ID',
 'Entity_Size',
 'Priority_Date',
 'Status_Date',
 'Is_Priority',
 'Last_Related_Update',
 'Status',
 'Lawfirm',
 'Attorney',
 'Legal_Reference_Number',
 'Application_Type',
 'Created_Date',
 'Docket_No',
 'Expire_Date',
 'IP_Manager',
 'Lead_Inventor',
 'Licensing_Manager',
 'Primary_Key',
 'Publication_Date',
 'Technology_Key',
 'Licensed',
 'Actually_File_Date',
 'Number_of_Inventors']

### 2.2 Drop Columns for Patent Data

In [137]:
columns_keep_pat = ['Tech_ID',
'Title',
'Country_WIPO_ID',
'Actually_File_Date',
'Is_Priority',
'Lawfirm',
'Attorney',
#'Patent_Status',
'Number_of_Inventors',
'Application_Type'
]

In [138]:
df_pat_keep = df_pat[['Tech_ID',
'Title',
'Country_WIPO_ID',
'Actually_File_Date',
'Is_Priority',
'Lawfirm',
'Attorney',
#'Patent_Status',
'Number_of_Inventors',
'Application_Type']]

In [139]:
df_pat_keep.rename(columns = {'Title':'Patent_Title'}, inplace = True)

In [140]:
# Check missing values in columns we keep and impute any null values with "Others"


df_pat_keep.isnull().sum()

Tech_ID                   0
Patent_Title              0
Country_WIPO_ID           0
Actually_File_Date        0
Is_Priority             323
Lawfirm                 263
Attorney               1865
Number_of_Inventors       0
Application_Type          0
dtype: int64

In [141]:
# Impute for columns with missing values 

df_pat_keep["Is_Priority"].fillna("Other", inplace = True)
df_pat_keep["Lawfirm"].fillna("Other", inplace = True)
df_pat_keep["Attorney"].fillna("Other", inplace = True)

df_pat_keep.isnull().sum()

Tech_ID                0
Patent_Title           0
Country_WIPO_ID        0
Actually_File_Date     0
Is_Priority            0
Lawfirm                0
Attorney               0
Number_of_Inventors    0
Application_Type       0
dtype: int64

In [142]:
# This is slightly higher than the count for patent modeling. This is because some records with a NA status in Patent_Status were removed
# We will not automatically remove these for the purposes of license modeling 
df_pat_keep.shape

(6802, 9)

### 3. Merge Tech and Patent datasets with columns_keep

In [310]:
# There are 5859 rows are matched with 41 columns ("Tech ID" will be dropped later)

df_modeling = df_pat_keep.join(
df_tech_keep.set_index(["Tech_ID"]),
on=["Tech_ID"],
how="inner",
lsuffix="_x",
rsuffix="_y")

df_modeling.shape

(5672, 36)

In [311]:
df_modeling.columns.to_list()

['Tech_ID',
 'Patent_Title',
 'Country_WIPO_ID',
 'Actually_File_Date',
 'Is_Priority',
 'Lawfirm',
 'Attorney',
 'Number_of_Inventors',
 'Application_Type',
 'Tech_Title',
 'Lead_Inventor',
 'Disclosure_Date',
 'Division_Department',
 'Primary_Division',
 'Owners',
 'Ability_of_investigator_to_continue_research',
 'Ability_to_advance_the_project_outside_the_lab',
 'Brief_Technology_Description',
 'Compelling_nature_of_data',
 'Detectability_of_infringement_and_enforceability',
 'Development_and_regulatory_path_for_the_product',
 'Freedom-to-operate_FTO_issues',
 'Historical_cooperation_or_not_of_investigator',
 'Identity_of_the_eventual_product',
 'Impact_of_patent_on_adoption_of_technology',
 'Industrial_startup_co-ownership_of_the_IP',
 'Institution',
 'Licensing_interest_by_a_specific_company',
 'Market_feedback',
 'Market_Size',
 'Nature_of_improvement_over_existing_art',
 'Patentability_questions',
 'Risk_cost_sharing_w_other_institution',
 'Size_of_Market',
 'Stage_of_research',

### Duplicate Detection and consolidation to unique records only

In [312]:
print("Unique Technology titles in the Modeling Dataset:", df_modeling['Tech_Title'].nunique())

print("Unique Patent titles in the Merged Dataset:", df_modeling['Patent_Title'].nunique())

print("Shape of the Merged dataset:", df_modeling.shape)

Unique Technology titles in the Modeling Dataset: 1053
Unique Patent titles in the Merged Dataset: 1853
Shape of the Merged dataset: (5672, 36)


In [313]:
# We see there are 4,768 records in the dataset that have the same Tech_Title/Patent_Title combination

duplicates = df_modeling[df_modeling.duplicated(subset=['Tech_Title','Patent_Title'], keep=False)]
duplicates.shape

(4646, 36)

### A majority of the duplication is coming from the Patents data (as expected). We have a large number of unique filing dates, countries, and application types

In [314]:
duplicates.head()

Unnamed: 0,Tech_ID,Patent_Title,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Tech_Title,...,Institution,Licensing_interest_by_a_specific_company,Market_feedback,Market_Size,Nature_of_improvement_over_existing_art,Patentability_questions,Risk_cost_sharing_w_other_institution,Size_of_Market,Stage_of_research,License_Status
5,00-T-006,Energy Transmission with Respect to Convex Sou...,US,1979-08-24,Yes,"Marshall, Gerstein & Borun",Other,1,US Utility,Energy Transmission with Respect to Convex Sou...,...,UCHI,No,No,,No,No,No,No,No,no_license
6,00-T-006,Energy Transmission with Respect to Convex Sou...,US,1981-07-10,No,"Marshall, Gerstein & Borun",Other,1,Continuation,Energy Transmission with Respect to Convex Sou...,...,UCHI,No,No,,No,No,No,No,No,no_license
15,00-T-010,Automated Method and System for the Segmentati...,US,2000-01-18,Yes,"Oblon, Spivak, McClelland, Maier & Neustadt","Kuesters, Eckhard",3,Provisional,Automated Method and System for the Segmentati...,...,UCHI,No,No,,No,No,No,No,No,license
17,00-T-010,Automated Method and System for the Segmentati...,US,2001-01-18,No,"Oblon, Spivak, McClelland, Maier & Neustadt","Kuesters, Eckhard",3,PCT,Automated Method and System for the Segmentati...,...,UCHI,No,No,,No,No,No,No,No,license
18,00-T-010,Automated Method and System for the Segmentati...,DE,2003-06-16,No,"Oblon, Spivak, McClelland, Maier & Neustadt",Other,3,Validated EPO,Automated Method and System for the Segmentati...,...,UCHI,No,No,,No,No,No,No,No,license


In [315]:
# This is the list of the 10 technologies with the most assocaited rows in the dataset

df_modeling.Tech_Title.value_counts().sort_values(ascending=False).nlargest(10)

Mini-Sgcg demonstrates feasibility for exon skipping to correct limb girdle muscular dystrophy                                                     136
Protein A as a Subunit Vaccine Against Staphylococcal Disease                                                                                      123
Substituted 06-Benzylguanines and 6(4)-Benzyloxypyrimidines                                                                                         82
Microfluidic Networks for Rapid Mixing and Non-Dispersing Time-Controlled Transport and Reactions of Multiple Chemical and Biochemical Reagents     67
Identifying Centromere Sequences from Plants                                                                                                        67
Bioelectrolytic formation of methane from CO2, water and electric power.                                                                            53
Use of FTY720 in autoimmune neuropathies and other neuromuscular disorders                    

In [316]:
# Review of column with missing data

df_modeling.isnull().sum()

Tech_ID                                                0
Patent_Title                                           0
Country_WIPO_ID                                        0
Actually_File_Date                                     0
Is_Priority                                            0
Lawfirm                                                0
Attorney                                               0
Number_of_Inventors                                    0
Application_Type                                       0
Tech_Title                                             0
Lead_Inventor                                          0
Disclosure_Date                                       77
Division_Department                                    0
Primary_Division                                       0
Owners                                              2406
Ability_of_investigator_to_continue_research           6
Ability_to_advance_the_project_outside_the_lab         7
Brief_Technology_Description   

### Consolidate "Application Type" to determine a unique patent

In [317]:
# Sort values by Disclosure Date and Actually Filed Date

df_modeling = df_modeling.sort_values(by=['Disclosure_Date','Actually_File_Date'])
df_modeling.head(10)

Unnamed: 0,Tech_ID,Patent_Title,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Tech_Title,...,Institution,Licensing_interest_by_a_specific_company,Market_feedback,Market_Size,Nature_of_improvement_over_existing_art,Patentability_questions,Risk_cost_sharing_w_other_institution,Size_of_Market,Stage_of_research,License_Status
6841,TEST01,thelma and cristi test disclosure,US,1900-01-02,Other,Alston & Bird,"Edwards, Jonathan",2,Provisional,DUMMY Cristi and Thelma test disclosure,...,UCHI,No,No,,No,No,No,No,No,no_license
4892,73-T-001,Radiant Energy Collection,US,1973-12-28,Yes,"Marshall, Gerstein & Borun","Borun, Michael",1,US Utility,Ideal Conical Shaped Light Collectors,...,UCHI,No,No,,No,No,No,No,No,no_license
4893,73-T-001,Radiant Energy Collection,US,1975-09-15,No,"Marshall, Gerstein & Borun","Borun, Michael",1,Continuation in Part,Ideal Conical Shaped Light Collectors,...,UCHI,No,No,,No,No,No,No,No,no_license
4894,75-T-001,Polysaccharide for Enhancement of Cardiac Output,US,1976-08-02,Yes,Other,Other,4,US Utility,Polysaccharide for Cardiac Enhancement,...,UCHI,No,No,,No,No,No,No,No,no_license
4899,77-T-002,Quaternary Derivatives of Noroxymorphone Which...,US,1978-07-28,Yes,Other,Other,4,US Utility,Quaternary Derivatives of Noroxymorphone Which...,...,UCHI,No,No,,No,No,No,No,No,license
4900,78-T-003,Methods and Materials for Detection of Estroph...,US,1978-09-22,Yes,Other,Other,2,US Utility,Methods and Materials for Detection of Estroph...,...,UCHI,No,No,,No,No,No,No,No,no_license
4904,79-T-006,Sextupole System for the Correction of Spheric...,US,1979-10-25,Yes,Other,Other,2,US Utility,Correction of Spherical Aberration,...,UCHI,No,No,,No,No,No,No,No,no_license
4906,79-T-010,Energy Transmission,US,1975-11-03,Yes,"Marshall, Gerstein & Borun",Other,1,US Utility,Reflecting Cavity for Transmitting Energy from...,...,UCHI,No,No,,No,No,No,No,No,no_license
4907,79-T-010,Energy Transmission,US,1975-12-17,No,"Marshall, Gerstein & Borun","Zeller, James",1,Continuation in Part,Reflecting Cavity for Transmitting Energy from...,...,UCHI,No,No,,No,No,No,No,No,no_license
4909,79-T-011,Parabolic Concentrator,US,1980-01-18,Yes,Other,Other,1,US Utility,Efficient Cavity with Directionally Scattering...,...,UCHI,No,No,,No,No,No,No,No,no_license


In [318]:
# Asjust Patent_Title column to lowercase all values and remove extra whitespace to avoid duplicates 

# lowercase 
df_modeling['Patent_Title'] = df_modeling['Patent_Title'].str.lower()
# remove extra white space 
df_modeling['Patent_Title'] = df_modeling['Patent_Title'].str.strip()

In [319]:
# Group by Tech_Title, Patent_Title, Disclosure_Date, and Country_WIPO_ID. 
# These columns indicate a unique record for the purposes of modeling  

df_modeling = df_modeling.groupby(['Tech_Title','Disclosure_Date','Country_WIPO_ID'])
df_modeling.head(10)

Unnamed: 0,Tech_ID,Patent_Title,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Tech_Title,...,Institution,Licensing_interest_by_a_specific_company,Market_feedback,Market_Size,Nature_of_improvement_over_existing_art,Patentability_questions,Risk_cost_sharing_w_other_institution,Size_of_Market,Stage_of_research,License_Status
6841,TEST01,thelma and cristi test disclosure,US,1900-01-02,Other,Alston & Bird,"Edwards, Jonathan",2,Provisional,DUMMY Cristi and Thelma test disclosure,...,UCHI,No,No,,No,No,No,No,No,no_license
4892,73-T-001,radiant energy collection,US,1973-12-28,Yes,"Marshall, Gerstein & Borun","Borun, Michael",1,US Utility,Ideal Conical Shaped Light Collectors,...,UCHI,No,No,,No,No,No,No,No,no_license
4893,73-T-001,radiant energy collection,US,1975-09-15,No,"Marshall, Gerstein & Borun","Borun, Michael",1,Continuation in Part,Ideal Conical Shaped Light Collectors,...,UCHI,No,No,,No,No,No,No,No,no_license
4894,75-T-001,polysaccharide for enhancement of cardiac output,US,1976-08-02,Yes,Other,Other,4,US Utility,Polysaccharide for Cardiac Enhancement,...,UCHI,No,No,,No,No,No,No,No,no_license
4899,77-T-002,quaternary derivatives of noroxymorphone which...,US,1978-07-28,Yes,Other,Other,4,US Utility,Quaternary Derivatives of Noroxymorphone Which...,...,UCHI,No,No,,No,No,No,No,No,license
4900,78-T-003,methods and materials for detection of estroph...,US,1978-09-22,Yes,Other,Other,2,US Utility,Methods and Materials for Detection of Estroph...,...,UCHI,No,No,,No,No,No,No,No,no_license
4904,79-T-006,sextupole system for the correction of spheric...,US,1979-10-25,Yes,Other,Other,2,US Utility,Correction of Spherical Aberration,...,UCHI,No,No,,No,No,No,No,No,no_license
4906,79-T-010,energy transmission,US,1975-11-03,Yes,"Marshall, Gerstein & Borun",Other,1,US Utility,Reflecting Cavity for Transmitting Energy from...,...,UCHI,No,No,,No,No,No,No,No,no_license
4907,79-T-010,energy transmission,US,1975-12-17,No,"Marshall, Gerstein & Borun","Zeller, James",1,Continuation in Part,Reflecting Cavity for Transmitting Energy from...,...,UCHI,No,No,,No,No,No,No,No,no_license
4909,79-T-011,parabolic concentrator,US,1980-01-18,Yes,Other,Other,1,US Utility,Efficient Cavity with Directionally Scattering...,...,UCHI,No,No,,No,No,No,No,No,no_license


In [320]:
# Take the first available Actually_Filed_Date from that unique entry

df_modeling = df_modeling.first().reset_index()

In [321]:
df_modeling.shape

(3172, 36)

In [322]:
# Drop one test row "TEST01" and then drop the "Tech_ID" column

df_modeling = df_modeling[df_modeling["Tech_ID"].str.contains("TEST01") == False]
  
df_modeling.shape

(3171, 36)

In [323]:
df_modeling = df_modeling.drop(columns=['Tech_ID'])

In [324]:
# check the types of Application_Type values to determine if it looks like we are keeping to correct applications

df_modeling['Application_Type'].value_counts()

Nationalized PCT         1209
Validated EPO             838
Provisional               751
US Utility                242
Foreign, Non-PCT           74
PCT                        16
Continuation in Part       15
Divisional                 14
Continuation                7
Trademark                   3
Copyright Application       2
Name: Application_Type, dtype: int64

### Incorporate LDA Topic modeling to add to columns 

In [325]:
# Prepare stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [326]:
# Convert Tech_Titles to list and tokenize
data = df_modeling.Tech_Title.values.tolist()

def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['grid', 'cassette', 'for', 'portable', 'radiography']]


In [327]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['grid', 'cassette', 'for', 'portable', 'radiography']


In [328]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [329]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['grid', 'cassette', 'portable', 'radiography']]


In [330]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [331]:
# Develop LDA Topic Model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [332]:
# Print the keyword of topics
pprint(lda_model.print_topics())
# This applies the lda model to our corpus of titles, which we can use to assign a majority topic for each Tech_Topic 
doc_lda = lda_model[corpus]

[(0,
  '0.055*"treatment" + 0.040*"cell" + 0.034*"cancer" + 0.023*"opioid" + '
  '0.021*"antagonist" + 0.020*"tumor" + 0.020*"growth" + 0.018*"anti" + '
  '0.015*"attenuate" + 0.014*"agent"'),
 (1,
  '0.044*"method" + 0.040*"protein" + 0.037*"disease" + 0.031*"human" + '
  '0.029*"virus" + 0.025*"system" + 0.023*"receptor" + 0.018*"staphylococcal" '
  '+ 0.018*"gene" + 0.016*"treat"'),
 (2,
  '0.123*"use" + 0.031*"herpe" + 0.028*"new" + 0.027*"optical" + '
  '0.025*"vaccine" + 0.023*"neuromuscular_disorder" + 0.015*"synthetic" + '
  '0.014*"radio_sensitizer" + 0.013*"type" + 0.012*"staphylococcus_aureus"')]


In [333]:
# Compute Model Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.


Perplexity:  -7.190292562435039


In [334]:
# Visualize the topics
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds')
vis

In [335]:
# get list of tuples containing topic percentages from each Tech_Title from doc_lda
topics = pd.DataFrame(doc_lda)

# Extract the 2nd element (the percentages for each given topic) from each column containing list element
topics['topic1'] = list(zip(*topics[0]))[0]
topics['topic2'] = list(zip(*topics[0]))[1]
topics['topic3'] = list(zip(*topics[0]))[2]
#topics['topic4'] = list(zip(*topics[0]))[3]

topics['topic1'] = list(zip(*topics['topic1']))[1]
topics['topic2'] = list(zip(*topics['topic2']))[1]
topics['topic3'] = list(zip(*topics['topic3']))[1]
#topics['topic4'] = list(zip(*topics['topic4']))[1]

# convert series objects to float for comparison 
topics['topic1'] = topics['topic1'].astype(str).astype(float)
topics['topic2'] = topics['topic2'].astype(str).astype(float)
topics['topic3'] = topics['topic3'].astype(str).astype(float)
#topics['topic4'] = topics['topic4'].astype(str).astype(float)

# create 'lda_topic' column with topic that carries majority weight for Tech_Title
topics.loc[(topics['topic1']>topics['topic2']) & (topics['topic1']>topics['topic3']), 'lda_topic'] = 1
topics.loc[(topics['topic2']>topics['topic1']) & (topics['topic2']>topics['topic3']), 'lda_topic'] = 2
topics.loc[(topics['topic3']>topics['topic1']) & (topics['topic3']>topics['topic2']), 'lda_topic'] = 3
#topics.loc[(topics['topic4']>topics['topic1']) & (topics['topic4']>topics['topic2']) & (topics['topic4']>topics['topic3']), 'lda_topic'] = 4

topics.drop(columns=[0,1,2],inplace = True)

# Check results 
topics.head(7)

Unnamed: 0,topic1,topic2,topic3,lda_topic
0,0.052483,0.872408,0.075109,2.0
1,0.029761,0.731949,0.23829,2.0
2,0.044284,0.426866,0.52885,3.0
3,0.861714,0.067595,0.070691,1.0
4,0.861715,0.067595,0.07069,1.0
5,0.861715,0.067595,0.07069,1.0
6,0.861715,0.067595,0.07069,1.0


### Topics are relatively evenly distributed across the 4 categories

In [336]:
topics['lda_topic'].value_counts()/len(topics)*100

2.0    36.959950
3.0    35.982340
1.0    27.057711
Name: lda_topic, dtype: float64

In [337]:
#bind with main dataset
df_modeling = pd.concat([df_modeling, topics], axis=1)
df_modeling.head(7)

Unnamed: 0,Tech_Title,Disclosure_Date,Country_WIPO_ID,Patent_Title,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,...,Nature_of_improvement_over_existing_art,Patentability_questions,Risk_cost_sharing_w_other_institution,Size_of_Market,Stage_of_research,License_Status,topic1,topic2,topic3,lda_topic
0,"""Grid Cassette for Portable Radiography""",1991-08-08 00:00:00,US,optical grid alignment system for portable rad...,1991-12-02,Yes,Norton Rose Fulbright US LLP,Other,1.0,US Utility,...,No,No,No,No,No,no_license,0.052483,0.872408,0.075109,2.0
1,"""Index of Depth of Anesthesia - Brain Anesthet...",2001-03-16 00:00:00,US,assessment of concentration of inhalational co...,2001-03-16,Yes,"Cesari and McKenna, LLP","Tsao, Y. Rocky",1.0,US Utility,...,No,No,No,No,No,no_license,0.029761,0.731949,0.23829,2.0
2,"""TrueQ"" Microspheres for Quantitative Flow Cyt...",2011-06-01 10:00:00,US,oligonucleotide-mediated quantitative multiple...,2012-06-15,Yes,Norton Rose Fulbright US LLP,"Shishima, Gina",3.0,Provisional,...,No,No,No,No,No,license,0.044284,0.426866,0.52885,3.0
3,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,BE,methods and compositions for predicting irinot...,2008-05-28,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Validated EPO,...,No,No,No,No,No,license,0.861714,0.067595,0.070691,1.0
4,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,CA,methods and compositions for predicting irinot...,2005-11-29,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Nationalized PCT,...,No,No,No,No,No,license,0.861715,0.067595,0.07069,1.0
5,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,DE,methods and compositions for predicting irinot...,2008-05-28,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Validated EPO,...,No,No,No,No,No,license,0.861715,0.067595,0.07069,1.0
6,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,EP,methods and compositions for predicting irinot...,2005-12-01,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Nationalized PCT,...,No,No,No,No,No,license,0.861715,0.067595,0.07069,1.0


### More Feature Engineering

In [338]:
df_modeling['Primary_Division'].value_counts()

BSD      2408
PSD       564
ANL        89
PME        88
MBL        14
Other       8
Name: Primary_Division, dtype: int64

In [339]:
#Since MBL does not have many patent applications, combine it into 'Other'
df_modeling.loc[df_modeling['Primary_Division']=='MBL','Primary_Division'] = 'Other'

In [340]:
df_modeling['Primary_Division'].value_counts()

BSD      2408
PSD       564
ANL        89
PME        88
Other      22
Name: Primary_Division, dtype: int64

In [341]:
#Create licence counts by primary division
num_licenses_division = df_modeling['Primary_Division'].value_counts().reindex(
    df_modeling.Primary_Division.unique(), fill_value=0)
num_success_licenses_division = df_modeling.loc[df_modeling['License_Status']=='license']['Primary_Division'].value_counts().reindex(
    df_modeling.Primary_Division.unique(), fill_value=0)
licenses_by_division = pd.DataFrame({'Primary_Division':num_licenses_division.index, 'Licenses_in_Division':num_licenses_division.values, 'Successful Licenses_in_Division':num_success_licenses_division.values})


In [342]:
licenses_by_division

Unnamed: 0,Primary_Division,Licenses_in_Division,Successful Licenses_in_Division
0,BSD,2408,1037
1,PSD,564,317
2,PME,88,83
3,Other,22,11
4,,0,0
5,ANL,89,0


In [343]:
#Create patent success rate by primary division and drop NaN row
licenses_by_division['Division_License_Success_Rate'] = licenses_by_division['Successful Licenses_in_Division']/licenses_by_division['Licenses_in_Division']
licenses_by_division = licenses_by_division.drop([4])
licenses_by_division

Unnamed: 0,Primary_Division,Licenses_in_Division,Successful Licenses_in_Division,Division_License_Success_Rate
0,BSD,2408,1037,0.430648
1,PSD,564,317,0.562057
2,PME,88,83,0.943182
3,Other,22,11,0.5
5,ANL,89,0,0.0


In [344]:
df_modeling = df_modeling.merge(licenses_by_division)

In [345]:
#Create patent counts by tech family
num_licenses_tech = df_modeling['Tech_Title'].value_counts()
licenses_by_tech = pd.DataFrame({'Tech_Title':num_licenses_tech.index, 'Licenses_in_Tech':num_licenses_tech.values})
licenses_by_tech

Unnamed: 0,Tech_Title,Licenses_in_Tech
0,Mini-Sgcg demonstrates feasibility for exon sk...,90
1,Protein A as a Subunit Vaccine Against Staphyl...,50
2,Use of FTY720 in autoimmune neuropathies and o...,42
3,Modified alpha-galactosyl ceramide for stainin...,37
4,Multicomponet Intestinal Preparation Solution ...,33
5,Staphylococcal Protein A Contributes to Persis...,31
6,Biological process for converting carbon dioxi...,28
7,Modulators of Nuclear Receptor Liver X Recepto...,24
8,Hybrid Coa and vWbp subunit vaccine derived fr...,24
9,Treatment of Cancer by Manipulation of Commens...,23


In [346]:
df_modeling = df_modeling.merge(licenses_by_tech)
df_modeling.head()

Unnamed: 0,Tech_Title,Disclosure_Date,Country_WIPO_ID,Patent_Title,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,...,Stage_of_research,License_Status,topic1,topic2,topic3,lda_topic,Licenses_in_Division,Successful Licenses_in_Division,Division_License_Success_Rate,Licenses_in_Tech
0,"""Grid Cassette for Portable Radiography""",1991-08-08 00:00:00,US,optical grid alignment system for portable rad...,1991-12-02,Yes,Norton Rose Fulbright US LLP,Other,1.0,US Utility,...,No,no_license,0.052483,0.872408,0.075109,2.0,2408,1037,0.430648,1
1,"""Index of Depth of Anesthesia - Brain Anesthet...",2001-03-16 00:00:00,US,assessment of concentration of inhalational co...,2001-03-16,Yes,"Cesari and McKenna, LLP","Tsao, Y. Rocky",1.0,US Utility,...,No,no_license,0.029761,0.731949,0.23829,2.0,2408,1037,0.430648,1
2,"""TrueQ"" Microspheres for Quantitative Flow Cyt...",2011-06-01 10:00:00,US,oligonucleotide-mediated quantitative multiple...,2012-06-15,Yes,Norton Rose Fulbright US LLP,"Shishima, Gina",3.0,Provisional,...,No,license,0.044284,0.426866,0.52885,3.0,2408,1037,0.430648,1
3,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,BE,methods and compositions for predicting irinot...,2008-05-28,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Validated EPO,...,No,license,0.861714,0.067595,0.070691,1.0,2408,1037,0.430648,9
4,-3156G>A Genotyping for Optimization of Cancer...,2003-06-06 10:00:00,CA,methods and compositions for predicting irinot...,2005-11-29,No,Norton Rose Fulbright US LLP,"Shishima, Gina",4.0,Nationalized PCT,...,No,license,0.861715,0.067595,0.07069,1.0,2408,1037,0.430648,9


In [347]:
# Get time delta between disclosure date and date actually filed 
df_modeling['Actually_File_Date'] = pd.to_datetime(df_modeling['Actually_File_Date'])
df_modeling['Disclosure_Date'] = pd.to_datetime(df_modeling['Disclosure_Date'])

df_modeling['Disclosure_to_Filing'] = (df_modeling['Actually_File_Date'] - df_modeling['Disclosure_Date']).astype('timedelta64[D]')

### I think we need to remove this feature. If the technology is the first patent in a family, it will be scored unfairly low 

In [348]:
# #Create patent success rate by tech family
# num_success_tech = df_modeling.loc[df_modeling['Patent_Status']==1]['Tech_Title'].value_counts()
# success_patent_by_tech = pd.DataFrame({'Tech_Title':num_success_tech.index, 'Success_Patents_in_Tech':num_success_tech.values})
# df_modeling = df_modeling.merge(success_patent_by_tech, how='left').fillna(0)
# df_modeling['Tech_Family_Patent_Success_Rate'] = df_modeling['Success_Patents_in_Tech']/df_modeling['Patents_in_Tech']
# df_modeling = df_modeling.drop(['Success_Patents_in_Tech'],axis=1)

### Imputation of null values not subject to KNN Imputation

In [349]:
df_modeling.isnull().sum()

Tech_Title                                             0
Disclosure_Date                                        0
Country_WIPO_ID                                        0
Patent_Title                                           0
Actually_File_Date                                     0
Is_Priority                                            0
Lawfirm                                                0
Attorney                                               0
Number_of_Inventors                                    0
Application_Type                                       0
Lead_Inventor                                          0
Division_Department                                    0
Primary_Division                                       0
Owners                                              1281
Ability_of_investigator_to_continue_research           1
Ability_to_advance_the_project_outside_the_lab         2
Brief_Technology_Description                           0
Compelling_nature_of_data      

In [350]:
# Drop row where topics are null 
df_modeling = df_modeling.dropna(subset=['topic1'])
df_modeling = df_modeling.dropna(subset=['topic2'])
df_modeling = df_modeling.dropna(subset=['topic3'])

# Convert Owners with null value to "not listed"
df_modeling.Owners = df_modeling.Owners.fillna('Not_Listed')

# Drop Brief_Assessment_Description, Patent_Title, and Tech_Title, as they are description fields and will not add value for our modeling 
df_modeling.drop('Brief_Technology_Description', axis=1, inplace=True)
df_modeling.drop('Tech_Title', axis=1, inplace=True)
df_modeling.drop('Patent_Title', axis=1, inplace=True)

# Convert Institution with null value to "Other"
df_modeling.Institution = df_modeling.Institution.fillna('Other')

# Market_Size has too many unique values, so we will remove 
df_modeling.drop('Market_Size', axis=1, inplace=True)

# Split out dataset containing records with Disclosure date beyond 2012
df_modeling_2012 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(2012,1,1))]  #last 10-year records

# Save Patent Status and remove from dataset for now (we will add back in after scaling)
license_status = pd.DataFrame(df_modeling, columns=['License_Status']) 
df_modeling.drop('License_Status', axis=1, inplace=True)

license_status_2012 = pd.DataFrame(df_modeling_2012, columns=['License_Status'])
license_status_2012 = license_status_2012.reset_index(drop=True)
df_modeling_2012.drop('License_Status', axis=1, inplace=True)

### Convert categorical variables using integer encoding 

In [353]:
# date fields
df_modeling['Disclosure_Date'] = df_modeling['Disclosure_Date'].values.astype(float)
df_modeling['Actually_File_Date'] = df_modeling['Actually_File_Date'].values.astype(float)
df_modeling['Country_WIPO_ID'] =df_modeling['Country_WIPO_ID'].astype('category').cat.codes
df_modeling['Is_Priority'] =df_modeling['Is_Priority'].astype('category').cat.codes
df_modeling['Lawfirm'] =df_modeling['Lawfirm'].astype('category').cat.codes
df_modeling['Attorney'] =df_modeling['Attorney'].astype('category').cat.codes
df_modeling['Application_Type'] =df_modeling['Application_Type'].astype('category').cat.codes
df_modeling['Lead_Inventor'] =df_modeling['Lead_Inventor'].astype('category').cat.codes
df_modeling['Division_Department'] =df_modeling['Division_Department'].astype('category').cat.codes
df_modeling['Primary_Division'] =df_modeling['Primary_Division'].astype('category').cat.codes
df_modeling['Owners'] =df_modeling['Owners'].astype('category').cat.codes
df_modeling['Ability_of_investigator_to_continue_research'] =df_modeling['Ability_of_investigator_to_continue_research'].astype('category').cat.codes
df_modeling['Ability_to_advance_the_project_outside_the_lab'] =df_modeling['Ability_to_advance_the_project_outside_the_lab'].astype('category').cat.codes
df_modeling['Compelling_nature_of_data'] =df_modeling['Compelling_nature_of_data'].astype('category').cat.codes
df_modeling['Detectability_of_infringement_and_enforceability'] =df_modeling['Detectability_of_infringement_and_enforceability'].astype('category').cat.codes
df_modeling['Development_and_regulatory_path_for_the_product'] =df_modeling['Development_and_regulatory_path_for_the_product'].astype('category').cat.codes
df_modeling['Freedom-to-operate_FTO_issues'] =df_modeling['Freedom-to-operate_FTO_issues'].astype('category').cat.codes
df_modeling['Historical_cooperation_or_not_of_investigator'] =df_modeling['Historical_cooperation_or_not_of_investigator'].astype('category').cat.codes
df_modeling['Identity_of_the_eventual_product'] =df_modeling['Identity_of_the_eventual_product'].astype('category').cat.codes
df_modeling['Impact_of_patent_on_adoption_of_technology'] =df_modeling['Impact_of_patent_on_adoption_of_technology'].astype('category').cat.codes
df_modeling['Industrial_startup_co-ownership_of_the_IP'] =df_modeling['Industrial_startup_co-ownership_of_the_IP'].astype('category').cat.codes
df_modeling['Institution'] =df_modeling['Institution'].astype('category').cat.codes
df_modeling['Licensing_interest_by_a_specific_company'] =df_modeling['Licensing_interest_by_a_specific_company'].astype('category').cat.codes
df_modeling['Market_feedback'] =df_modeling['Market_feedback'].astype('category').cat.codes
df_modeling['Nature_of_improvement_over_existing_art'] =df_modeling['Nature_of_improvement_over_existing_art'].astype('category').cat.codes
df_modeling['Patentability_questions'] =df_modeling['Patentability_questions'].astype('category').cat.codes
df_modeling['Risk_cost_sharing_w_other_institution'] =df_modeling['Risk_cost_sharing_w_other_institution'].astype('category').cat.codes
df_modeling['Size_of_Market'] =df_modeling['Size_of_Market'].astype('category').cat.codes
df_modeling['Stage_of_research'] =df_modeling['Stage_of_research'].astype('category').cat.codes

In [354]:
# date fields
df_modeling_2012['Disclosure_Date'] = df_modeling_2012['Disclosure_Date'].values.astype(float)
df_modeling_2012['Actually_File_Date'] = df_modeling_2012['Actually_File_Date'].values.astype(float)
df_modeling_2012['Country_WIPO_ID'] =df_modeling_2012['Country_WIPO_ID'].astype('category').cat.codes
df_modeling_2012['Is_Priority'] =df_modeling_2012['Is_Priority'].astype('category').cat.codes
df_modeling_2012['Lawfirm'] =df_modeling_2012['Lawfirm'].astype('category').cat.codes
df_modeling_2012['Attorney'] =df_modeling_2012['Attorney'].astype('category').cat.codes
df_modeling_2012['Application_Type'] =df_modeling_2012['Application_Type'].astype('category').cat.codes
df_modeling_2012['Lead_Inventor'] =df_modeling_2012['Lead_Inventor'].astype('category').cat.codes
df_modeling_2012['Division_Department'] =df_modeling_2012['Division_Department'].astype('category').cat.codes
df_modeling_2012['Primary_Division'] =df_modeling_2012['Primary_Division'].astype('category').cat.codes
df_modeling_2012['Owners'] =df_modeling_2012['Owners'].astype('category').cat.codes
df_modeling_2012['Ability_of_investigator_to_continue_research'] =df_modeling_2012['Ability_of_investigator_to_continue_research'].astype('category').cat.codes
df_modeling_2012['Ability_to_advance_the_project_outside_the_lab'] =df_modeling_2012['Ability_to_advance_the_project_outside_the_lab'].astype('category').cat.codes
df_modeling_2012['Compelling_nature_of_data'] =df_modeling_2012['Compelling_nature_of_data'].astype('category').cat.codes
df_modeling_2012['Detectability_of_infringement_and_enforceability'] =df_modeling_2012['Detectability_of_infringement_and_enforceability'].astype('category').cat.codes
df_modeling_2012['Development_and_regulatory_path_for_the_product'] =df_modeling_2012['Development_and_regulatory_path_for_the_product'].astype('category').cat.codes
df_modeling_2012['Freedom-to-operate_FTO_issues'] =df_modeling_2012['Freedom-to-operate_FTO_issues'].astype('category').cat.codes
df_modeling_2012['Historical_cooperation_or_not_of_investigator'] =df_modeling_2012['Historical_cooperation_or_not_of_investigator'].astype('category').cat.codes
df_modeling_2012['Identity_of_the_eventual_product'] =df_modeling_2012['Identity_of_the_eventual_product'].astype('category').cat.codes
df_modeling_2012['Impact_of_patent_on_adoption_of_technology'] =df_modeling_2012['Impact_of_patent_on_adoption_of_technology'].astype('category').cat.codes
df_modeling_2012['Industrial_startup_co-ownership_of_the_IP'] =df_modeling_2012['Industrial_startup_co-ownership_of_the_IP'].astype('category').cat.codes
df_modeling_2012['Institution'] =df_modeling_2012['Institution'].astype('category').cat.codes
df_modeling_2012['Licensing_interest_by_a_specific_company'] =df_modeling_2012['Licensing_interest_by_a_specific_company'].astype('category').cat.codes
df_modeling_2012['Market_feedback'] =df_modeling_2012['Market_feedback'].astype('category').cat.codes
df_modeling_2012['Nature_of_improvement_over_existing_art'] =df_modeling_2012['Nature_of_improvement_over_existing_art'].astype('category').cat.codes
df_modeling_2012['Patentability_questions'] =df_modeling_2012['Patentability_questions'].astype('category').cat.codes
df_modeling_2012['Risk_cost_sharing_w_other_institution'] =df_modeling_2012['Risk_cost_sharing_w_other_institution'].astype('category').cat.codes
df_modeling_2012['Size_of_Market'] =df_modeling_2012['Size_of_Market'].astype('category').cat.codes
df_modeling_2012['Stage_of_research'] =df_modeling_2012['Stage_of_research'].astype('category').cat.codes

### Scale data

In [355]:
df_modeling.head()

Unnamed: 0,Disclosure_Date,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Lead_Inventor,Division_Department,...,Stage_of_research,topic1,topic2,topic3,lda_topic,Licenses_in_Division,Successful Licenses_in_Division,Division_License_Success_Rate,Licenses_in_Tech,Disclosure_to_Filing
0,6.816096e+17,81,6.91632e+17,2,68,94,1.0,9,109,95,...,0,0.052483,0.872408,0.075109,2.0,2408,1037,0.430648,1,116.0
1,9.847008e+17,81,9.847008e+17,2,15,126,1.0,9,48,1,...,0,0.029761,0.731949,0.23829,2.0,2408,1037,0.430648,1,0.0
2,1.306922e+18,81,1.339718e+18,2,68,116,3.0,7,287,112,...,0,0.044284,0.426866,0.52885,3.0,2408,1037,0.430648,1,379.0
3,1.054894e+18,5,1.211933e+18,0,68,116,4.0,10,194,61,...,0,0.861714,0.067595,0.070691,1.0,2408,1037,0.430648,9,1817.0
4,1.054894e+18,9,1.133222e+18,0,68,116,4.0,5,194,61,...,0,0.861715,0.067595,0.07069,1.0,2408,1037,0.430648,9,906.0


In [356]:
scaler = MinMaxScaler()
df_modeling = pd.DataFrame(scaler.fit_transform(df_modeling), columns = df_modeling.columns)
df_modeling_2012 = pd.DataFrame(scaler.fit_transform(df_modeling_2012), columns = df_modeling_2012.columns)
df_modeling.head()

Unnamed: 0,Disclosure_Date,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Lead_Inventor,Division_Department,...,Stage_of_research,topic1,topic2,topic3,lda_topic,Licenses_in_Division,Successful Licenses_in_Division,Division_License_Success_Rate,Licenses_in_Tech,Disclosure_to_Filing
0,0.372629,0.94186,0.370048,1.0,0.701031,0.657343,0.0,0.9,0.331307,0.633333,...,0.5,0.041752,0.897451,0.064076,0.5,1.0,1.0,0.45659,0.0,0.510944
1,0.571259,0.94186,0.561741,1.0,0.154639,0.881119,0.0,0.9,0.145897,0.006667,...,0.5,0.018005,0.750324,0.238005,0.5,1.0,1.0,0.45659,0.0,0.50278
2,0.782425,0.94186,0.793953,1.0,0.701031,0.811189,0.111111,0.7,0.87234,0.746667,...,0.5,0.033183,0.430758,0.547701,1.0,1.0,1.0,0.45659,0.0,0.529453
3,0.617259,0.05814,0.71037,0.0,0.701031,0.811189,0.166667,1.0,0.589666,0.406667,...,0.5,0.88748,0.054432,0.059367,0.0,1.0,1.0,0.45659,0.089888,0.630657
4,0.617259,0.104651,0.658887,0.0,0.701031,0.811189,0.166667,0.5,0.589666,0.406667,...,0.5,0.887481,0.054432,0.059366,0.0,1.0,1.0,0.45659,0.089888,0.566542


### Conduct KNN Imputation

Resources: https://medium.com/@kyawsawhtoon/a-guide-to-knn-imputation-95e2dc496e

In [357]:
imputer = KNNImputer(n_neighbors=5)
df_modeling = pd.DataFrame(imputer.fit_transform(df_modeling),columns = df_modeling.columns)
df_modeling_2012 = pd.DataFrame(imputer.fit_transform(df_modeling_2012),columns = df_modeling_2012.columns)

In [358]:
df_modeling.isnull().sum()

Disclosure_Date                                     0
Country_WIPO_ID                                     0
Actually_File_Date                                  0
Is_Priority                                         0
Lawfirm                                             0
Attorney                                            0
Number_of_Inventors                                 0
Application_Type                                    0
Lead_Inventor                                       0
Division_Department                                 0
Primary_Division                                    0
Owners                                              0
Ability_of_investigator_to_continue_research        0
Ability_to_advance_the_project_outside_the_lab      0
Compelling_nature_of_data                           0
Detectability_of_infringement_and_enforceability    0
Development_and_regulatory_path_for_the_product     0
Freedom-to-operate_FTO_issues                       0
Historical_cooperation_or_no

In [359]:
# Merge patent status back onto modeling dataset 
df_modeling = df_modeling.join(license_status)
df_modeling_2012 = df_modeling_2012.join(license_status_2012)
df_modeling_2012.head()

Unnamed: 0,Disclosure_Date,Country_WIPO_ID,Actually_File_Date,Is_Priority,Lawfirm,Attorney,Number_of_Inventors,Application_Type,Lead_Inventor,Division_Department,...,topic1,topic2,topic3,lda_topic,Licenses_in_Division,Successful Licenses_in_Division,Division_License_Success_Rate,Licenses_in_Tech,Disclosure_to_Filing,License_Status
0,0.620401,0.941176,0.041535,1.0,0.647059,0.784314,0.2,0.571429,0.88764,0.645161,...,0.034306,0.427883,0.550959,1.0,1.0,1.0,0.0,0.0,0.162785,license
1,0.725569,0.941176,0.199264,1.0,0.490196,0.509804,0.2,0.571429,0.331461,0.44086,...,0.032536,0.038858,0.950607,1.0,1.0,1.0,0.0,0.0,0.104683,license
2,0.800943,0.941176,0.42245,1.0,0.156863,0.843137,0.2,0.571429,0.926966,0.268817,...,0.73632,0.176054,0.108932,0.0,1.0,1.0,0.0,0.0,0.115452,no_license
3,0.700081,0.941176,0.153785,1.0,0.647059,0.784314,0.4,0.571429,0.853933,0.075269,...,0.178352,0.10811,0.734465,1.0,1.0,1.0,0.0,0.0,0.115327,no_license
4,0.866637,0.941176,0.640641,0.5,0.490196,0.509804,0.5,0.571429,0.331461,0.44086,...,0.20541,0.393692,0.415415,1.0,1.0,1.0,0.0,0.0,0.136113,license


In [364]:
#df_modeling_2012.to_csv('modeling_license_2012.csv')

In [94]:
# df_modeling_2012 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(2012,1,1))]  last 10-year records
# df_modeling_2007 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(2007,1,1))]  last 15-year records
# df_modeling_2002 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(2002,1,1))]  last 20-year records
# df_modeling_1997 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(1997,1,1))]  last 25-year records
# df_modeling_1992 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(1992,1,1))]  last 30-year records

# print("Actually File Date after 2012:", df_modeling_2012.shape)  (1267, 41)
# print("Actually File Date after 2007:", df_modeling_2007.shape)  (1768, 41)
# print("Actually File Date after 2002:", df_modeling_2002.shape)  (2288, 41)
# print("Actually File Date after 1997:", df_modeling_1997.shape)  (2696, 41)
# print("Actually File Date after 1992:", df_modeling_1992.shape)  (3067, 41)

#### Why do we see such a hike in licensed inventions once the patent dataset is included 

In [365]:
df_modeling_2012['License_Status'].value_counts()/len(df_modeling_2012)*100

license       78.104027
no_license    21.895973
Name: License_Status, dtype: float64

In [96]:
df_tech['License_Status'].value_counts()/len(df_tech)*100

no_license    71.428571
license       28.571429
Name: License_Status, dtype: float64