In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import missingno as msno
import plotly.graph_objects as go
from datetime import date
%matplotlib inline


from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# topic modeling packages 
import nltk
#nltk.download('stopwords')
import re
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel# spaCy for preprocessing
from gensim import similarities
import spacy# Plotting tools
import pyLDAvis
from matplotlib import pyplot

from pprint import pprint
import warnings
#warnings.filterwarnings("ignore")
import pickle

# sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, metrics
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix,classification_report, precision_recall_curve, f1_score, auc
from xgboost import XGBClassifier
import statsmodels.api as sm

# imbelearn package
from imblearn.over_sampling import SMOTE

import warnings
# Suppress every warning for the rest of the notebook. This also hides pandas/scikit-learn
# deprecation and SettingWithCopy notices, so re-enable it when debugging.
warnings.filterwarnings("ignore")

# Display all rows of any DataFrame/Series shown in a cell (no truncation).
pd.set_option('display.max_rows', None)

  and should_run_async(code)


ImportError: cannot import name '_centered' from 'scipy.signal.signaltools' (/opt/anaconda3/lib/python3.8/site-packages/scipy/signal/signaltools.py)

## 1. Tech Dataset

In [None]:
# Read the cleaned technology export; read_csv already returns a DataFrame.
df_tech = pd.read_csv("Tech_cleaned.csv")
print("Technology Shape:", df_tech.shape)

### 1.1 Feature Engineering

In [None]:
# Keep only the rows whose IP_Type is 'Patent'.
# reset_index() (without drop=True) keeps the previous index as an extra 'index' column.
is_patent = df_tech['IP_Type'] == 'Patent'
df_tech = df_tech.loc[is_patent].reset_index()
df_tech.shape

In [None]:
# Only keeping major divisions with sufficient number of patents. When a record lists several
# divisions, the first key of this dict (in insertion order) found in the text wins, so more
# specific names must come before names they contain (e.g. 'University of Chicago Hospital'
# before 'University of Chicago').
division_mapping = {'PSD':'PSD',
                    'BSD':'BSD',
                    'PME':'PME',
                    'Argonne National Laboratory':'ANL',
                    'Marine Biological Laboratory':'MBL',
                    'Booth':'Booth',
                    'University of Chicago Hospital':'UCH',
                    'SSD':'SSD',
                    'Comprehensive Cancer Center':'CCC',
                    'University of Chicago':'UC',
                    'Toyota Technological Institute':'TTI',
                    'Humanities':'Humanities',
                    'Harris':'Harris',
                    'Institute of Politics':'Politics'}

# Replace a missing division with the 'NA' placeholder. The column label is required here:
# `df.loc[row_mask] = 'NA'` without it overwrites EVERY column of the matching rows.
df_tech.loc[df_tech['Division_Department'].isnull(), 'Division_Department'] = 'NA'


def _primary_division(division_text):
    """Return the short code of the first mapped division found in `division_text`.

    Falls back to 'NA' for the missing-value placeholder and to 'Others' when no
    mapped division name occurs in the text.
    """
    for division_name, short_code in division_mapping.items():
        if division_name in division_text:
            return short_code
    return 'NA' if division_text == 'NA' else 'Others'


# Column-wise assignment replaces the row loop with chained indexing
# (`df['col'][i] = ...`), which is not guaranteed to write through to the frame.
df_tech['Primary_Division'] = df_tech['Division_Department'].map(_primary_division)

df_tech[['Division_Department','Primary_Division']].head(15)

In [None]:
# Percentage of all technology records falling in each primary division
df_tech['Primary_Division'].value_counts().div(len(df_tech)).mul(100)

In [None]:
# Divisions holding less than 0.5% of the records (per the breakout above) are folded into a
# single 'Other' bucket, together with the unmapped 'Others' group.
other = ('Humanities', 'SSD', 'UCH', 'Booth', 'UC', 'Harris', 'TTI', 'CCC', 'Others')

in_small_division = df_tech['Primary_Division'].isin(other)
df_tech.loc[in_small_division, 'Primary_Division'] = 'Other'

# Confirm changes: percentage breakout after consolidation
df_tech['Primary_Division'].value_counts().div(len(df_tech)).mul(100)

In [None]:
# Free-text columns that describe a technology
description_columns = ['Brief_Technology_Description', 'Assessment_Description', 'Abstract']

# Values carrying no information are turned into NaN so they can be back-filled below:
#   - blank / whitespace-only strings
#   - cross references starting with "SEE " or "See "
uninformative_patterns = [r'^\s*$', r'^SEE .*', r'^See .*']
for column in description_columns:
    for pattern in uninformative_patterns:
        df_tech[column] = df_tech[column].replace(pattern, np.nan, regex=True)

# Brief_Technology_Description is the base column; its gaps are filled, in order, from
# Assessment_Description, then Abstract, then Title.
for fallback_column in ['Assessment_Description', 'Abstract', 'Title']:
    df_tech['Brief_Technology_Description'] = df_tech['Brief_Technology_Description'].fillna(df_tech[fallback_column])

# The two donor description columns are no longer needed once merged
df_tech = df_tech.drop(columns=['Assessment_Description', 'Abstract'])

# Sanity check: the ten most frequent descriptions should not reveal placeholder text
df_tech.Brief_Technology_Description.value_counts().sort_values(ascending=False).nlargest(10)

### 1.2 Filtering

In [None]:
# Keep only technologies for which we are the lead institution
we_lead = df_tech['We_are_not_the_lead_institution'] == 'No'
df_tech = df_tech.loc[we_lead]

# Status values treated as a (potential) license vs. values treated as never licensed.
# A Status that appears in neither tuple leaves License_Status missing for that row.
licensed = ('Non-Exclusively Licensed', 'Exclusively Licensed', 'Optioned','Seeking Licensees', 'Post Election Hold', 'IP Authorized', 'Pending Title Election Decision')
never_licensed = ('Closed/Inactive', 'Waived Rights to Inventor', 'Awaiting Expiration','Licenses at Potential', 'IIA - Other Party Leads', 'Jointly Owned - UoC Leads', 'Combined with other Tech', 'Jointly Owned - Other Party Leads', 'Awaiting Info from Inventors', 'Negotiating License')

for status_group, label in ((licensed, 'license'), (never_licensed, 'no_license')):
    df_tech.loc[df_tech['Status'].isin(status_group), 'License_Status'] = label

df_tech.License_Status.value_counts()

### 1.3 Drop Columns

In [None]:
# Technology columns carried into the modeling dataset.
# This is the single source of truth for the selection below. The earlier version of this list
# was never used: it named 'Tech ID' (with a space) and the already-deleted 'Abstract' /
# 'Assessment_Description' columns, and omitted 'Primary_Division'.
columns_keep = ['Tech_ID',
'Title',
'Lead_Inventor',
'Disclosure_Date',
'Division_Department',
'Primary_Division',
'Owners',
'Ability_of_investigator_to_continue_research',
'Ability_to_advance_the_project_outside_the_lab',
'Brief_Technology_Description',
'Compelling_nature_of_data',
'Detectability_of_infringement_and_enforceability',
'Development_and_regulatory_path_for_the_product',
'Freedom-to-operate_FTO_issues',
'Historical_cooperation_or_not_of_investigator',
'Identity_of_the_eventual_product',
'Impact_of_patent_on_adoption_of_technology',
'Industrial_startup_co-ownership_of_the_IP',
'Institution',
'Licensing_interest_by_a_specific_company',
'Market_feedback',
'Market_Size',
'Nature_of_improvement_over_existing_art',
'Patentability_questions',
'Risk_cost_sharing_w_other_institution',
'Size_of_Market',
'Stage_of_research',
'License_Status']

# Select the columns and rename 'Title' so it cannot clash with the patent title after merging.
# rename() returns a new frame, avoiding an in-place edit of a slice of df_tech.
df_tech_keep = df_tech[columns_keep].rename(columns={'Title': 'Tech_Title'})

df_tech_keep.shape

In [None]:
# Number of missing values in each retained technology column
tech_missing = df_tech_keep.isna().sum()
print(tech_missing)

## 2. Patent Dataset

In [None]:
# Read the cleaned patent export; read_csv already returns a DataFrame.
df_pat = pd.read_csv("patentData_Cleaned.csv")
print("Patent Shape:", df_pat.shape)

In [None]:
# Column names of the raw patent dataset
df_pat.columns.tolist()

### 2.1 Feature Engineering

### Consolidate the "File_Date" and "Date_Actually_Filed" into a new column named "Actually_File_Date"

In [None]:
# "Date_Actually_Filed" is the primary source; where it is missing the value from "File_Date"
# is used instead. The result is stored in the new column "Actually_File_Date".
df_pat["Actually_File_Date"] = df_pat["Date_Actually_Filed"].combine_first(df_pat["File_Date"])

# Drop the two source columns by keyword. Passing the axis positionally (`drop(cols, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
df_pat = df_pat.drop(columns=["Date_Actually_Filed", "File_Date"])

# Rows where both source columns were missing remain null in the combined column
miss_num = df_pat["Actually_File_Date"].isnull().sum()
print("Actually_File_Date is missing:", miss_num)

In [None]:
# Discard the records that have no filing date at all
df_pat = df_pat.dropna(subset=['Actually_File_Date'])

# Parse the month/day/year text into datetime values
filing_date_text = df_pat['Actually_File_Date'].astype(str)
df_pat['Actually_File_Date'] = pd.to_datetime(filing_date_text, format='%m/%d/%Y')

# Confirm no null filing dates remain
df_pat["Actually_File_Date"].isna().sum()

In [None]:
# Split the comma-separated "Inventors" text into individual names and count the inventors
# listed on each patent.

# N is the number of inventors for each patent, the range of N is [1,19] and the average of N is 3. 
# For modeling purposes only the first n inventors are kept, one per column.
n = 5  
inventor_names = [f'Inventors_{i}' for i in range(n)]
# df_new column 0: list of name strings (not whitespace-stripped); column 1: length of that list.
# The value is passed through str() first, so a missing Inventors entry becomes the text 'nan'
# and is counted as one inventor.
df_new = df_pat['Inventors'].map(lambda x:(str(x).split(','),len(str(x).split(',')))).apply(pd.Series)
# Truncate each list to n names and expand to one column per position.
# NOTE(review): the column assignment below requires exactly n expanded columns, i.e. at
# least one patent listing n or more inventors — confirm against the data.
df_inventor = df_new[0].apply(lambda x:x[:n]).apply(pd.Series)
df_inventor.columns=inventor_names
df_inventor.head(10)

In [None]:
# One-column frame holding the total number of inventors listed on each patent
df_cnt = df_new[1].to_frame(name='Number_of_Inventors')
df_cnt.head(10)

In [None]:
# Attach 'Number_of_Inventors' (aligned on the row index), then discard the raw text column
df_pat = pd.concat([df_pat, df_cnt], axis=1)
df_pat = df_pat.drop(columns=["Inventors"])
df_pat.head(2)

In [None]:
# Discard patent records whose 'Status' is missing
df_pat = df_pat.dropna(subset=['Status'])

# Confirm removal
df_pat['Status'].isna().sum()

In [None]:
# Patent columns available before feature selection
df_pat.columns.tolist()

### 2.2 Drop Columns for Patent Data

In [None]:
# Patent columns carried into the modeling dataset (single source of truth for the selection)
columns_keep_pat = ['Tech_ID',
'Title',
'Country_WIPO_ID',
'Actually_File_Date',
'Is_Priority',
'Lawfirm',
'Attorney',
'Number_of_Inventors',
'Application_Type'
]

# Select via the list above instead of repeating it, and rename 'Title' so it cannot clash with
# the technology title after merging. rename() returns a new frame, so no slice of df_pat is
# modified in place.
df_pat_keep = df_pat[columns_keep_pat].rename(columns={'Title': 'Patent_Title'})

# Check missing values in the kept columns (they are imputed with "Other" in the next cell)
df_pat_keep.isnull().sum()

In [None]:
# Impute missing values in the categorical patent columns with the placeholder "Other".
# A frame-level fillna with a per-column mapping replaces `df[col].fillna(..., inplace=True)`,
# which mutates a temporary column object and is not guaranteed to update the frame.
columns_to_impute = ["Is_Priority", "Lawfirm", "Attorney"]
df_pat_keep = df_pat_keep.fillna({column: "Other" for column in columns_to_impute})

df_pat_keep.isnull().sum()

In [None]:
# Row/column count of the patent data kept for license modeling.
# Author's note: this is slightly higher than the count used for patent modeling, because that
# workflow removed some records with an NA patent status. Those records are not automatically
# removed for the purposes of license modeling.
df_pat_keep.shape

### 3. Merge Tech and Patent datasets with columns_keep

In [None]:
# Inner join on Tech_ID: keep only the patent rows that have a matching technology record.
# The left (patent) index is preserved; Tech_ID stays as a column and is dropped later.
tech_by_id = df_tech_keep.set_index(["Tech_ID"])
df_modeling = df_pat_keep.join(tech_by_id, on=["Tech_ID"], how="inner", lsuffix="_x", rsuffix="_y")

df_modeling.shape

In [None]:
# Columns of the merged technology + patent dataset
df_modeling.columns.tolist()

### 3.1 Duplicate Detection and consolidation to unique records only

In [None]:
# Compare the number of distinct titles with the number of rows to gauge duplication
unique_tech_titles = df_modeling['Tech_Title'].nunique()
unique_patent_titles = df_modeling['Patent_Title'].nunique()

print("Unique Technology titles in the Modeling Dataset:", unique_tech_titles)

print("Unique Patent titles in the Merged Dataset:", unique_patent_titles)

print("Shape of the Merged dataset:", df_modeling.shape)

In [None]:
# All rows whose Tech_Title/Patent_Title pair occurs more than once
# (keep=False flags every member of a repeated pair, not only the later occurrences)
pair_is_repeated = df_modeling.duplicated(subset=['Tech_Title','Patent_Title'], keep=False)
duplicates = df_modeling[pair_is_repeated]
duplicates.shape

In [None]:
# Preview the first rows that share a Tech_Title/Patent_Title pair with another row
duplicates.head()

In [None]:
# The 10 technologies with the most associated rows in the dataset
tech_title_counts = df_modeling.Tech_Title.value_counts()
tech_title_counts.sort_values(ascending=False).nlargest(10)

In [None]:
# Missing-value count for every column of the merged dataset
df_modeling.isna().sum()

### Consolidate "Application Type" to determine a unique patent

In [None]:
# Order rows by disclosure date, then by filing date, so the earliest filing comes first
# within each group formed below
sort_keys = ['Disclosure_Date', 'Actually_File_Date']
df_modeling = df_modeling.sort_values(by=sort_keys)
df_modeling.head(10)

In [None]:
# Asjust Patent_Title column to lowercase all values and remove extra whitespace to avoid duplicates 
# lowercase 
df_modeling['Patent_Title'] = df_modeling['Patent_Title'].str.lower()
# remove extra white space 
df_modeling['Patent_Title'] = df_modeling['Patent_Title'].str.strip()

# Group by Tech_Title, Patent_Title, Disclosure_Date, and Country_WIPO_ID. 
# These columns indicate a unique record for the purposes of modeling  
df_modeling = df_modeling.groupby(['Tech_Title','Disclosure_Date','Country_WIPO_ID'])
df_modeling.head(10)

In [None]:
# Take the first available Actually_Filed_Date from that unique entry
df_modeling = df_modeling.first().reset_index()
df_modeling.shape

In [None]:
# Remove the test record(s) whose Tech_ID contains "TEST01".
# The original row labels are kept, so the index has gaps after this step.
has_test_id = df_modeling["Tech_ID"].str.contains("TEST01")
df_modeling = df_modeling[has_test_id == False]
df_modeling.shape

In [None]:
# Disabled on purpose: Tech_ID is kept for now and is dropped later, after the unscaled copies
# of the dataset have been saved.
#df_modeling = df_modeling.drop(columns=['Tech_ID'])

In [None]:
# Check the distribution of Application_Type values to judge whether the de-duplication above
# kept the correct application for each unique record
df_modeling['Application_Type'].value_counts()

### 3.2 Incorporate LDA Topic modeling to add to columns 

In [None]:
# Prepare stopwords: the English stop-word list from NLTK.
# Requires the NLTK 'stopwords' corpus to be installed (see the commented-out
# nltk.download('stopwords') call in the import cell).
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
# Collect every technology title as a plain Python list
data = df_modeling.Tech_Title.values.tolist()

def sent_to_words(sentences):
    """Lazily yield one list of lowercase tokens per input sentence.

    Each sentence is converted with str() and tokenized by gensim's simple_preprocess;
    deacc=True additionally strips accent marks from the tokens.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

data_words = list(sent_to_words(data))
print(data_words[:1])

In [None]:
# Build the bigram and trigram phrase models from the tokenized titles.
# The trigram model is trained on the bigram-transformed corpus.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold -> fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser wraps a trained Phrases model for faster application to token lists
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Example: the first title after applying the bigram and then the trigram model
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return, for each document in `texts`, its tokens that are not in `stop_words`.

    Each document is converted with str() and re-tokenized by simple_preprocess before
    filtering, so already-tokenized documents are accepted as well.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([word for word in tokens if word not in stop_words])
    return filtered_docs

def make_bigrams(texts):
    """Apply the trained bigram model to every tokenized document in `texts`."""
    bigram_docs = []
    for doc in texts:
        bigram_docs.append(bigram_mod[doc])
    return bigram_docs

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every tokenized document."""
    trigram_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        trigram_docs.append(trigram_mod[with_bigrams])
    return trigram_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each one is joined with spaces and parsed by the
        notebook-level spaCy pipeline `nlp`, which must be loaded before this is called.
    allowed_postags : iterable of str
        Coarse part-of-speech tags (spaCy `token.pos_` values) to keep. The default is an
        immutable tuple so it cannot be modified between calls.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.

    Tag reference: https://spacy.io/api/annotation
    """
    allowed = set(allowed_postags)  # constant-time membership test
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return texts_out

In [None]:
# Remove stop words from the tokenized titles
data_words_nostops = remove_stopwords(data_words)

# Form bigrams (only the bigram model is applied here; make_trigrams is not called)
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline with the parser and NER components disabled (for efficiency).
# The model must be installed first:
# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first title
print(data_lemmatized[:1])

In [None]:
# Vocabulary: maps each distinct lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)
# Documents to vectorize
texts = data_lemmatized
# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))
# Show the first document's bag of words
print(corpus[:1])

In [None]:
# Train the LDA topic model on the title corpus.
# random_state is fixed so the topics are reproducible between runs.
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(**lda_settings)

In [None]:
# Print the top keywords of every topic
pprint(lda_model.print_topics())
# Apply the LDA model to the corpus of titles; the result is used below to assign a majority
# topic to each Tech_Title.
# NOTE(review): because the model was built with per_word_topics=True, each element of doc_lda
# is expected to be a 3-tuple whose first item is the list of (topic_id, probability) pairs —
# confirm against the gensim documentation.
doc_lda = lda_model[corpus]

In [None]:
# Model fit on the training corpus.
# log_perplexity() returns the per-word likelihood BOUND (a negative number; closer to zero is
# better), not the perplexity itself. Perplexity is 2 ** (-bound), and lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

In [None]:
# Visualize the topics interactively with pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# mds='mmds' selects the scaling method used to lay the topics out in 2-D
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds')
vis

In [None]:
# Build a dense (document x topic) probability table for every Tech_Title.
# Probabilities are looked up by topic id rather than by position, so the table stays correct
# even when gensim omits a very unlikely topic from a document's result.
num_topics = lda_model.num_topics
topic_columns = [f'topic{topic_id + 1}' for topic_id in range(num_topics)]

topic_rows = []
for bow in corpus:
    # (topic_id, probability) pairs; minimum_probability=0.0 keeps all but vanishing topics
    doc_topics = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    topic_rows.append([float(doc_topics.get(topic_id, 0.0)) for topic_id in range(num_topics)])

topics = pd.DataFrame(topic_rows, columns=topic_columns)

# 'lda_topic' is the 1-based number of the topic with the largest weight for the title.
# On an exact tie the lowest topic number wins; stored as float, as before.
topics['lda_topic'] = topics[topic_columns].to_numpy().argmax(axis=1) + 1.0

# Check results
topics.head(7)

### Topics are relatively evenly distributed across the 3 topics

In [None]:
# Percentage of titles assigned to each majority topic
topics['lda_topic'].value_counts().div(len(topics)).mul(100)

In [None]:
#bind with main dataset
df_modeling = pd.concat([df_modeling, topics], axis=1)
df_modeling.head(7)

### More Feature Engineering

In [None]:
df_modeling['Primary_Division'].value_counts()

In [None]:
#Since MBL does not have many patent applications, combine it into 'Other'
df_modeling.loc[df_modeling['Primary_Division']=='MBL','Primary_Division'] = 'Other'

df_modeling['Primary_Division'].value_counts()

In [None]:
#Create licence counts by primary division
num_licenses_division = df_modeling['Primary_Division'].value_counts().reindex(
    df_modeling.Primary_Division.unique(), fill_value=0)
num_success_licenses_division = df_modeling.loc[df_modeling['License_Status']=='license']['Primary_Division'].value_counts().reindex(
    df_modeling.Primary_Division.unique(), fill_value=0)
licenses_by_division = pd.DataFrame({'Primary_Division':num_licenses_division.index, 'Licenses_in_Division':num_licenses_division.values, 'Successful Licenses_in_Division':num_success_licenses_division.values})

licenses_by_division

In [None]:
#Create patent success rate by primary division and drop NaN row
licenses_by_division['Division_License_Success_Rate'] = licenses_by_division['Successful Licenses_in_Division']/licenses_by_division['Licenses_in_Division']
licenses_by_division = licenses_by_division.drop([4])
licenses_by_division

In [None]:
# Attach the per-division license counts and success rate to every record
# (inner merge on the columns the two frames share)
df_modeling = pd.merge(df_modeling, licenses_by_division)

In [None]:
# Number of records per technology family (Tech_Title)
num_licenses_tech = df_modeling['Tech_Title'].value_counts()
licenses_by_tech = (
    num_licenses_tech
    .rename_axis('Tech_Title')
    .reset_index(name='Licenses_in_Tech')
)
licenses_by_tech

In [None]:
# Attach the per-technology record count to every record
# (inner merge on the columns the two frames share)
df_modeling = pd.merge(df_modeling, licenses_by_tech)
df_modeling.head(10)

In [None]:
# Time between the disclosure date and the actual filing date, in whole days
df_modeling['Actually_File_Date'] = pd.to_datetime(df_modeling['Actually_File_Date'])
df_modeling['Disclosure_Date'] = pd.to_datetime(df_modeling['Disclosure_Date'])

filing_lag = df_modeling['Actually_File_Date'] - df_modeling['Disclosure_Date']
# .dt.days replaces astype('timedelta64[D]'), which raises on pandas >= 2.0.
# The result is cast to float64 so the column dtype is the same with or without missing dates.
df_modeling['Disclosure_to_Filing'] = filing_lag.dt.days.astype('float64')

### Imputation of null values not subject to KNN Imputation

In [None]:
# Missing-value count per column after feature engineering
df_modeling.isna().sum()

In [None]:
# Drop rows where any topic probability is missing
df_modeling = df_modeling.dropna(subset=['topic1', 'topic2', 'topic3'])

# Missing Owners become "Not_Listed"; missing Institution becomes "Other"
df_modeling['Owners'] = df_modeling['Owners'].fillna('Not_Listed')
df_modeling['Institution'] = df_modeling['Institution'].fillna('Other')

# Drop the free-text fields (Brief_Technology_Description, Tech_Title, Patent_Title), which add
# no value for modeling, and Market_Size, which has too many unique values
df_modeling = df_modeling.drop(
    columns=['Brief_Technology_Description', 'Tech_Title', 'Patent_Title', 'Market_Size'])

# Subset of records FILED after 2012-01-01 (last 10-year records).
# .copy() makes it an independent frame, so dropping a column below cannot touch df_modeling.
df_modeling_2012 = df_modeling[(df_modeling['Actually_File_Date']>pd.Timestamp(2012,1,1))].copy()

# Set License_Status aside; it is joined back after scaling and imputation.
# Scaling rebuilds the feature frames with a fresh 0..n-1 index, so both label frames are
# re-indexed positionally here. Without this the label frame keeps the gapped index left by the
# row drop above and the later join attaches labels to the wrong rows.
license_status = df_modeling[['License_Status']].reset_index(drop=True)
df_modeling = df_modeling.drop(columns=['License_Status'])

license_status_2012 = df_modeling_2012[['License_Status']].reset_index(drop=True)
df_modeling_2012 = df_modeling_2012.drop(columns=['License_Status'])

### Create copy of datasets prior to scaling in order to merge actual values with modeling predictions

In [None]:
# Row counts of the full dataset and of the post-2012 subset
total_rows = len(df_modeling)
rows_2012 = len(df_modeling_2012)
print('Size of total dataset: ', total_rows)
print('Size of 2012 dataset: ', rows_2012)

In [None]:
# Keep unscaled copies so actual values can later be merged with the model predictions
license_df_cleaned, license_df_cleaned_2012 = df_modeling.copy(), df_modeling_2012.copy()

print('Size of total dataset: ', len(license_df_cleaned))
print('Size of 2012 dataset: ', len(license_df_cleaned_2012))

In [None]:
# Tech_ID is an identifier, not a feature: remove it from both modeling frames
id_columns = ['Tech_ID']
df_modeling = df_modeling.drop(columns=id_columns)
df_modeling_2012 = df_modeling_2012.drop(columns=id_columns)

### Convert categorical variables using integer encoding 

In [None]:
# Date columns are converted to their numeric (float) timestamp representation
DATE_COLUMNS = ['Disclosure_Date', 'Actually_File_Date']

# Categorical columns are replaced by integer category codes
CATEGORICAL_COLUMNS = [
    'Country_WIPO_ID',
    'Is_Priority',
    'Lawfirm',
    'Attorney',
    'Application_Type',
    'Lead_Inventor',
    'Division_Department',
    'Primary_Division',
    'Owners',
    'Ability_of_investigator_to_continue_research',
    'Ability_to_advance_the_project_outside_the_lab',
    'Compelling_nature_of_data',
    'Detectability_of_infringement_and_enforceability',
    'Development_and_regulatory_path_for_the_product',
    'Freedom-to-operate_FTO_issues',
    'Historical_cooperation_or_not_of_investigator',
    'Identity_of_the_eventual_product',
    'Impact_of_patent_on_adoption_of_technology',
    'Industrial_startup_co-ownership_of_the_IP',
    'Institution',
    'Licensing_interest_by_a_specific_company',
    'Market_feedback',
    'Nature_of_improvement_over_existing_art',
    'Patentability_questions',
    'Risk_cost_sharing_w_other_institution',
    'Size_of_Market',
    'Stage_of_research',
]


def encode_for_modeling(frame, date_columns=DATE_COLUMNS, categorical_columns=CATEGORICAL_COLUMNS):
    """Return a numerically encoded copy of `frame`; the input is left untouched.

    Parameters
    ----------
    frame : pandas.DataFrame
        Modeling data containing every column named in the two lists.
    date_columns : list of str
        Datetime columns, converted to float timestamps.
    categorical_columns : list of str
        Columns replaced by their integer category codes (stored as float).

    Missing categorical values stay NaN. `cat.codes` marks them with -1, which would make them
    look like a real category and leave nothing for the later KNN imputation to fill. A column
    with no observed value at all keeps the constant -1, because it cannot be imputed and an
    all-NaN column may be dropped by the imputer.
    """
    encoded = frame.copy()
    for column in date_columns:
        encoded[column] = encoded[column].values.astype(float)
    for column in categorical_columns:
        codes = encoded[column].astype('category').cat.codes
        observed = codes >= 0
        if observed.any():
            codes = codes.where(observed)
        encoded[column] = codes.astype(float)
    return encoded


# The full dataset and the post-2012 subset are encoded independently of each other
df_modeling = encode_for_modeling(df_modeling)
df_modeling_2012 = encode_for_modeling(df_modeling_2012)

### Scale data

In [None]:
# Preview the encoded frame before scaling
df_modeling.head()

In [None]:
# Rescale every feature to the [0, 1] range.
# The same scaler object is re-fitted for each frame; afterwards it holds the 2012 fit.
scaler = MinMaxScaler()

scaled_all = scaler.fit_transform(df_modeling)
df_modeling = pd.DataFrame(scaled_all, columns=df_modeling.columns)

scaled_2012 = scaler.fit_transform(df_modeling_2012)
df_modeling_2012 = pd.DataFrame(scaled_2012, columns=df_modeling_2012.columns)

df_modeling.head()

### Conduct KNN Imputation

Resources: https://medium.com/@kyawsawhtoon/a-guide-to-knn-imputation-95e2dc496e

In [None]:
# Fill remaining missing values from the 5 nearest neighbours.
# The same imputer object is re-fitted for each frame; afterwards it holds the 2012 fit.
imputer = KNNImputer(n_neighbors=5)

imputed_all = imputer.fit_transform(df_modeling)
df_modeling = pd.DataFrame(imputed_all, columns=df_modeling.columns)

imputed_2012 = imputer.fit_transform(df_modeling_2012)
df_modeling_2012 = pd.DataFrame(imputed_2012, columns=df_modeling_2012.columns)

In [None]:
# Confirm that no missing values remain after imputation
df_modeling.isna().sum()

In [None]:
# Re-attach License_Status to the scaled feature frames (left join on the row index)
df_modeling = df_modeling.join(license_status, how='left')
df_modeling_2012 = df_modeling_2012.join(license_status_2012, how='left')
df_modeling_2012.head()

### 6. Review Final License Status Breakout and Length of Dataset

In [None]:
# Percentage of post-2012 records per License_Status value (rows without a status count
# toward the denominator)
license_share_2012 = df_modeling_2012['License_Status'].value_counts().div(len(df_modeling_2012)).mul(100)
print(license_share_2012)

print('\nSize of total dataset: ', len(license_df_cleaned))
print('Size of 2012 dataset: ', len(license_df_cleaned_2012))

In [None]:
df = df_modeling_2012.copy()

# drop unamed column 
df = df.drop(df.columns[0], axis=1)

# Remove row with null patent_status
df = df.dropna(subset=['License_Status'])

# convert funding status to binary 
df.loc[df['License_Status']=='no_license', 'License_Status'] = 0
df.loc[df['License_Status']=='license', 'License_Status'] = 1
df["License_Status"] = df.License_Status.astype(float)

df.head()

In [None]:
X = df.iloc[:, 0:38]
y = df.iloc[:, 38]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

# check split of data
len(x_train), len(y_train), len(x_test), len(y_test), len(df)

In [None]:
# Oversample the minority class of the TRAINING data with SMOTE to balance the target.
# The object is named `smote` because the name `sm` is already bound to `statsmodels.api` by
# the import cell; random_state makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

# check shape of x_train and y_train, and the new target ratio
print(x_train.shape)
print(y_train.shape)
print(y_train.value_counts(normalize = True))

### 6.1 Binary Classification Modeling Pipeline 

In [None]:
# (display name, pipeline step name, estimator) for every candidate classifier:
# four standard binary classifiers followed by five ensemble learners
candidate_models = [
    ('LogisticRegression', 'LR', linear_model.LogisticRegression()),
    ('KNearestNeighbors', 'KNN', KNeighborsClassifier()),
    ('LinearSVC', 'SVC', LinearSVC()),
    ('DecisionTree', 'DTREE', DecisionTreeClassifier()),
    ('BaggingClassifier', 'BAG', BaggingClassifier()),
    ('BoostClassifier', 'BOOST', AdaBoostClassifier()),
    ('RandomForest', 'FOREST', RandomForestClassifier()),
    ('GradientBoost', 'GBoost', GradientBoostingClassifier()),
    ('XGBoosting', 'XGBoost', XGBClassifier(objective='binary:logistic', eval_metric='error')),
]
pipelines = [(display_name, Pipeline([(step_name, estimator)]))
             for display_name, step_name, estimator in candidate_models]

# 10-fold cross-validated F1 on the training data; the seeded, shuffled splitter yields the
# same folds for every model
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
results = []
names = []
for name, model in pipelines:
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='f1')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

### 6.2 Model Tuning 

In [None]:
# ROC Curve outputs 
def roc_curve(model, label=None, filename='Log_ROC'):
    """Plot, save and show the ROC curve of a fitted classifier on the test set.

    Reads the notebook-level ``x_test`` and ``y_test`` variables.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``; column 1 of its output is used as the
        score passed to ``metrics.roc_curve``.
    label : str, optional
        Name shown in the legend. Defaults to the class name of ``model`` so the
        curve is attributed to the estimator actually passed in (a
        ``GridSearchCV`` object is still labelled 'GridSearchCV').
    filename : str, optional
        Path handed to ``savefig``. Defaults to 'Log_ROC', the file the function
        has always written; pass a different name to avoid overwriting it when
        plotting several models.

    Returns
    -------
    None
    """
    scores = model.predict_proba(x_test)[:, 1]
    # Thresholds are not needed for the plot, only the two rates.
    fpr, tpr, _ = metrics.roc_curve(y_test, scores)
    roc_auc = metrics.auc(fpr, tpr)

    if label is None:
        label = type(model).__name__

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label='%s (area = %0.2f)' % (label, roc_auc))
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    fig.savefig(filename)
    plt.show()
    

# Model fitting outputs 
def fitting_score(train_predictions, test_predictions, model):
    """Summarise recall, precision and F1 on the train set, test set and 5-fold CV.

    Reads the notebook-level ``x_train``, ``y_train`` and ``y_test`` variables.
    All three columns use macro averaging so they can be compared directly
    (the CV column previously used the binary scorers while train/test used
    macro averaging).

    Parameters
    ----------
    train_predictions : array-like
        Predicted labels for ``x_train``, compared against ``y_train``.
    test_predictions : array-like
        Predicted labels for the test set, compared against ``y_test``.
    model : estimator
        Estimator that is cross-validated on ``x_train`` / ``y_train``.

    Returns
    -------
    pandas.DataFrame
        Rows 'recall', 'precision', 'f1'; columns 'train', 'test', 'cv';
        values rounded to 3 decimals.
    """
    # Local import: cross_validate is not part of the notebook's import cell.
    from sklearn.model_selection import cross_validate

    def _macro_scores(y_true, y_pred):
        # Recall, precision and F1 (in table row order), macro-averaged.
        return [
            metrics.recall_score(y_true, y_pred, average='macro'),
            metrics.precision_score(y_true, y_pred, average='macro'),
            metrics.f1_score(y_true, y_pred, average='macro'),
        ]

    # A single cross-validation run scores all three metrics on the same fits,
    # instead of refitting the model once per metric.
    cv_metrics = ['recall_macro', 'precision_macro', 'f1_macro']
    cv_out = cross_validate(model, x_train, y_train, cv=5, scoring=cv_metrics, n_jobs=-1)
    cv_scores = [cv_out['test_' + metric].mean() for metric in cv_metrics]

    # Put everything in a table
    score_df = pd.DataFrame(
        data={
            'train': _macro_scores(y_train, train_predictions),
            'test': _macro_scores(y_test, test_predictions),
            'cv': cv_scores,
        },
        index=['recall', 'precision', 'f1'],
    )
    return score_df.round(decimals=3)

def pr_graph(model, label=None):
    """Plot the precision-recall curve of a fitted classifier on the test set.

    Reads the notebook-level ``x_test`` and ``y_test`` variables.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba`` and ``predict``.
    label : str, optional
        Legend name for the model's curve. Defaults to the class name of
        ``model`` (the curve was previously always labelled 'Logistic',
        whatever estimator was passed).

    Returns
    -------
    None
    """
    # keep probabilities for the positive outcome only
    pos_probs = model.predict_proba(x_test)[:, 1]
    # predict class values
    yhat = model.predict(x_test)

    precision, recall, _ = precision_recall_curve(y_test, pos_probs)
    model_f1 = f1_score(y_test, yhat)
    pr_auc = auc(recall, precision)

    if label is None:
        label = type(model).__name__

    # Baseline: a horizontal line at the share of positive (== 1) test labels.
    no_skill = len(y_test[y_test == 1]) / len(y_test)

    # plot the precision-recall curves
    pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    # F1 and PR-AUC are reported in the legend instead of being discarded.
    pyplot.plot(recall, precision, marker='.',
                label='%s (F1 = %0.3f, AUC = %0.3f)' % (label, model_f1, pr_auc))
    # axis labels and title
    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    pyplot.title('Precision-Recall curve')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()

In [None]:
# XGBoost classifier with fixed hyper-parameters and a fixed seed.
xgb = XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='error', gamma=5,
                   colsample_bytree=0.35, learning_rate=0.1, n_estimators=200, reg_lambda=1, max_depth=15,
                   min_child_weight=0, reg_alpha=1, scale_pos_weight=2, subsample=0.9)

# Fit on the training data, then predict both partitions: the train
# predictions are only needed for the train column of fitting_score.
xgb.fit(x_train, y_train)
xgb_predictions = xgb.predict(x_test)
xgb_predictions_train = xgb.predict(x_train)

# Build the confusion matrix once and draw it from that matrix.
# ConfusionMatrixDisplay is used instead of plot_confusion_matrix, which was
# removed in scikit-learn 1.2; labels are pinned to the model's class order so
# rows/columns and tick labels agree.
xgb_matrix = metrics.confusion_matrix(y_test, xgb_predictions, labels=xgb.classes_)
metrics.ConfusionMatrixDisplay(confusion_matrix=xgb_matrix, display_labels=xgb.classes_).plot()
plt.show()

# The text printed here is the classification report, so it is captioned as such.
class_report_xgb = classification_report(y_test, xgb_predictions)
print("\nGradient Boosting Classifier - Classification Report\n",class_report_xgb)

print("\nAccuracy:",round(metrics.accuracy_score(y_test,xgb_predictions),3))

print('\n Gradient Boosting Classification Train/Test/CV Scoring')
fitting_score(xgb_predictions_train, xgb_predictions, xgb)