In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# Read data
data = pd.read_csv('../Data/Assignment_data/merge_data.csv')

In [3]:
# Simple presentation
data.head(2)

Unnamed: 0,rf_id,cname,convey_text,ee_name,or_name,convey_ty,employer_assign
0,12800340,THOMAS J. ENGELLENNER,ASSIGNMENT OF ASSIGNORS INTEREST (SEE DOCUMENT...,CHILDREN'S MEDICAL CENTER CORPORATION,"ATALA, ANTHONY",assignment,1
1,36250888,NORTHERN TELECOM LIMITED,CHANGE OF NAME (SEE DOCUMENT FOR DETAILS).,NORTHERN TELECOM LIMITED,NORTHERN ELECTRIC COMPANY LIMITED,namechg,0


In [6]:
# Read data
data2 = pd.read_csv('../Data/Assignment_data/patent_info.csv')

In [7]:
data2

Unnamed: 0,rf_id,title,appno_doc_num,pgpub_doc_num,patent_id,section,ipc_class,subclass,main_group
0,40070763,BACKLIGHT MODULE CONTROL SYSTEM WHOSE TWO BACK...,12345689,2.010001e+10,8004205,H,H05,H05B,H05B41
1,42570243,TOUCH DISPLAY PANEL,12345677,2.010011e+10,8134537,G,G06,G06F,G06F3
2,42610879,Gas chromatograph having a radiant oven for an...,11111111,2.006024e+10,7130534,G,G01,G01N,G01N30
3,42610879,Gas chromatograph having a radiant oven for an...,11111111,2.006024e+10,7130534,H,H05,H05B,H05B3
4,72450498,DEVICE FOR THE PURIFICATION OF EXHAUST GAS,8352079,2.003010e+10,7442346,F,F01,F01N,F01N3
...,...,...,...,...,...,...,...,...,...
17895004,626470342,SYSTEMS AND METHODS FOR SELF-SUPERVISED RESIDU...,16892885,2.021038e+10,11551363,G,G1,G1C,G1C21
17895005,626470342,SYSTEMS AND METHODS FOR SELF-SUPERVISED RESIDU...,16892885,2.021038e+10,11551363,G,G1,G1S,G1S17
17895006,626470372,RISK PREDICTION ON A PEER-TO-PEER NETWORK,17220726,2.021030e+10,11529966,B,B60,B60W,B60W50
17895007,626470372,RISK PREDICTION ON A PEER-TO-PEER NETWORK,17220726,2.021030e+10,11529966,B,B60,B60W,B60W40


In [5]:
# Obtain the number of types of 'convey_ty' and 'convey_text'
# Retrieve all unique values from the 'convery_ty' column
unique_convey_ty = data['convey_ty'].unique()
# Retrieve all unique values from the 'convery_text' column
unique_convey_text = data['convey_text'].unique()
print('Types of convey_ty:',len(unique_convey_ty))
print('Types of convey_text:',len(unique_convey_text))

Types of convey_ty: 10
Types of convey_text: 223017


In [6]:
# Read stop word file
with open('Stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(line.strip() for line in file)

def preprocess_text(text):
    if isinstance(text, str):  # Check if it is a string
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation marks and numbers
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        # Text segmentation
        words = text.split()
        # Remove stop words
        words = [word for word in words if word not in stop_words]
        return ' '.join(words)
    else:
        # If it is not a string, return an empty string or other default value
        return ''

# Apply preprocessing while processing NaN or non string values
data['convey_text'] = data['convey_text'].apply(preprocess_text)

In [7]:
# Extract data related to 'patent transactions' or 'joint research'

In [8]:
# Define keywords for satisfaction and exclusion items
positive_keywords = ['assignment', 'interest', 'right', 'joint']
negative_keywords = [
'legal', 'legally', 'legality', 'illegality',
'inheritance', 'inherited', 'inheriting', 'inherits', 'heir', 'heirs',
'spin', 'spun', 'spins', 'spinning',
'cross referencing', 'cross-referenced', 'cross-reference', 'cross references', 'cross-referencing',
'invalid', 'invalidity', 'invalidate', 'invalidated', 'invalidating', 'unvalid', 'non-valid', 'not valid',
'change', 'changed', 'changing', 'changes', 'alteration', 'alter', 'altered', 'alternating', 'modify', 'modified', 'modifying', 'modification',
'rate', 'rated', 'rates', 'rating', 'rank', 'ranked', 'ranking', 'ranks',
'stock', 'stocks', 'share', 'shares', 'equity', 'equities',
'merge','merger','merged','merging'
'corrective','correct','corrected',
'release','released','missing','miss',
'employee','employment'
]

In [9]:
# Retain original data
data_1 = data.copy()

# Filter function
def filter_convey_text(text):
    # If the text is empty or not of string type, return False
    if not isinstance(text, str) or pd.isna(text):
        return False
    
    # Convert text to lowercase to ensure case insensitive matching
    text_lower = text.lower()
    
    # Check if any 'satisfaction item' keywords are included
    if not any(keyword in text_lower for keyword in positive_keywords):
        return False
    
    # Check if any 'exclusion item' keywords are included
    if any(keyword in text_lower for keyword in negative_keywords):
        return False
    
    return True

# Apply filtering function
data = data[data['convey_text'].apply(filter_convey_text)]

In [10]:
# Statistical screening results
print('Initial data volume:',len(data_1))
print('Filtered data volume:',len(data))
print('Excluding a total of',len(data_1)-len(data),'rows of data')

Initial data volume: 10046764
Filtered data volume: 9310652
Excluding a total of 736112 rows of data


In [11]:
# Statistical screening results
print('Types of convey texts for initial data',len(data_1['convey_text'].unique()))
print('Types of convey texts for filtered data:',len(data['convey_text'].unique()))
print('Excluding a total of',len(data_1['convey_text'].unique())-len(data['convey_text'].unique()),'types of convey texts')

Types of convey texts for initial data 107539
Types of convey texts for filtered data: 5596
Excluding a total of 101943 types of convey texts


In [12]:
# Filter specified convey type as'assignment'
data1 = data[data['convey_ty'] == 'assignment']
# Reset index
data1 = data1.reset_index(drop=True)

In [None]:
# Filter specified convey type as'govern'
data2 = data[data['convey_ty'] == 'govern']
# Reset index
data2 = data2.reset_index(drop=True)

In [31]:
# Filter specified convey type as'govern'
data3 = data[data['convey_ty'] == 'other']
# Reset index
data3 = data3.reset_index(drop=True)

In [15]:
# Government assignment data
govern1_data = data2[data2['convey_text'].str.contains('assigns', case=False)]
govern2_data = data2[data2['convey_text'].str.contains('interest', case=False)]
govern3_data = data2[data2['convey_text'].str.contains('assignment', case=False)]
govern_data = pd.concat([govern1_data,govern2_data,govern3_data], axis=0)
# Reset index
govern_data = govern_data.reset_index(drop=True)

In [35]:
# Joint research data
joint1_data = data1[data1['convey_text'].str.contains('joint', case=False)]
joint2_data = data2[data2['convey_text'].str.contains('joint', case=False)]
joint3_data = data3[data3['convey_text'].str.contains('joint', case=False)]
joint_data = pd.concat([joint1_data,joint2_data,joint3_data],axis=0)

In [37]:
# Assignment data of ordinary patent rights
unique_data1 = data1[['convey_text']].drop_duplicates().values.tolist()
print('The number of convey texts for assigment data of ordinary patent rights is:',len(unique_data1))

The number of convey texts for assigment data of ordinary patent rights is: 3281


In [38]:
# Count the number of patents in each convey texts
counts = data1['convey_text'].value_counts()
counts = pd.DataFrame(counts)

In [39]:
counts.head(10)

Unnamed: 0_level_0,count
convey_text,Unnamed: 1_level_1
assignment assignors document details,8389345
assignment assignors,880092
nunc pro tunc assignment document details,17770
assignment,852
conditional assignment document details,652
confirmation assignment,552
assignment rights barbados,367
nunc pro tunc assignment document details effective,330
partial assignment,328
assignment undivided,279


In [42]:
# Data merging
assignment = pd.concat([joint_data,govern_data,data1],axis=0)
# Duplicate removal
assignment = assignment.drop_duplicates()
# Reset index
assignment = assignment.reset_index(drop=True)

In [44]:
assignment.head(10)

Unnamed: 0,rf_id,cname,convey_text,ee_name,or_name,convey_ty,employer_assign
0,38070722,"WATSON, COLE, GRINDLE & WATSON",assigor assigns joint inventon assignee,"SUMITOMO METAL INDUSTRIES LIMITED, 15, 5-CHOME...",SUMIKIN COKE COMPANY LIMITED,assignment,0
1,38710536,ROGER B. WEBSTER,assigns himselfrene campbell susan campbell wi...,"CAMPBELL, RENE M., JOINT TENANT IN COMMON; CAM...",CAMPBELL RENE M.,assignment,1
2,38790472,KARL W. FLOCKS,assigns undivided joint title,"JANSKY, JOHN DR.",STOFKO JOHN,assignment,1
3,39050170,BAUER & AMER,assigns jointly assignees subject conditions r...,"BAUER, JEROME; AMER, MYRON","CANU, JOHN C.",assignment,0
4,39070478,STEPHEN D. CARVER,paul virginia henry joint tenants,"HENRY, PAUL AND VIRGINIA","MOORE, COVIE R.; ALTOM, LARRY",assignment,0
5,39700920,"REISING, ETHINGTON, BARNARD, PERRY &",assign husband wife joint tenants undivided on...,"SEALS, RALPH W. AND SEALS, KATHERINE; GRAHAM, ...","SEALS, RALPH W.",assignment,1
6,39720501,KARL W. FLOCKS AND ASSOCIATES,assigns undivided joint title serial,"JANSKY, JOHN","STOFKO, JOHN",assignment,1
7,40230020,"GUST, IRISH, JEFFERS ET AL",assigns assignee onehalf joint tenants tenants...,"GAWTHROP, ROGER L., R.R. 4, ALBION,","GAWTHROP, ROGER L. ADMINISTRATOR WITH WILL ANN...",assignment,0
8,40250624,STANDARD OIL CO. (IND),assigns jointly equally tenants commonthe entire,"STANDARD OIL COMPANY CHICAGO,IL. A CORP.OF; GU...","KNEPPER, JAY C.",assignment,1
9,40310841,U.S.PHILIPS CORP.,assigns entire jointly tenants commonthe entire,"MAGNAVO COMPANY THE, A CORP OF DE; SANDERS ASS...","APF ELECTRONICS,INC.",assignment,0


In [45]:
assignment.to_csv('../Data/Assignment_data/assignment.csv', index=False)