In [1]:
import pandas as pd
import numpy as np
import os
import gzip 
import pickle

In [2]:
# possible files with saved results
a = '/nobackup1/lraymond/patent_data/from_local/raw_patent_nums_plus_claims.gzip'
b = '/nobackup1/lraymond/patent_data/from_local/small_sample_raw_patent_nums_plus_claims.gzip'


In [3]:
def load_zipped_pickle(filename):
    # load a zipped compressed pickle file
    with gzip.open(filename, 'rb') as f:
        loaded_object = pickle.load(f)
        return loaded_object
    
def save_zipped_pickle(obj, filename, protocol=-1):
    # this will default to the highest protocol
    with gzip.open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol)

In [4]:
# other references colymns need to be combined
potential = '/pool001/lraymond/patent_data/index_files/pat_nums_plus_claims.gzip'

df2 = load_zipped_pickle(potential)

df2 = df2.sort_values('patent_number')

In [5]:
df2['patent_number'].dtype

dtype('int64')

In [7]:
df2[[
    'min_reference_sequence', 'max_reference_sequence']] = df2[[
    'min_reference_sequence', 'max_reference_sequence']].fillna(0)

In [9]:
refs = df2[['patent_number', 'flag_has_references', 'max_reference_sequence', 
             'min_reference_sequence', 'number_references_found', 'number_journal_references',
            'reference_source_filename']].sort_values('patent_number').drop_duplicates()

In [10]:
mask= refs.duplicated(subset=['patent_number',
            'reference_source_filename'], keep=False)
refs[mask]

Unnamed: 0,patent_number,flag_has_references,max_reference_sequence,min_reference_sequence,number_references_found,number_journal_references,reference_source_filename


In [11]:
refs_grouped = refs.groupby('patent_number').agg({
    'flag_has_references': np.max, 'max_reference_sequence': np.max,
    'min_reference_sequence': np.min, 'number_references_found': np.sum, 
'number_journal_references': np.sum})

In [12]:
refs_files= refs.loc[refs.number_references_found>0,:].groupby('patent_number')['reference_source_filename'].apply(
    lambda x: ' ; '.join(x.values))

In [13]:
refs_info = refs_grouped.join(refs_files)

In [14]:
print(refs_grouped.shape)
print(refs_grouped.loc[refs_grouped.flag_has_references>0,:].shape)
print(refs_files.shape)
print(refs_info.shape)

(199999, 5)
(55198, 5)
(55198,)
(199999, 6)


In [15]:
refs_info[['flag_has_references', 'max_reference_sequence',
       'min_reference_sequence', 'number_references_found', 
          'number_journal_references']] = refs_info[['flag_has_references', 'max_reference_sequence',
       'min_reference_sequence', 'number_references_found', 
                                                    'number_journal_references']].fillna(0)

In [16]:
descr = df2[['patent_number', 'flag_has_description_text','description_word_count',
         'number_description_found', 'description_source_filename']].sort_values('patent_number').drop_duplicates()


In [17]:
mask= descr.duplicated(subset=['patent_number',
            'description_source_filename'], keep=False)
descr[mask]

Unnamed: 0,patent_number,flag_has_description_text,description_word_count,number_description_found,description_source_filename


In [18]:
descr_grouped = descr.sort_values('patent_number').groupby('patent_number').agg({
    'description_word_count': np.max, 'flag_has_description_text': np.max,
    'number_description_found': np.sum
})

In [19]:
descr_files= descr.loc[descr.number_description_found>0,:].groupby('patent_number')['description_source_filename'].apply(
    lambda x: ' ; '.join(x.values))

In [20]:
descr_info = descr_grouped.join(descr_files)

In [21]:
descr_info[['description_word_count', 'flag_has_description_text',
       'number_description_found']] = descr_info[['description_word_count', 'flag_has_description_text',
       'number_description_found']].fillna(0)

In [22]:
df3 = df2[['patent_year', 'patent_number', '10_year_cites_top1', '10_year_cites',
       'is_valid_patent_number', 'max_independent_claim',
       'max_dependent_claim', 'number_independent_claims_found',
       'number_dependent_claims_found', 'indep_claims_filenames',
       'flag_has_dependent_claim_text', 'flag_has_independent_claim_text']].sort_values(
            'patent_number').drop_duplicates()

In [23]:
print(df2.shape)
print(df3.shape)
print(df3.drop_duplicates('patent_number').shape)
df3 = df3.set_index('patent_number')
#descr_info = descr_info.set_index('patent_number')
#refs_info = refs_info.set_index('patent_number')

(426465, 22)
(199999, 12)
(199999, 12)


In [24]:
df4 = df3.join([descr_info, refs_info], how='left')

In [25]:
print(df4.shape)

(199999, 21)


In [26]:
np.sum(pd.isnull(df4))

patent_year                             0
10_year_cites_top1                      0
10_year_cites                           0
is_valid_patent_number                  0
max_independent_claim                   0
max_dependent_claim                     0
number_independent_claims_found         0
number_dependent_claims_found           0
indep_claims_filenames                 22
flag_has_dependent_claim_text           0
flag_has_independent_claim_text         0
description_word_count                  0
flag_has_description_text               0
number_description_found                0
description_source_filename         35491
flag_has_references                     0
max_reference_sequence                  0
min_reference_sequence                  0
number_references_found                 0
number_journal_references               0
reference_source_filename          144801
dtype: int64

In [41]:
df5 = df4.reset_index(drop=False)

In [42]:
def is_valid_patent_number(patent_num):
    '''try to convert patent number to int, flag errors'''
    try:
        t = int(patent_num)
        return 1
    except (TypeError, ValueError):
        return 0

In [43]:
del df5['is_valid_patent_number']
df5['is_valid_patent_number'] = df5.patent_number.apply(is_valid_patent_number)

In [45]:
save_zipped_pickle(
    df5, '/pool001/lraymond/patent_data/index_files/pat_nums_processing_results.gzip')

del df3, df2 

NameError: name 'df3' is not defined

In [None]:
 df5 = load_zipped_pickle(
    '/pool001/lraymond/patent_data/index_files/pat_nums_processing_results.gzip')

In [48]:
# df5.dtypes
np.sum(pd.isnull(df5))

patent_number                           0
patent_year                             0
10_year_cites_top1                      0
10_year_cites                           0
max_independent_claim                   0
max_dependent_claim                     0
number_independent_claims_found         0
number_dependent_claims_found           0
indep_claims_filenames                 22
flag_has_dependent_claim_text           0
flag_has_independent_claim_text         0
description_word_count                  0
flag_has_description_text               0
number_description_found                0
description_source_filename         35491
flag_has_references                     0
max_reference_sequence                  0
min_reference_sequence                  0
number_references_found                 0
number_journal_references               0
reference_source_filename          144801
is_valid_patent_number                  0
dtype: int64

In [49]:
index_file = '/pool001/lraymond/patent_data/index_files/pat_nums_index.csv'

pats_index = pd.read_csv(index_file)
pats_index = pats_index.sort_values('patent_number')

  interactivity=interactivity, compiler=compiler, result=result)


In [55]:
pats_index[['10_year_cites_rank',
'15_year_cites_rank',
'20_year_cites_rank',        
'30_year_cites_rank',            
'5_year_cites_rank']] = pats_index[['10_year_cites_rank',
'15_year_cites_rank',
'20_year_cites_rank',        
'30_year_cites_rank',            
'5_year_cites_rank']].fillna(0)

In [33]:
print(pats_index.drop_duplicates('patent_number').shape)
print(pats_index.drop_duplicates().shape)

(1994958, 59)
(1994958, 59)


In [56]:
# check for null values
np.sum(pd.isnull(pats_index))

10_year_cites                                 0
15_year_cites                                 0
20_year_cites                                 0
30_year_cites                                 0
5_year_cites                                  0
assignee_is_company                           0
assignee_is_gov                               0
assignee_is_ind                               0
forprior_country                        1987744
forprior_date                           1988144
inventor_total_num_patents                    0
lawyer_organization                      986064
lawyer_total_num_assignees                    0
lawyer_total_num_inventors                    0
lawyer_total_num_patents                      0
nber_category_id                              0
nber_subcategory_id                           0
number_apps                                   0
number_assignees                              0
number_assistant_examiners                    0
number_examiners                        

In [80]:
#pats_index = pats_index.reset_index('patent_number')

In [87]:
a = sorted(pats_index.patent_number.values)
b = sorted(df5.patent_number.values)

In [88]:
print(len(a))
print(len(b))
print(len(set(a).intersection(set(b))))
print(len(set(a).difference(set(b))))
print(len(set(b).difference(set(a))))

1994958
199999
188672
1806286
11327


In [76]:
newl = [x for x in a if ((x>4490856) & (x<4490901))]

In [89]:
# combine numerical attributes to claims processing
ind2 = pats_index.merge(df5[['description_source_filename',
 'description_word_count',
 'flag_has_dependent_claim_text',
 'flag_has_description_text',
 'flag_has_independent_claim_text',
 'flag_has_references',
 'indep_claims_filenames',
 'max_dependent_claim',
 'max_independent_claim',
 'max_reference_sequence',
 'min_reference_sequence',
 'number_dependent_claims_found',
 'number_description_found',
  'number_journal_references',
 'number_independent_claims_found',
 'number_references_found', 'patent_number']], 
                       how='inner', on='patent_number')

In [90]:
print(df4.shape)
print(pats_index.shape)
print(ind2.shape)

(199999, 21)
(1994958, 59)
(188672, 75)


In [91]:
save_zipped_pickle(ind2, '/pool001/lraymond/patent_data/index_files/sample_pat_nums_all_numerical.gzip')