In [1]:
import pickle
import funcy
import csv
import os
import gzip
import pandas as pd
import numpy as np

In [2]:
# Split description directory
index_file = '/pool001/lraymond/patent_data/index_files/pat_nums_index.csv'
#|index_file = '/pool001/lraymond/patent_data/index_files/pat_nums_plus_claims.gzip'
description_files_dir = '/nobackup1/lraymond/patent_data/split_brief_description'
to_save_descrip_dir = '/pool001/lraymond/patent_data/sample_description_dicts'
# I should be story these files in the lustre parallel file system
# '/nobackup1/lraymond/patent_data/split_brief_description'
# I want to stripe this directory over 20 servers to optimize performance 
# lfs setstripe --count 20 patent_data/split_brief_description


In [3]:
csv.field_size_limit(100000000)

131072

In [4]:
# load patent numbers index file
def load_zipped_pickle(filename):
    # load a zipped compressed pickle file
    with gzip.open(filename, 'rb') as f:
        loaded_object = pickle.load(f)
        return loaded_object
    

def save_zipped_pickle(obj, filename, protocol=-1):
    # this will default to the highest protocol
    with gzip.open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol)

In [5]:
# pats_index = load_zipped_pickle(index_file)
pats_index = pd.read_csv(index_file)

pats_index = pats_index.sort_values('patent_number')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# save the patent text in a dictionary
def create_empty_dict(patent_nums):
    # create map of text to patent numbers dictionaries
    ns = [None]*len(patent_nums)
    return dict(zip(patent_nums, ns))


def create_description_dataframe(patent_nums):
    nums = pd.Series(patent_nums).astype(int).drop_duplicates().copy()
    patent_random_sample = pd.DataFrame(0, 
        index=nums, columns=['number_description_found', 'description_word_count'])
    return patent_random_sample


def save_descr_text(patent_num, txt):
    dict_fname = os.path.join(to_save_descrip_dir, 'description_{}.gzip'.format(patent_num))
    if os.path.exists(dict_fname):
        # description is already saved, don't need to redo
        return None
    else:
        d = {patent_num: txt}
        save_zipped_pickle(d, dict_fname)


def process_description(patent_num, txt, len_text, text_dict, description_numerical_df):
    # mark that we have found a text string for the patent number, save text string to the dictionary
    num_found = description_numerical_df.loc[patent_num, 'number_description_found']
    if num_found==0:
        save_descr_text(patent_num, txt)
    else:
        print('Multiple Descriptions Found ', patent_num)
        text_dict[patent_num].append(txt)
    # update dataframe with max sequence of independent claim and number found
    description_numerical_df.loc[patent_num, 'description_word_count'] += len_text
    description_numerical_df.loc[patent_num, 'number_description_found'] += 1
    return None


def yield_file_lines(filename, list_patent_nums, valid_row_len, max_lines=None, patent_number_index=1):
    # create iterator to process file
    # create set to check membership
    # only returns lines with patent numbers in the set and those with a valid number of fields
    # row is returned as a list with a length equal to the number of fields available
    set_patent_nums = set(list_patent_nums)
    with open(filename) as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in tsvin:
            if max_lines and tsvin.line_num > max_lines:
                # if a max line limit, break out of the generator
                return
            else:
                # only process if the row has all fields
                if len(row) == valid_row_len:
                    patent_num_field = row[patent_number_index]
                    is_num = patent_num_field.isdigit()

                    if is_num and int(patent_num_field) in set_patent_nums:
                        # only yield relevant lines
                        yield row
                        
    
def process_description_information(description_filename, patent_nums_list, max_lines=None):
    description_filename = os.path.join(description_files_dir, description_filename)
    print(description_filename)
    text_dict = create_empty_dict(patent_nums_list)

    # note this is lines processed per file, so doesn't actually get that high
    lines_processed = 0

    description_numerical_df = create_description_dataframe(patent_nums_list)
    try:
        for line in yield_file_lines(
                description_filename, patent_nums_list, 3, max_lines, patent_number_index=1):
            # description field fields are uuid, str patent info and txt string
            if lines_processed%100==0:
                print(lines_processed)
            _, str_pat_num, txt = line
            # at  this point, I should only be getting lines that are valid numeric patent numbers
            patent_num = int(str_pat_num)
            len_txt = len(txt.split())
            if len_txt > 1:
                process_description(patent_num, txt, len_txt, text_dict, description_numerical_df)
                lines_processed += 1
                          
    except (Exception, KeyboardInterrupt) as e:
        print(e)
    finally:
        # always execute this piece of code even after an exception
        return description_numerical_df

In [9]:
# quick check for duplicates in the patent file
print(pats_index.shape)
print(pats_index.patent_number.drop_duplicates().shape)

(1994958, 59)
(1994958,)


In [10]:
txt_files = os.listdir(description_files_dir)

pats_list = pats_index.loc[
    (pats_index.is_valid_patent_number) , :].patent_number.values

In [None]:
%%time 
df_results = list(map(lambda x: process_description_information(x, pats_list), txt_files))

/nobackup1/lraymond/patent_data/split_brief_description/xlp
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xlh
0
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xul
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xjt
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xqk
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
190

2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
/nobackup1/lraymond/patent_data/split_brief_description/xji
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xsm
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
/nobackup1/lraymond/patent_data/split_brief_description/xqo
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xfp
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
/nobackup1/lraymond/patent_data/split_brief_description/xrl
0
100
200
300
400
500
6

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
/nobackup1/lraymond/patent_data/split_brief_description/xrb
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xox
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xpx
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
/nobackup1/lraymond/patent_data/split_brief_description/xuk
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000


In [19]:
#df_dict = dict(zip(txt_files, df_results))
df_res2 = list(map(lambda x: x.loc[x.number_description_found > 0, :], df_results))
del df_results

In [20]:
df_dict = dict(zip(txt_files, df_res2))

In [22]:
# saving the checkpoint in case the memory runs out
check_point_fname = os.path.join(to_save_descrip_dir, 'result_descr_dict.gzip')
print(check_point_fname)
save_zipped_pickle(df_dict, check_point_fname)

/pool001/lraymond/patent_data/sample_description_dicts/result_descr_dict.gzip


In [23]:
def concat_dfs(dict_results):
    # combine dataframes by adding a column with filename source
    # then reset index so patent number is a column, append all df together
    # then summarize by apply max/sum functions to respective  columns
    # process dataframe associated with the results
    list_df = []
    for fname, mini_df in dict_results.items():
        print(fname)
        mini_df['source_filename'] = fname
        mini_df = mini_df.reset_index(drop=False)
        mini_df = mini_df.rename(columns={'index': 'patent_number'})
        list_df.append(mini_df)
    # memory concerns so deleting dict results
    del dict_results
    return pd.concat(list_df, axis=0, join='outer', ignore_index=True)


In [24]:
 master_df = concat_dfs(df_dict)

xlp
xlh
xul
xjt
xqk
xtc
xdv
xud
xwy
xfm
xii
xqv
xir
xva
xvv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


xxj
xcn
xve
xsk
xck
xcl
xhf
xld
xsf
xtr
xdq
xbu
xop
xlx
xiu
xwx
xfy
xoz
xqy
xwi
xrf
xws
xbf
xxd
xji
xsm
xqo
xfp
xrl
xwb
xzg
xjv
xbs
xdj
xbz
xnv
xpb
xew
xse
xju
xib
xar
xsw
xku
xeu
xjg
xum
xiv
xim
xpr
xwf
xlb
xhs
xtm
xue
xru
xyu
xsr
xjf
xeo
xai
xsv
xgd
xdm
xrb
xox
xpx
xuk
xwl
xph
xsc
xqs
xfn
xmj
xnq
xzk
xid
xth
xlo
xqu
xnk
xki
xcp
xde
xkr
xpd
xvm
xyd
xof
xuv
xzf
xjw
xxp
xea
xqb
xcz
xje
xhi
xvx
xov
xmk
xlw
xls
xnj
xja
xxv
xcg
xon
xxz
xlz
xcf
xrh
xdy
xyh
xmb
xpf
xhu
xcr
xsu
xxg
xso
xkf
xvg
xzh
xcm
xsz
xlm
xpg
xtv
xfr
xxh
xza
xvk
xtf
xmq
xht
xwr
xyx
xjs
xrj
xft
xnc
xll
xpm
xec
xig
xzt
xlc
xta
xfd
xkh
xvo
xnw
xlg
xoc
xaa
xfh
xmp
xif
xwh
xqq
xed
xwv
xaf
xqd
xas
xor
xjz
xxa
xzy
xis
xlv
xej
xsn
xqj
xmx
xzw
xui
xoo
xwn
xvu
xfi
xcs
xjr
xep
xjc
xzb
xyg
xqt
xxw
xet
xss
xiw
xux
xxe
xoq
xtu
xty
xze
xxb
xcw
xoh
xev
xkz
xqr
xbi
xou
xqf
xvd
xdd
xzc
xpt
xjk
xjx
xsx
xaz
xkg
xsi
xtx
xre
xxn
xrw
xdb
xhz
xlk
xjy
xpi
xqn
xez
xka
xzr
xuz
xoj
xge
xqx
xxl
xrs
xjh
xem
xlt
xra
xqc
xbo
xbg
xpj
xjo
xnm
xow
xvl
xgw


In [29]:
del master_df['description_filenames']

master_df2 = master_df.sort_values('patent_number').drop_duplicates()

print(master_df.shape)
print(master_df2.shape)

In [36]:
pats_index2 = pats_index.merge(master_df2, on='patent_number', how='left')

In [38]:
pats_index2['number_description_found'] = pats_index2.number_description_found.fillna(0)
pats_index2['description_word_count'] = pats_index2.description_word_count.fillna(0)

pats_index2= pats_index2.rename(columns={'source_filename': 'description_source_filename'})

pats_index2['flag_has_description_text'] = pats_index2.number_description_found.apply(lambda x: int(x>0))

In [42]:
pats_index2.head()

Unnamed: 0,patent_year,patent_number,10_year_cites_top1,10_year_cites,is_valid_patent_number,max_independent_claim,max_dependent_claim,number_independent_claims_found,number_dependent_claims_found,indep_claims_filenames,flag_has_dependent_claim_text,flag_has_independent_claim_text,number_description_found,description_word_count,description_source_filename,description_word_found,flag_has_description_text
0,1985,4490856,0,1,True,2,0,2,0,xan ; xbj,0,1,1.0,522.0,xin,522.0,1
1,1985,4490860,0,2,True,8,0,8,0,xaz ; xbi ; xbg ; xbm ; xap ; xbp ; xdn,0,1,1.0,1012.0,xpv,1012.0,1
2,1985,4490861,0,3,True,10,0,10,0,xcy ; xaz ; xas ; xdd ; xag ; xcr ; xcn ; xbc ...,0,1,1.0,524.0,xey,524.0,1
3,1985,4490864,0,5,True,11,0,11,0,xcb ; xce ; xbr ; xaz ; xat ; xbf ; xbo ; xdq ...,0,1,0.0,,,0.0,0
4,1985,4490868,0,1,True,9,0,9,0,xcb ; xcw ; xdl ; xah ; xbo ; xch ; xcu ; xae ...,0,1,1.0,512.0,xdq,512.0,1


In [43]:
print(index_file)

/pool001/lraymond/patent_data/index_files/pat_nums_plus_claims.gzip


In [44]:
save_zipped_pickle(pats_index2, index_file)