# Check Scraping


In [25]:
import os

# For every CPC class, print how many scraped JSON (text) files and how many
# front-page images exist, so the two counts can be compared side by side.
# The same report is produced for the 'query' split and the 'document' split.
DATA_ROOT = '/vast/marco/Data_Google_Patent'

for split_index, patent_type in enumerate(('query', 'document')):
    if split_index:
        # Blank separator between the two splits.
        print('\n')

    json_root = f'{DATA_ROOT}/json/{patent_type}'
    front_img_root = f'{DATA_ROOT}/front_imgs/{patent_type}'

    # os.listdir returns entries in arbitrary order; sort so the report is
    # reproducible from run to run.
    for CPC in sorted(os.listdir(json_root)):
        json_files = os.listdir(f'{json_root}/{CPC}')
        front_img_files = os.listdir(f'{front_img_root}/{CPC}')
        print(f'Text: {len(json_files)}\t Image: {len(front_img_files)}\t CPC: {CPC}\t Type: {patent_type}')


Text: 163	 Image: 163	 CPC: A42B3	 Type: query
Text: 79	 Image: 79	 CPC: A62B18	 Type: query
Text: 44	 Image: 44	 CPC: F04D17	 Type: query
Text: 46	 Image: 46	 CPC: F16H1	 Type: query
Text: 60	 Image: 60	 CPC: F16L1	 Type: query
Text: 306	 Image: 306	 CPC: G02C5	 Type: query
Text: 32	 Image: 32	 CPC: H02K19	 Type: query


Text: 502	 Image: 502	 CPC: A42B3	 Type: document
Text: 201	 Image: 201	 CPC: A62B18	 Type: document
Text: 103	 Image: 103	 CPC: F04D17	 Type: document
Text: 93	 Image: 93	 CPC: F16H1	 Type: document
Text: 171	 Image: 171	 CPC: F16L1	 Type: document
Text: 616	 Image: 616	 CPC: G02C5	 Type: document
Text: 79	 Image: 79	 CPC: H02K19	 Type: document


In [30]:
import json
truth_dir = '/vast/marco/Data_Google_Patent/ground_truth'

# For each ground-truth JSON file, print the number of top-level entries
# (reported as "query") and the summed length of all list-valued entries
# (reported as "document").
# os.listdir returns entries in arbitrary order; sort for a reproducible report.
for file in sorted(os.listdir(truth_dir)):
    class_name, extension = os.path.splitext(file)
    if extension != '.json':
        # Skip stray entries (e.g. checkpoint folders) that cannot be parsed as JSON.
        continue

    file_path = os.path.join(truth_dir, file)
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Only list values contribute to the document total; any other value type is ignored.
    total_elements = sum(len(value) for value in data.values() if isinstance(value, list))
    print(f'{class_name}\tquery: {len(data)}\t document: {total_elements}')

# It is normal that the number of queries in a truth file is <= the number of initial queries.
# When scraping document patents, if no documents are successfully retrieved for a query, no files are saved for that query.
# Since the ground truth is built from the filenames of the document patents, queries with no corresponding documents are discarded.

A42B3	query: 154	 document: 502
A62B18	query: 66	 document: 201
F04D17	query: 34	 document: 103
F16H1	query: 39	 document: 93
F16L1	query: 55	 document: 171
G02C5	query: 242	 document: 616
H02K19	query: 26	 document: 79


# Calculate Dataset Analytics

In [1]:
import pandas as pd
import os
import json

query_json_folder = '/vast/marco/Data_Google_Patent/json/query'

# Build one row of statistics per class folder (number of query files, sum of
# their 'document_patents_count' fields, and the per-query average), followed
# by a final 'Total' row, and show the result as a DataFrame.
stats = []
total_query_count = 0
total_document_count = 0
total_avg_document_per_query = 0
n_class = 0  # number of classes that contribute to the averaged 'Total' value

# os.listdir returns entries in arbitrary order; sort so the table rows are stable.
for class_folder in sorted(os.listdir(query_json_folder)):
    class_path = os.path.join(query_json_folder, class_folder)
    if not os.path.isdir(class_path):
        # Ignore stray files next to the class folders.
        continue

    document_patents_count_list = []
    for json_file in os.listdir(class_path):
        json_path = os.path.join(class_path, json_file)
        # Open the json file for the patent query
        with open(json_path, 'r') as file:
            query_patent_data = json.load(file)
        # A missing or null count is treated as 0 so the sum below cannot fail on None.
        # NOTE(review): confirm that 0 is the right default for queries lacking this field.
        document_patents_count_list.append(query_patent_data.get('document_patents_count') or 0)

    # Per-class aggregates are computed once, after all files were read. This also
    # guarantees they are defined (as zeros) for a class folder without any file,
    # instead of silently reusing the previous class's values.
    query_count = len(document_patents_count_list)
    document_count = sum(document_patents_count_list)
    avg_document_per_query = round(document_count / query_count, 2) if query_count else 0.0

    # Append the statistics for the class (folder) to the list
    stats.append({
        'Class': class_folder,
        'N_query': query_count,
        'N_document': document_count,
        'Avg_document_per_query': avg_document_per_query
    })

    total_query_count += query_count
    total_document_count += document_count
    if query_count:
        # A class without queries has no defined average, so it is left out of the mean.
        total_avg_document_per_query += avg_document_per_query
        n_class += 1

# Append the total statistics as a new row.
# NOTE(review): the 'Total' average is the unweighted mean of the (rounded) per-class
# averages, which is not the same as total_document_count / total_query_count.
stats.append({
    'Class': 'Total',
    'N_query': total_query_count,
    'N_document': total_document_count,
    'Avg_document_per_query': round(total_avg_document_per_query / n_class, 2) if n_class else 0.0
})

# Convert the list of statistics into a Pandas DataFrame
df = pd.DataFrame(stats)
df

Unnamed: 0,Class,N_query,N_document,Avg_document_per_query
0,A42B3,163,502,3.08
1,A62B18,79,201,2.54
2,F04D17,44,103,2.34
3,F16H1,46,93,2.02
4,F16L1,60,171,2.85
5,G02C5,306,616,2.01
6,H02K19,32,79,2.47
7,Total,730,1765,2.47
