# F-Term Crawler
Data will be crawled from https://www.j-platpat.inpit.go.jp/cache/classify/patent/PMGS_HTML/jpp/F_TERM/en/themeGroup/themeGroup_en.html

In [None]:
import pandas as pd
import numpy as np
from time import sleep
import requests 
from bs4 import BeautifulSoup

# Final

In [None]:
# Folder used for all file I/O in this notebook: the Excel list of theme
# codes is read from here, and the crawl results and the lists of missing
# themes are written here.
working_directory = r"C:\Users\paulb\OneDrive - rwth-aachen.de\04_Diss\F-Terms\data"

def get_html_text(theme):
    """Download the English F-term list page for one theme.

    Args:
        theme: Theme code that is inserted into the page URL
            (``fTermList_en_{theme}.html``).

    Returns:
        The response body as text, or ``None`` if the request itself failed
        (connection error, timeout, invalid URL, ...). The HTTP status is not
        checked here, so error pages are returned as ordinary text.
    """
    url = (
        "https://www.j-platpat.inpit.go.jp/cache/classify/patent/PMGS_HTML/jpp/"
        f"F_TERM/en/fTermList/fTermList_en_{theme}.html"
    )
    try:
        # Without a timeout a single stalled connection would block the
        # whole crawl indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Only network-level failures are treated as "page not available";
        # KeyboardInterrupt and programming errors are no longer swallowed.
        print(f"could not retrieve html for theme {theme}")
        return None
    return response.text
    
def get_tables(html_text):
    """Parse an F-term page and pick out the two tables used by the crawler.

    Args:
        html_text: HTML source of one F-term list page.

    Returns:
        A tuple ``(topic_table, table)``: the first ``<table>`` with class
        ``"table ftermListTable"`` and the first ``<table>`` with class
        ``"table ftermPontTable"``, each as returned by ``soup.find``.
    """
    document = BeautifulSoup(html_text, "html5lib")
    wanted_classes = ("table ftermListTable", "table ftermPontTable")
    topic_table, table = (
        document.find("table", attrs={"class": css_class})
        for css_class in wanted_classes
    )
    return topic_table, table

def get_input():
    """Load the theme codes that should be crawled.

    Reads sheet ``"Tabelle1"`` of ``list_all_theme.xlsx`` inside
    ``working_directory`` and returns the contents of its ``"Theme code"``
    column (as given by the column's ``.values``).
    """
    workbook_path = fr"{working_directory}\list_all_theme.xlsx"
    theme_sheet = pd.read_excel(workbook_path, sheet_name="Tabelle1")
    return theme_sheet["Theme code"].values

###################################################

def get_theme_data(topic_table, table):
    """Turn the two tables of one F-term page into a flat DataFrame.

    Args:
        topic_table: Parsed table holding the theme header cells
            (``td`` elements with class ``"ftermListData"``).
        table: Parsed table holding the viewpoints and their entries.

    Returns:
        A DataFrame with the columns ``fi_class``, ``theme``, ``theme_label``,
        ``fterm``, ``viewpoint``, ``viewpoint_label``, ``number`` and
        ``label``; one row per entry found under a viewpoint.

    Raises:
        KeyError: If a viewpoint cell has no ``rowspan`` attribute.
        IndexError: If the header table has fewer than four data cells, a
            description cell contains no ``"00"``, or the row layout does
            not match the viewpoint heights.
        ValueError: If a viewpoint yields a different number of number cells
            and label cells.
    """
    columns = ["fi_class", "theme", "theme_label", "fterm", "viewpoint",
               "viewpoint_label", "number", "label"]
    viewpoint_class = {"class": "ftermData text-align-center"}

    # Collect the rows once; every later step indexes into this list instead
    # of re-running find_all("tr") inside the loops.
    rows = table.find_all("tr")

    # Rows that carry an "ftermTitle" cell: keep the title text and the row
    # index at which that title starts.
    fterms = []
    fterms_height = []
    for row_index, row in enumerate(rows):
        title_cell = row.find("td", attrs={"class": "ftermTitle"})
        if title_cell is not None:
            fterms.append(title_cell.text)
            fterms_height.append(row_index)

    viewpoint_cells = table.find_all("td", attrs=viewpoint_class)
    viewpoints = [cell.get_text().strip("\n") for cell in viewpoint_cells]

    # Height of each viewpoint measured by walking the rows (distance between
    # consecutive rows that contain a viewpoint cell), so the row layout does
    # not rely on the rowspan attributes.
    vp_starts = [
        row_index
        for row_index, row in enumerate(rows)
        if row.find("td", attrs=viewpoint_class) is not None
    ]
    vp_starts.append(len(rows))
    vp_height_absolute = [
        following - current
        for current, following in zip(vp_starts, vp_starts[1:])
    ]

    # The rowspan values are only used for the consistency check below.
    rowspans = [int(cell["rowspan"]) for cell in viewpoint_cells]

    description_cells = table.find_all(
        "td", attrs={"class": "ftermData ftermTableCellWidth text-align-center"}
    )
    # Keep everything after the FIRST "00"; splitting on every occurrence
    # would cut off descriptions that contain "00" themselves.
    viewpoints_description = [
        cell.text.split("00", 1)[1].strip("\n") for cell in description_cells
    ]

    header_cells = topic_table.find_all("td", attrs={"class": "ftermListData"})
    theme = header_cells[0].text.strip("\n")
    fi_class = header_cells[3].text.strip("\n")
    theme_label = header_cells[1].text.strip("\n")

    # Test for errors in the table by comparing the number of tr elements
    # with the sum of the viewpoint rowspans.
    if len(rows) != sum(rowspans):
        print(f"Error in Table for theme {theme}, lenghts of tr-element and viewpoint rows are different")

    # Start with an empty frame so the column layout is kept even when the
    # page has no viewpoints at all.
    frames = [pd.DataFrame(columns=columns)]

    for i, viewpoint in enumerate(viewpoints):
        # Offset of this viewpoint: summed heights of all previous ones.
        # NOTE(review): this equals the absolute row index only if the first
        # viewpoint starts in row 0 - confirm against the page layout.
        viewpoint_table_index = sum(vp_height_absolute[:i])
        row_end = viewpoint_table_index + vp_height_absolute[i]

        # Number cells are read from every second row starting at the offset,
        # label cells from the rows in between.
        number_cells = []
        for index in range(viewpoint_table_index, row_end, 2):
            number_cells += rows[index].find_all(
                "td", attrs={"class": "ftermData ftermDataHead text-align-center"}
            )

        label_cells = []
        for index in range(viewpoint_table_index + 1, row_end + 1, 2):
            label_cells += rows[index].find_all(
                "td", attrs={"class": "ftermData ftermTopPosition"}
            )

        results_viewpoint = pd.DataFrame(columns=columns)
        results_viewpoint["number"] = [cell.text for cell in number_cells]
        results_viewpoint["label"] = [cell.text for cell in label_cells]
        results_viewpoint["viewpoint"] = viewpoint
        results_viewpoint["viewpoint_label"] = viewpoints_description[i]

        # The applicable title is the last one that starts at or before this
        # viewpoint's offset.
        ft_index = sum(1 for start in fterms_height if start <= viewpoint_table_index) - 1
        results_viewpoint["fterm"] = fterms[ft_index]

        frames.append(results_viewpoint)

    results_theme = pd.concat(frames, ignore_index=True, sort=False)
    results_theme["theme"] = theme
    results_theme["fi_class"] = fi_class
    results_theme["theme_label"] = theme_label
    return results_theme
    
    


    
    

In [None]:

# Crawl every theme from the input list, collect the parsed tables and keep
# track of themes whose page or table could not be used.
theme_list = get_input()
# theme_list = ["2B005"]  # uncomment to test the crawl with a single theme

result_columns = ["fi_class", "theme", "theme_label", "fterm", "viewpoint",
                  "viewpoint_label", "number", "label"]
results_path = fr"{working_directory}\f-terms.csv"

results = pd.DataFrame(columns=result_columns)
theme_frames = []          # one DataFrame per successfully parsed theme
themes_without_table = []  # page retrieved, but no usable table
themes_without_url = []    # page could not be retrieved

for theme_index, theme in enumerate(theme_list):
    # retrieving url
    html_text = get_html_text(theme)

    if html_text is None or "404 Not Found" in html_text:
        print(f"skipping theme {theme}")
        themes_without_url.append(theme)
        continue

    # extracting tables
    topic_table, table = get_tables(html_text)

    # A table that was not found at all is handled like a table without
    # content instead of crashing on len(None).
    if topic_table is None or table is None or len(table) < 2:
        print(f"no table for {theme}")
        themes_without_table.append(theme)
        sleep(1)
        continue

    theme_frames.append(get_theme_data(topic_table, table))

    # Periodic checkpoint so a long crawl does not lose everything on a crash.
    if theme_index % 100 == 0:
        print (f"Theme Nr. {theme_index}")
        results = pd.concat(theme_frames, ignore_index=True, sort=False)
        results.to_csv(results_path, encoding="utf-8", sep="\t")
    sleep(1)

# Final save: without it, every theme crawled after the last checkpoint
# would be missing from the CSV.
results = pd.concat(
    [pd.DataFrame(columns=result_columns)] + theme_frames,
    ignore_index=True,
    sort=False,
)
results.to_csv(results_path, encoding="utf-8", sep="\t")

missing_tables = "".join(f"{code}\n" for code in themes_without_table)
missing_url = "".join(f"{code}\n" for code in themes_without_url)

with open(fr'{working_directory}\missing_tables.txt', 'w', encoding="utf-8") as f:
    f.write(missing_tables)
with open(fr'{working_directory}\missing_url.txt', 'w', encoding="utf-8") as f:
    f.write(missing_url)
        

no table for 3L029
no table for 3L030
no table for 3L031
no table for 3L032
no table for 3L033
no table for 3L035
no table for 3L038
no table for 3L039
no table for 3L040
no table for 3L041
no table for 3L042
no table for 3L043
no table for 3L047
no table for 3L052
no table for 3L057
no table for 3L059
no table for 3L063
no table for 3L064
no table for 3L066
no table for 3L067
no table for 3L074
no table for 3L075
no table for 3L082
no table for 3L084
no table for 3L085
no table for 3L088
no table for 3L089
no table for 3L090
no table for 3L091
no table for 3L094
no table for 3L095
no table for 3L096
no table for 3L097
no table for 3L100
no table for 3L101
no table for 3L106
no table for 3L107
no table for 3L108
Theme Nr. 1400
no table for 4B012
no table for 4B030
no table for 4B037
no table for 4B038
no table for 4B049
no table for 4B051
no table for 4B052
no table for 4B058
no table for 4B060
no table for 4B062
no table for 4B067
no table for 4B068
skipping theme 4C001
no table for 4

no table for 5J037
skipping theme 5J040
skipping theme 5J041
no table for 5J044
no table for 5J048
no table for 5J051
no table for 5J052
no table for 5J054
no table for 5J057
no table for 5J058
no table for 5J059
skipping theme 5J060
no table for 5J061
no table for 5J063
no table for 5J068
no table for 5J072
no table for 5J073
no table for 5J074
no table for 5J075
no table for 5J076
no table for 5J077
no table for 5J078
no table for 5J080
no table for 5J082
skipping theme 5K003
skipping theme 5K013
no table for 5K017
no table for 5K043
no table for 5K044
no table for 5K045
no table for 5K053
no table for 5K054
no table for 5K055
no table for 5K056
no table for 5K057
no table for 5K063
skipping theme 5K070
skipping theme 5K071
no table for 5L100
no table for 5L102
no table for 5L105


# Backup (no longer relevant)

In [None]:
def get_theme_data(topic_table, table):
    """Rowspan-based variant of the table parser, kept as a backup.

    Unlike a row-walking approach, this version takes the height of each
    viewpoint from the ``rowspan`` attribute of its cell and reads a single
    ``ftermTitle`` cell for the whole table.

    NOTE: executing this definition rebinds the name ``get_theme_data``.

    Args:
        topic_table: Parsed table holding the header cells
            (``td`` elements with class ``"ftermListData"``).
        table: Parsed table holding the viewpoints and their entries.

    Returns:
        A DataFrame with the columns ``fi_class``, ``fterm``,
        ``fterm_label``, ``theme``, ``viewpoint``, ``viewpoint_label``,
        ``number`` and ``label``; one row per entry found under a viewpoint.
    """
    columns = ["fi_class", "fterm", "fterm_label", "theme", "viewpoint",
               "viewpoint_label", "number", "label"]

    # Collect the rows once instead of calling find_all("tr") per iteration.
    rows = table.find_all("tr")

    # extracting general variables
    viewpoint_cells = table.find_all("td", attrs={"class": "ftermData text-align-center"})
    viewpoints = [cell.get_text().strip("\n") for cell in viewpoint_cells]
    viewpoints_height = [int(cell["rowspan"]) for cell in viewpoint_cells]

    description_cells = table.find_all(
        "td", attrs={"class": "ftermData ftermTableCellWidth text-align-center"}
    )
    # Split only at the first "00" so descriptions containing "00" stay whole.
    viewpoints_description = [
        cell.text.split("00", 1)[1].strip("\n") for cell in description_cells
    ]

    fterm = table.find("td", attrs={"class": "ftermTitle"}).text
    header_cells = topic_table.find_all("td", attrs={"class": "ftermListData"})
    theme = header_cells[0].text.strip("\n")
    fi_class = header_cells[3].text.strip("\n")
    fterm_label = header_cells[1].text.strip("\n")

    # The leading empty frame keeps the column layout when there are no
    # viewpoints.
    frames = [pd.DataFrame(columns=columns)]

    # iterating through viewpoints, adding data to results table
    for i, viewpoint in enumerate(viewpoints):
        viewpoint_table_index = sum(viewpoints_height[:i])
        row_end = viewpoint_table_index + viewpoints_height[i]

        # Number cells come from every second row starting at the offset,
        # label cells from the rows in between.
        number_cells = []
        for index in range(viewpoint_table_index, row_end, 2):
            number_cells += rows[index].find_all(
                "td", attrs={"class": "ftermData ftermDataHead text-align-center"}
            )

        label_cells = []
        for index in range(viewpoint_table_index + 1, row_end + 1, 2):
            label_cells += rows[index].find_all(
                "td", attrs={"class": "ftermData ftermTopPosition"}
            )

        # adding data to table
        results_viewpoint = pd.DataFrame(columns=columns)
        results_viewpoint["number"] = [cell.text for cell in number_cells]
        results_viewpoint["label"] = [cell.text.strip(". ") for cell in label_cells]
        results_viewpoint["viewpoint"] = viewpoint
        results_viewpoint["viewpoint_label"] = viewpoints_description[i]

        frames.append(results_viewpoint)

    results_theme = pd.concat(frames, ignore_index=True, sort=False)
    results_theme["fterm"] = fterm
    results_theme["theme"] = theme
    results_theme["fi_class"] = fi_class
    results_theme["fterm_label"] = fterm_label
    return results_theme
    
    