In [40]:
"""
Final Data Extraction for State Capacity in Innovation

Author:
Joe Emmens

Date
16/12/2020
"""

'\nFinal Data Extraction\nJoe Emmens\n'

In [41]:
import os
import pandas as pd
import numpy as np
import itertools
import networkx as nx
from fuzzywuzzy import fuzz

In [42]:
"""
README:

    1) If running for the first time download the assignee key from the link provided
    as to create an up-to-date assignee dictionary and to dis-ambiguate the data.

    2) Download the desired county data sets from the link provided below.

    ****** IMPORTANT ******
    Save the county data in a separate file and direct the path the explicitly as directed

    3) Set a file name in the next code cell. The output will be saved in the working
    directory under this name

"""

"""
os.chdir("C:/Users/Joe's PC/Documents/IDEA Masters/Second Year/First Semester/Applied Public Economics/Term Paper")

#download the full assignee list from https://www.patentsview.org/download/

assignee = pd.read_csv("assignee_new.csv")

assignee_dict = {}
for idx in assignee.index:
    key = assignee.iloc[idx][0]
    value = assignee.iloc[idx][4]
    assignee_dict.update({key : value})

np.save('assignee_dict_new.npy', assignee_dict)
"""

'\nos.chdir("C:/Users/Joe\'s PC/Documents/IDEA Masters/Second Year/First Semester/Applied Public Economics/Term Paper")\n\n#download the full assignee list from https://www.patentsview.org/download/\n\nassignee = pd.read_csv("assignee_new.csv")\n\nassignee_dict = {}\nfor idx in assignee.index:\n    key = assignee.iloc[idx][0]\n    value = assignee.iloc[idx][4]\n    assignee_dict.update({key : value})\n\nnp.save(\'assignee_dict_new.npy\', assignee_dict)\n'

In [None]:
filename = "Set a file name to save the resulting data file to"
assignee_path = "Set to your path for the assignee data file"
county_data_path = "Set to your path for the county data files"

min_year = "input the first year of your data set"
max_year = "input the final year of your data set"

In [43]:
os.chdir(f"{assignee_path}")
assignee_dict = np.load('assignee_dict_new.npy', allow_pickle = "TRUE").item()

In [58]:
column_names = ["County", "State", "ID" , "Year", "AvgDeg", "StateBetNorm", "StateBetStd", "Patents"]

filename = pd.DataFrame(columns = column_names)

In [51]:
def CleanStateData(data, file_name="", save_file = False):

    data["assignee_id"] = data["assignee_id"].map(assignee_dict)
    data = data.dropna()


    for ass in data["assignee_id"].unique():
        str1 = ass
        for ass2 in data["assignee_id"].unique():
            str2 = ass2

            ratio = fuzz.ratio(str1.lower(), str2.lower())
            if 90 < ratio < 100:
                data["assignee_id"] = data["assignee_id"].replace(str2, str1, regex=True)


    data = data.sort_values("assignee_id")

    GovtInterest = data[data["govint_org_id"] != "None"]

    if save_file == True:
        data.to_csv(f"{file_name}(Clean).csv")

    return data


In [52]:
def CreateGraph(data, file_name="", save_graph=False):

    G = data["govint_org_id"].unique()

    govt={}
    govt.update(dict.fromkeys(G, 1))
    govt.update({"None" : 0})

    data["Govt Support"] = data["govint_org_id"].map(govt)

    print(data["Govt Support"].value_counts(normalize=True))


    data = data.sort_values("inventor_id")

    ### Create the graph ###
    state_edges = []

    for assignee in data["assignee_id"].unique():
        A = data[data["assignee_id"] == assignee]

        if 1 in A["Govt Support"].values:
            state_edges.append(("state", assignee))


    ### Add the edges ###
    edges = []

    for inventor in data["inventor_id"].unique():

        group = data[data["inventor_id"] == inventor]

        assignees = group["assignee_id"].unique()
        combinations = list(itertools.combinations(assignees, 2))

        for edge in combinations:

            edges.append(edge)

    nodes = data["assignee_id"].unique()
    nodes = np.insert(nodes, 0, "state")

    Graph = nx.Graph()

    Graph.add_nodes_from(nodes)
    Graph.add_edges_from(state_edges)
    Graph.add_edges_from(edges)

    if save_graph == True:

        nx.write_graphml(Graph, f"{file_name}.graphml")

    return Graph

In [59]:
os.chdir(f"{county_data_path}")

In [60]:
years = np.arange(min_year, max_year+1)

id = 0

for file in os.listdir():

    id = id+1
    county_data = pd.read_csv(file)

    State = file[-12:-10]
    County = file[:-14]

    for y in years:

        year_data = county_data[county_data["patent_year"] == y]

        clean_data = CleanStateData(data=year_data, file_name=f"{file}")

        Graph = CreateGraph(clean_data, file_name=f"{file}")

        Bnorm = nx.betweenness_centrality(Graph)

        StateBetNorm = Bnorm["state"]

        B = nx.betweenness_centrality(Graph, normalized=False)

        StateBetStd = B["state"]

        AvgDeg = (len(Graph.edges()) / len(Graph.nodes()))

        Patents = len(year_data["patent_number"].unique())

        data_file = data_file.append({"County" : County, "State" : State, "ID" : id, "Year" : y,
                                      "AvgDeg" : AvgDeg, "StateBetNorm" : StateBetNorm , "StateBetStd" : StateBetStd, "Patents" : Patents }, ignore_index=True)




data_file.to_csv(f"{filename}.csv")


Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Name: Govt Support, dtype: float64)
Series([], Na

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["assignee_id"] = data["assignee_id"].map(assignee_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["assignee_id"] = data["assignee_id"].replace(str2, str1, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["assignee_id"] = data["assignee_id"].map(assignee_dict)
A value i

In [61]:
print("All done!")


All done! woowowowop
