In [8]:
import polars as pl
### Load the datasets
# g_patent.tsv
df_patent = pl.read_csv(
    "g_patent.tsv",
    separator="\t",
    columns=["patent_id", "patent_date"],
    schema_overrides={"patent_id": pl.Utf8, "patent_date": pl.Utf8},
    ignore_errors=True
).with_columns([
    pl.col("patent_date").str.strptime(pl.Date, "%Y-%m-%d", strict=False)
])

# g_cpc_current.tsv
df_cpc = pl.read_csv(
    "g_cpc_current.tsv",
    separator="\t",
    columns=["patent_id", "cpc_class"],
    schema_overrides={"patent_id": pl.Utf8, "cpc_class": pl.Utf8}
)

# g_assignee_disambiguated.tsv
df_assignee = pl.read_csv(
    "g_assignee_disambiguated.tsv",
    separator="\t",
    columns=["patent_id", "location_id"],
    schema_overrides={"patent_id": pl.Utf8, "location_id": pl.Utf8}
)

# g_location_disambiguated.tsv
df_location = pl.read_csv(
    "g_location_disambiguated.tsv",
    separator="\t",
    columns=["location_id", "disambig_country"],
    schema_overrides={"location_id": pl.Utf8, "disambig_country": pl.Utf8}
)
# g_cpc_title.tsv
df_cpc_title = pl.read_csv(
    "g_cpc_title.tsv",
    separator="\t",
    columns=["cpc_class", "cpc_class_title"],
    schema_overrides={"cpc_class": pl.Utf8, "cpc_class_title": pl.Utf8}
)


In [9]:
df_pa = df_cpc.join(df_assignee, on="patent_id", how="inner")


In [10]:
df_pal = df_pa.join(df_location, on="location_id", how="inner")

In [11]:
df_full = df_pal.join(df_patent, on="patent_id", how="inner")

In [12]:
df_full_unique = df_full.unique(subset=['patent_id'])

In [13]:
##Dataset with full patent information
print(df_full.height)
print(df_full.head(10))

54918973
shape: (10, 5)
┌───────────┬───────────┬─────────────────────────────────┬──────────────────┬─────────────┐
│ patent_id ┆ cpc_class ┆ location_id                     ┆ disambig_country ┆ patent_date │
│ ---       ┆ ---       ┆ ---                             ┆ ---              ┆ ---         │
│ str       ┆ str       ┆ str                             ┆ str              ┆ date        │
╞═══════════╪═══════════╪═════════════════════════════════╪══════════════════╪═════════════╡
│ 3950001   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9b5f-1234bd… ┆ CH               ┆ 1976-04-13  │
│ 3950001   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9b5f-1234bd… ┆ CH               ┆ 1976-04-13  │
│ 3950001   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9b5f-1234bd… ┆ CH               ┆ 1976-04-13  │
│ 3950002   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9b5f-1234bd… ┆ CH               ┆ 1976-04-13  │
│ 3950002   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9b5f-1234bd… ┆ CH               ┆ 1976-04-13  │
│ 3950002   ┆ A63       ┆ 8f95fbe8-16c8-11ed-9

In [14]:

import numpy as np

# ------------------------------------------------------
# 1. Agregar columna de año
# ------------------------------------------------------
df_full = df_full.with_columns([
    pl.col("patent_date").dt.year().alias("year")
])

# ------------------------------------------------------
# 2. Crear grid completo país × clase × año
# ------------------------------------------------------
countries = df_full.select("disambig_country").unique()
classes = df_full.select("cpc_class").unique()
years = df_full.select("year").unique()

grid = countries.join(classes, how="cross").join(years, how="cross")

# ------------------------------------------------------
# 3. Calcular componentes RCA con nunique (por patente_id)
# ------------------------------------------------------
df_counts = df_full.group_by(["disambig_country", "cpc_class", "year"]).agg(
    pl.col("patent_id").n_unique().alias("patents_country_class_year")
)

df_total_country_year = df_full.group_by(["disambig_country", "year"]).agg(
    pl.col("patent_id").n_unique().alias("total_country_year")
)

df_total_class_year = df_full.group_by(["cpc_class", "year"]).agg(
    pl.col("patent_id").n_unique().alias("total_class_year")
)

df_total_year = df_full.group_by("year").agg(
    pl.col("patent_id").n_unique().alias("total_global_year")
)

# ------------------------------------------------------
# 4. Unir todo al grid y calcular RCA
# ------------------------------------------------------
df_rca = (
    grid
    .join(df_counts, on=["disambig_country", "cpc_class", "year"], how="left")
    .join(df_total_country_year, on=["disambig_country", "year"], how="left")
    .join(df_total_class_year, on=["cpc_class", "year"], how="left")
    .join(df_total_year, on="year", how="left")
    .fill_null(0)
    .with_columns([
        (
            (pl.col("patents_country_class_year") / pl.col("total_country_year")) /
            (pl.col("total_class_year") / pl.col("total_global_year"))
        ).alias("rca")
    ])
    .with_columns([
        (pl.col("rca") > 1).cast(pl.Int8).alias("rca_binary")
    ])
)

# ------------------------------------------------------
# 5. Pivotear a matriz país × clase → rca_binary
# ------------------------------------------------------
df_rca_full = df_rca.select(["disambig_country", "cpc_class", "rca_binary",'year']).to_pandas()




KeyboardInterrupt: 

In [None]:
print(df_rca_full.head(10))

  disambig_country cpc_class  rca_binary  year
0               AU       C10           1  1986
1               AU       C10           0  1983
2               AU       C10           0  1989
3               AU       C10           0  1980
4               AU       C10           0  1977
5               AU       C10           0  1995
6               AU       C10           0  2010
7               AU       C10           0  2007
8               AU       C10           0  1998
9               AU       C10           0  1992
(1157772, 4)
Index(['disambig_country', 'cpc_class', 'rca_binary', 'year'], dtype='object')


In [None]:
import numpy as np
import pandas as pd

def calc_discrete_proximity(mcp, asymmetric=False):
    """
    Calculate product proximity matrix φᵢⱼ using the binary RCA matrix (MCP).

    Parameters:
    - mcp: 2D numpy array (rows = locations, columns = technologies), values are 0 or 1
    - asymmetric: bool, whether to return asymmetric proximity (default: False for symmetric)

    Returns:
    - φᵢⱼ proximity matrix as a numpy array
    """
    # Ubiquity: number of locations specialized in each technology
    ubiquity = mcp.sum(axis=0)

    # Compute φᵢⱼ = (# co-occurrence) / (# total occurrences of j)
    phi = mcp.T @ mcp  # co-occurrence matrix
    with np.errstate(divide='ignore', invalid='ignore'):
        phi = phi / ubiquity[np.newaxis, :]  # divide each column j by ubiquity of j
        phi[np.isnan(phi)] = 0.0  # handle divisions by zero

    if not asymmetric:
        phi = np.minimum(phi, phi.T)  # make symmetric

    return phi


In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors


def calc_density(rca_or_mcp, proximity_mat, knn=None):
    """Calculate density, as defined by Hidalgo et. al. (2007)

    Args:
        rca_or_mcp: numpy array of RCA (if continuous product proximities are
            used), else Mcp
        proximity_mat: product proximity matrix
        knn: number of nearest neighbors to consider for density calculation (optional)

    Returns:
        numpy array of same shape as proximity_mat corresponding to density of
        each product
    """
    if knn is None:
        den = np.nansum(proximity_mat, axis=1)[np.newaxis, :]
        # density = rca_or_mcp @ (proximity_mat / den)
        density = rca_or_mcp @ (proximity_mat.T / den)
    else:
        # Convert proximity matrix to a distance matrix
        distance_mat = 1 - proximity_mat
        # Get proximity to k nearest neighbors
        nbrs = NearestNeighbors(n_neighbors=knn, metric="precomputed").fit(distance_mat)
        distance_knn, indices_knn = nbrs.kneighbors()
        # Get proximity
        proximity_knn = 1 - distance_knn
        # Calculate density
        # Get denominator
        den = np.nansum(proximity_knn, axis=1)
        density = []
        for i, row in enumerate(indices_knn):
            # Use row to subset rca_or_mcp
            rca_knn_p = rca_or_mcp[np.arange(rca_or_mcp.shape[0])[:, np.newaxis], row]
            # Get distance_knn for this row
            proximity_knn_row = proximity_knn[i]
            # Divide by den
            proximity_knn_row = proximity_knn_row / den[i]
            # Multiply each row of rca_knn_p by proximity_knn_row
            num_p = rca_knn_p * proximity_knn_row
            # Sum across columns
            density_p = np.nansum(rca_knn_p, axis=1)
            density.append(density_p)
        density = np.array(density).T
    return density

In [None]:
def add_relatedness_density(df_rca_full):
    all_density = []

    for year in sorted(df_rca_full['year'].unique()):
        # Filtrar por año
        df_year = df_rca_full[df_rca_full['year'] == year]
        
        # Crear matriz MCP (país × clase)
        mcp_df = df_year.pivot_table(index='disambig_country',
                                     columns='cpc_class',
                                     values='rca_binary',
                                     fill_value=0)
        mcp = mcp_df.values

        # Calcular relatedness φᵢⱼ
        phi = calc_discrete_proximity(mcp)

        # Calcular density
        density = calc_density(mcp, phi)

        # Volver a DataFrame
        df_density = pd.DataFrame(density, index=mcp_df.index, columns=mcp_df.columns).reset_index()
        df_density = df_density.melt(id_vars='disambig_country',
                                     var_name='cpc_class',
                                     value_name='relatedness_density')
        df_density['year'] = year
        all_density.append(df_density)

    # Unir resultados y agregar al df_rca_full
    df_density_all = pd.concat(all_density, ignore_index=True)
    df_rca_full = df_rca_full.merge(df_density_all, on=['disambig_country', 'cpc_class', 'year'], how='left')

    return df_rca_full


In [None]:
df_rca = add_relatedness_density(df_rca_full)
print(df_rca_full.head(10))
print(df_rca_full.shape)

NameError: name 'df_rca_full' is not defined

In [None]:
# Asegúrate de que ambas columnas tengan el mismo tipo y nombre
# Renombrar columnas si es necesario
df_rd_all=pd.read_csv("relatedness_density.csv")
df_rd_all.rename(columns={"disambig_country": "country"}, inplace=True)
df_model_input.rename(columns={"disambig_country": "country"}, inplace=True)

# Unir ambos dataframes en base a país, tecnología y año

##Esta bien que haga el merge con el dataset original del RCA para construir el dataset final?
df_model_merged = pd.merge(
    df_model_input,
    df_rd_all,
    how="left",
    on=["country", "cpc_class", "year"]
)
df_model_merged["relatedness_density"] = df_model_merged["relatedness_density"].fillna(0.0)

print(df_model_merged.head(10).reset_index(drop=True),df_model_merged.shape)

  country cpc_class  year  patents_country_class_year  total_country_year  \
0      AU       C10  1986                           3                 191   
1      AU       C10  1983                           3                 134   
2      AU       C10  1989                           2                 286   
3      AU       C10  1980                           1                 158   
4      AU       C10  1977                           1                 128   
5      AU       C10  1995                           2                 312   
6      AU       C10  2010                           1                1402   
7      AU       C10  2007                           3                 998   
8      AU       C10  1998                           2                 469   
9      AU       C10  1992                           2                 242   

   total_class_year  total_global_year       rca  rca_binary  \
0               859              58155  1.063364           1   
1              1082     

In [None]:
df_model_merged.drop(
    columns=[
        "patents_country_class_year",
        "total_country_year",
        "total_class_year",
        "total_global_year"
    ],
    inplace=True
)


In [None]:
print(df_model_merged.head(10).reset_index(drop=True),df_model_merged.shape)

  country cpc_class  year       rca  rca_binary  relatedness_density
0      AU       C10  1986  1.063364           1            58.034941
1      AU       C10  1983  0.968687           0            32.913774
2      AU       C10  1989  0.514812           0            50.476317
3      AU       C10  1980  0.305670           0            30.711321
4      AU       C10  1977  0.365882           0            29.859741
5      AU       C10  1995  0.672672           0            43.895990
6      AU       C10  2010  0.136889           0            44.333343
7      AU       C10  2007  0.755925           0            33.140847
8      AU       C10  1998  0.604990           0            53.945137
9      AU       C10  1992  0.759861           0            53.731640 (1157772, 6)


In [None]:
df_model_merged = df_model_merged.sort_values(by=["country", "cpc_class", "year"])

def compute_Mt(group):
    group = group.sort_values("year")
    group["M_t"] = (
        (group["rca_binary"] >= 1) & ##condicion actual
        (group["rca_binary"].shift(1) < 1) & # no la tenía en t−1
        (group["rca_binary"].shift(2) < 1) & # ni en t−2
        (group["rca_binary"].shift(3) < 1)  # ni en t−3
    ).astype(int)
    return group

df_model_merged = df_model_merged.groupby(["country", "cpc_class"], group_keys=False).apply(compute_Mt)


In [None]:
import pycountry

# Función para convertir de Alpha-2 a Alpha-3
def alpha2_to_alpha3(code):
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except:
        return None

# Reemplazar directamente la columna 'country'
df_model_merged["country"] = df_model_merged["country"].apply(alpha2_to_alpha3)
print(df_model_merged.head(10))

        country cpc_class  year  rca  rca_binary  relatedness_density  M_t
1134088     AND       A01  1976  NaN           1                  0.0    0
1134060     AND       A01  1977  NaN           1                  0.0    0
1134073     AND       A01  1978  NaN           1                  0.0    0
1134090     AND       A01  1979  NaN           1                  0.0    0
1134059     AND       A01  1980  NaN           1                  0.0    0
1134072     AND       A01  1981  NaN           1                  0.0    0
1134091     AND       A01  1982  NaN           1                  0.0    0
1134057     AND       A01  1983  NaN           1                  0.0    0
1134080     AND       A01  1984  NaN           1                  0.0    0
1134093     AND       A01  1985  NaN           1                  0.0    0


In [None]:

df_gdp_raw = pd.read_csv("GDP.csv", skiprows=4, header=0)
df_gdp = df_gdp_raw.melt(
    id_vars=["Country Code"], 
    var_name="year", 
    value_name="gdp_per_capita"
)

# Convertir año (strings como '1960', '2020') a enteros, ignorando errores
df_gdp["year"] = pd.to_numeric(df_gdp["year"], errors="coerce")

# Eliminar filas donde year no sea un número
df_gdp = df_gdp.dropna(subset=["year"])

# Cambiar tipo a entero limpio
df_gdp["year"] = df_gdp["year"].astype(int)

# Renombrar
df_gdp.rename(columns={"Country Code": "country"}, inplace=True)
print(df_gdp.head(10))

### GDP > x

    country  year gdp_per_capita
798     ABW  1960            NaN
799     AFE  1960     186.132432
800     AFG  1960            NaN
801     AFW  1960     121.938353
802     AGO  1960            NaN
803     ALB  1960            NaN
804     AND  1960            NaN
805     ARB  1960            NaN
806     ARE  1960            NaN
807     ARG  1960            NaN


In [None]:
df_input_ML = df_model_merged.merge(df_gdp, on=["country", "year"], how="left")
green_classes = [
    "A01", "A23", "A43", "A47", "A61", "B01", "B03", "B09", "B22", "B29", "B30",
    "B62", "B63", "B65", "C02", "C03", "C04", "C05", "C08", "C09", "C10", "C12",
    "C21", "C22", "D01", "D21", "E01", "E02", "E03", "F01", "F02", "F16", "F17",
    "F23", "F27", "G01", "G08", "H01", "Y02"
]

ket_classes = [
    # Nanotechnology
    "Y01", "B82", 
    
    # Micro- and Nanoelectronics
    "H01", "H05", "F21", "Y01", "H03", 
    
    # Photonics
    "F21", "G02", "H01", "H02", "H05", 
    
    # Industrial Biotechnology
    "C02", "C07", "C12", "C08", "C09", "G01","A61",
    
    # Advanced Materials
    "B32", "C01", "C04", "C08", "C22", "D21", "H01","Y01",
    
    # Advanced Manufacturing Technologies
    "B03", "B06", "B07", "B23", "G01", "G05", "G06","G07", "G08",
    "A21", "A22", "A23", "A24", "A41", "A42", "A43", "B01", "B02", "B03", "B05",
    "B07", "B08", "B21", "B22", "B23", "B24", "B25", "B26", "B27", "B28", "B30", "B31",
    "B41", "B42", "B44", "B65", "B67", "B68", "C13", "C14", "C23", "D01", "D02", "D03",
    "D04", "D05", "D06", "D21", "E01", "E02", "E21", "F04", "F16", "F26", "G01", "H05"
]

df_input_ML["is_green"] = df_input_ML["cpc_class"].isin(green_classes).astype(int)

print(df_input_ML.head(10).reset_index(drop=True),df_input_ML.shape)


  country cpc_class  year  rca  rca_binary  relatedness_density  M_t  \
0     AND       A01  1976  NaN           1                  0.0    0   
1     AND       A01  1977  NaN           1                  0.0    0   
2     AND       A01  1978  NaN           1                  0.0    0   
3     AND       A01  1979  NaN           1                  0.0    0   
4     AND       A01  1980  NaN           1                  0.0    0   
5     AND       A01  1981  NaN           1                  0.0    0   
6     AND       A01  1982  NaN           1                  0.0    0   
7     AND       A01  1983  NaN           1                  0.0    0   
8     AND       A01  1984  NaN           1                  0.0    0   
9     AND       A01  1985  NaN           1                  0.0    0   

  gdp_per_capita  is_green  
0    7721.288586         1  
1    8167.922862         1  
2    9409.508513         1  
3   11996.407286         1  
4   12474.925292         1  
5   10465.260464         1  
6   