# Figure plotting

Code for reproducing the main figures from the SciSciNet data. All figures use the same dataframe, merged from the processed data, which is stored at "data/processed/PaperID_KI2-Dopen_nok_control.pickle".

## Figure 1B-C
Loading data

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
pre_path = os.path.abspath(r"..")


# Metric variants that identify the processed file to load
DC_type = 'Dopen_nok'  # options: 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # options: 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_control.pickle' % (pre_path, KI_type, DC_type)

# Read the merged dataframe
df = pd.read_pickle(file_path)

# Selection criteria: document types and publication years to keep
# (all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset'])
selected_doctypes = ['Journal', 'Conference']
# years 1950-2021, so that the latest papers have at least 1 year of citation history
selected_pubyears = list(range(1950, 2022))
is_selected_doctype = df['DocType'].isin(selected_doctypes)
is_selected_year = df['Year'].isin(selected_pubyears)
selected_df = df[is_selected_doctype & is_selected_year]

# df_B_C keeps only [KI_type, 'Year', 'Field'] (complete rows) for Figures 1B and 1C
df_B_C = selected_df[[KI_type, 'Year', 'Field']].dropna()

Plotting

In [None]:
# One colour per field: 10 "deep" colours followed by 9 "vlag" colours
palette_a = sns.color_palette("deep", 10) + sns.color_palette("vlag", 9)

# Two side-by-side panels: Figure 1B (left) and Figure 1C (right)
nrows_base, ncols_base = 1, 2
fig, [ax1, ax2] = plt.subplots(nrows=nrows_base, ncols=ncols_base, figsize=(10, 4))
fig.subplots_adjust(hspace=.35, wspace=.35)  # hspace,wspace


# Plot Figure 1B: Distribution of KI values
sns.histplot(data=df_B_C, x=KI_type, bins=50, kde=False, stat='probability', color='#96B6D8', ax=ax1)
ax1.set_ylabel("Fraction")
ax1.set_xlabel("KI")
ax1.set_xlim(-1.05, 1.05)
ax1.set_xticks(np.arange(-1, 1.1, 0.5))


# Plot Figure 1C: Temporal pattern of KI values across fields
field_list = ['Physics', 'Biology', 'Medicine', 'Chemistry', 'Psychology', 'Engineering', 'Mathematics', 'Sociology', 'Economics', 'Philosophy', 'Geography',
              'Art', 'History', 'Geology', 'Business', 'Political science', 'Materials science', 'Computer science', 'Environmental science']
sns.lineplot(data=df_B_C, x='Year', y=KI_type, hue='Field', hue_order=field_list, palette=palette_a,
             dashes=False, legend='full', ax=ax2)
ax2.set_ylabel("KI")
ax2.set_xlabel("Year")
ax2.set_xlim(1949, 2021+2)
ax2.set_xticks(range(1950, 2021, 10))

# Set the legend for Figure 1C: split the entries into two separately placed legends
handles, labels = ax2.get_legend_handles_labels()
cutoff = 11
first_legend_handles = handles[:cutoff]
first_legend_labels = labels[:cutoff]
second_legend_handles = handles[cutoff:]
second_legend_labels = labels[cutoff:]
# Styling shared by both legends
legend_style = dict(title_fontsize=10, ncol=1, fontsize=6, shadow=False, frameon=False,
                    fancybox=False, handlelength=1, framealpha=0.8)
# Create the first legend
first_legend = ax2.legend(first_legend_handles, first_legend_labels, loc='lower left', **legend_style)
# The next ax2.legend() call replaces the axes' current legend, so the first
# one is re-attached as a plain artist to keep both visible.
ax2.add_artist(first_legend)
# Create the second legend
second_legend = ax2.legend(second_legend_handles, second_legend_labels, loc='lower center',
                           bbox_to_anchor=(0.42, 0), **legend_style)


# Add the panel letters "B" and "C" above the subplots
note_size = 25
x_pos, y_pos = -0.2, 1.16  # position of the annotation in axes fraction
for panel_ax, panel_letter in ((ax1, 'B'), (ax2, 'C')):
    panel_ax.annotate(panel_letter, xy=(x_pos, y_pos), xycoords='axes fraction',
                      xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                      fontsize=note_size, fontfamily='Arial', color='black', fontweight='bold')

# The output path must be formatted with pre_path; the bare '%s/...' literal
# would be used verbatim as a directory name. Saving happens before show()
# so the file is written from the fully drawn figure regardless of backend.
fig.savefig('%s/results/figures/Fig1b_c.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()

## Figure 2A-D

In [None]:
import os
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
pre_path = os.path.abspath(r"..")


def rank(J_):
    """
    Map every value in J_ to its percentile rank, keeping the input order.

    The percentile of the k-th smallest value (1-based) is k*100/len(J_).
    Values that occur several times all receive the rank of their highest
    position in the sorted order.
    """
    total = len(J_)
    # Later (higher) positions overwrite earlier ones for repeated values,
    # so each distinct value ends up with its highest percentile.
    percentile_of = {value: position * 100 / total
                     for position, value in enumerate(sorted(J_), start=1)}
    return [percentile_of[value] for value in J_]


def rank_bin(J_,bin_rank_list,label_rank_list):
    """
    Bin the percentile ranks of J_ into labelled intervals.

    The values are first converted to percentile ranks with rank(); the ranks
    are then cut at the edges in bin_rank_list (lowest edge included) and each
    interval is given the matching entry of label_rank_list.
    """
    return pd.cut(
        x=rank(J_),
        bins=bin_rank_list,
        labels=label_rank_list,
        include_lowest=True,
    )
    

def binary_check(D_list):
    """
    Flag the strictly positive entries of D_list.

    Returns a list of the same length containing 1 where the entry is
    greater than 0 and 0 everywhere else.
    """
    return [1 if D_list[pos] > 0 else 0 for pos in range(len(D_list))]


def convert_to_decade(year):
    """
    Convert a year to its corresponding decade label.

    For example, 1987 becomes '1980s', 2001 becomes '2000s'. Whole-number
    years stored as floats or NumPy integers (e.g. 1987.0) are accepted too.

    Returns None when `year` is not a whole number (None, NaN, strings,
    booleans, fractional values), so such entries become missing values
    instead of the literal label 'Nones'.
    """
    # bool is a subclass of int but is never a meaningful year
    if isinstance(year, bool):
        return None
    try:
        year_int = int(year)
    except (TypeError, ValueError, OverflowError):
        # None / pd.NA (TypeError), NaN or non-numeric text (ValueError), inf (OverflowError)
        return None
    # Rejects fractional years (1987.5) and numeric strings ('1987')
    if year_int != year:
        return None
    return f"{(year_int // 10) * 10}s"

Loading data

In [None]:
# Set parameters
DC_type = 'Dopen_nok'  # 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_control.pickle' % (pre_path, KI_type, DC_type)

# Load the data
df = pd.read_pickle(file_path)

# setup the selection criteria
selected_doctypes = ['Journal', 'Conference']  # all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_pubyears = list(range(1950, 2022))    # to ensure that the latest papers have at least 1 year of citation history
selected_df = df[(df['DocType'].isin(selected_doctypes)) & (df['Year'].isin(selected_pubyears))]

# drop rows with NaN values in the columns [KI_type, DC_type];
# the explicit copy makes the column assignments below act on an independent frame
selected_df = selected_df.dropna(subset=[KI_type, DC_type]).copy()

# calculate the percentile ranks and binary tags of KI and DC
# (20 bins of width 5, each labelled by its midpoint)
bin_rank_list = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
label_rank_list = [2.5,7.5,12.5,17.5,22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5,77.5,82.5,87.5,92.5,97.5]
# Use the configured column names (KI_type / DC_type) rather than hard-coded
# 'KI' / 'DC', so these columns are the same ones the NaN filter above used
# and the cell follows the parameter settings.
selected_df['KI_percentile_bin'] = rank_bin(list(selected_df[KI_type]), bin_rank_list, label_rank_list)
selected_df['DC_percentile'] = rank(list(selected_df[DC_type]))
selected_df['DC_positive_tag'] = binary_check(list(selected_df[DC_type]))
# split C5 into five equal-sized quantile groups
selected_df['C5_percentile_bin'] = pd.qcut(x=selected_df['C5'], q=5, labels=['0-20%','20-40%','40-60%','60-80%','80-100%'])
# convert the 'Year' column to decade format
selected_df['Decade'] = selected_df['Year'].apply(convert_to_decade)

Plotting

In [None]:
# Build the 2x2 panel grid (A, B on top; C, D below)
nrows, ncols = 2, 2
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 4*2))
fig.subplots_adjust(hspace=.35, wspace=.35)  # hspace,wspace

# Plot Figure 2A: relationship between Disruption and Knowledge Independence.
color_a, color_b = '#245297', '#62AA67'
sns.lineplot(data=selected_df, x='KI_percentile_bin', y="DC_percentile", color=color_a, ax=ax1)
ax1.set_xlabel('KI percentile')
ax1.set_ylabel('Disruption percentile', color=color_a)
ax1.tick_params(axis='y', labelcolor=color_a)

# Setup the right y axis for Figure 2A
ax11 = ax1.twinx()
sns.lineplot(data=selected_df, x='KI_percentile_bin', y='DC_positive_tag', color=color_b, ax=ax11)
ax11.set_ylabel('Disruption positive ratio', color=color_b)
ax11.tick_params(axis='y', labelcolor=color_b)

# Add the panel letter for Figure 2A
ax1.annotate(chr(97 + 0).upper(), xy=(-0.2, 1.16), xycoords='axes fraction',
             xytext=(5, -5), textcoords='offset points', ha='left', va='top',
             fontsize=25, fontfamily='Arial', color='black', fontweight='bold')


# Plot Figure 2B, 2C, 2D: Conditioning on paper's impact C5, publication decade, and field
condition_list = ['C5_percentile_bin', 'Decade', 'Field']  # Conditions for Figure 2B, 2C, 2D
condition_labels_list = [['0-20%','20-40%','40-60%','60-80%','80-100%'],
                         ['1950s','1960s','1970s','1980s','1990s','2000s','2010s'],
                         ['Physics', 'Biology', 'Medicine', 'Chemistry', 'Psychology', 'Engineering', 'Mathematics',
                          'Sociology', 'Economics', 'Philosophy', 'Geography', 'Art', 'History', 'Geology', 'Business',
                          'Political science', 'Materials science', 'Computer science', 'Environmental science']]
palette_list = [sns.color_palette('Blues', 5),
                sns.color_palette('OrRd', 7),
                sns.color_palette("deep", 10) + sns.color_palette("vlag", 9)]
legend_title_list = ['Impact $C_{5}$ percentile', '', '']
panel_axes = [ax2, ax3, ax4]
for idx in range(len(condition_list)):
    condition = condition_list[idx]
    condition_labels = condition_labels_list[idx]
    palette_ = palette_list[idx]
    legend_title = legend_title_list[idx]
    ax = panel_axes[idx]  # Select the corresponding axis for each condition

    # Keep exactly the columns that are plotted below; selecting the raw
    # KI_type / DC_type columns here would leave 'KI_percentile_bin' and
    # 'DC_percentile' out of the frame handed to seaborn.
    df_condition = selected_df[['KI_percentile_bin', 'DC_percentile', condition]]
    df_condition = df_condition.dropna()  # drop rows with NaN values in the plotted columns

    # Plot the conditioned curves (one line per level of the condition)
    sns.lineplot(data=df_condition, x='KI_percentile_bin', y="DC_percentile", hue=condition,
                 hue_order=condition_labels, palette=palette_, dashes=False, legend='full', ax=ax)
    ax.set_xlabel('KI percentile')
    ax.set_ylabel("Disruption percentile")

    # set the legend for Figure 2B, 2C
    if condition != 'Field':
        ax.legend(title=legend_title, title_fontsize=10, loc='best', ncol=1, fontsize=8,
                  shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
    # set custom legend for Figure 2D: split the entries into two separately placed legends
    else:
        handles, labels = ax.get_legend_handles_labels()
        cutoff = 11
        first_legend_handles = handles[:cutoff]
        first_legend_labels = labels[:cutoff]
        second_legend_handles = handles[cutoff:]
        second_legend_labels = labels[cutoff:]
        # Create the first legend
        first_legend = ax.legend(first_legend_handles, first_legend_labels, loc='upper left', title_fontsize=10, ncol=1, fontsize=7,
                                 shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
        # The next ax.legend() call replaces the axes' current legend, so the
        # first one is re-attached as a plain artist to keep both visible.
        ax.add_artist(first_legend)
        # Create the second legend
        second_legend = ax.legend(second_legend_handles, second_legend_labels, loc='upper center', bbox_to_anchor=(0.5, 1), title_fontsize=10, ncol=1, fontsize=7,
                                  shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # Add the panel letter (B, C, D) for the subplot
    ax.annotate(chr(97 + idx+1).upper(), xy=(-0.2, 1.16), xycoords='axes fraction',
                xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                fontsize=25, fontfamily='Arial', color='black', fontweight='bold')

# The output path must be formatted with pre_path; the bare '%s/...' literal
# would be used verbatim as a directory name. Saving happens before show()
# so the file is written from the fully drawn figure regardless of backend.
fig.savefig('%s/results/figures/Fig2a_d.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()

## Figure 2E

In [None]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns


def rank(J_):
    """
    Return the percentile rank of each value in J_, in input order.

    The k-th smallest value (1-based) has percentile k*100/len(J_). A value
    that appears multiple times is assigned the rank of its highest position
    in the sorted order.
    """
    n_values = len(J_)
    highest_rank = {}
    # Walking the sorted values upwards lets repeated values keep the
    # percentile of their last (highest) occurrence.
    for position, value in enumerate(sorted(J_), start=1):
        highest_rank[value] = position * 100 / n_values
    return [highest_rank[value] for value in J_]


def rank_bin(J_):
    """
    Bin the percentile ranks of J_ into 20 fixed intervals of width 5.

    The values are converted to percentile ranks with rank(); each rank is
    then placed in one of the intervals 0-5, 5-10, ..., 95-100 (lowest edge
    included) and labelled with the interval midpoint (2.5, 7.5, ..., 97.5).
    """
    # Bin edges 0, 5, ..., 100 and the midpoint label of every bin
    bin_rank_list = list(range(0, 101, 5))
    label_rank_list = [lower_edge + 2.5 for lower_edge in range(0, 100, 5)]

    return pd.cut(x=rank(J_), bins=bin_rank_list, labels=label_rank_list, include_lowest=True)
    

def binary_check(D_list):
    """
    Turn D_list into a 0/1 list marking which entries are greater than 0.

    The result has the same length as D_list: 1 for entries > 0, else 0.
    """
    flags = []
    for pos in range(len(D_list)):
        is_positive = D_list[pos] > 0
        flags.append(1 if is_positive else 0)
    return flags


def plot_p(DF_data,DF_p_value,KI_size_bins,ax,width=16,heigth=12,widthx = 0, widthy = -0.15, annot_fontsize=5.5):
    """
    Draw DF_data on `ax` as an annotated heatmap and star the significant cells.

    The colour scale is symmetric around 0 and spans +/- the largest absolute
    value in DF_data. Both axes are labelled with KI_size_bins. For every pair
    of x tick `m` and y tick `n` with m != n, the p-value at
    DF_p_value.values[int(m), int(n)] decides the marker written at
    (n + widthx, m + widthy):
        '*'   for 0.01  <= p < 0.05
        '**'  for 0.001 <= p < 0.01
        '***' for p < 0.001
    `width` and `heigth` are accepted but not used by the body.
    """
    # Symmetric colour limits so that 0 sits at the centre of the colormap
    color_limit = DF_data.abs().max().max()
    sns.heatmap(DF_data, annot=True, cmap='vlag', center=0, vmax=color_limit, vmin=-color_limit,
                square=True, fmt='.2f', ax=ax, cbar=False,
                annot_kws={"color": "k", "fontsize": annot_fontsize})

    # Titles and tick labels
    ax.set_title('ATT on Disruption percentile')
    ax.set_xlabel('Controlled KI percentile')
    ax.set_ylabel('Treated KI percentile')
    ax.set_xticklabels(KI_size_bins, rotation = 0, horizontalalignment='center', fontsize = 'x-small')
    ax.set_yticklabels(KI_size_bins, rotation = 0, horizontalalignment='right', fontsize = 'x-small')

    # Significance markers; pairs with equal tick positions are skipped
    for m in ax.get_xticks():
        for n in ax.get_yticks():
            if m == n:
                continue
            pv = DF_p_value.values[int(m), int(n)]
            if pv < 0.001:
                stars = '***'
            elif pv < 0.01:
                stars = '**'
            elif pv < 0.05:
                stars = '*'
            else:
                # not significant (or NaN): no marker
                continue
            ax.text(n+widthx, m+widthy, stars, ha = 'center', color = 'k', fontsize = annot_fontsize)
    plt.tight_layout()

In [None]:
# This cell reads from and writes to paths under pre_path; define it here
# (same value as in the other figure cells) so the cell also runs on its own.
import os
pre_path = os.path.abspath(r"..")

# Define the regression type and treatment effect type
data_type = 'raw'       # data_type: raw or normalized
regression_type = 'glm' # regression_type: glm or logit
treat_effect_type='ATT' # treat_effect_type: ATT or ATC

# Load the data: one result file per control bin, stored as one column each
KI_size_bins = [2.5,7.5,12.5,17.5,22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5,77.5,82.5,87.5,92.5,97.5]
DF_data = pd.DataFrame(columns=KI_size_bins, index=KI_size_bins)    # DataFrame to store effect sizes
DF_p_value = pd.DataFrame(columns=KI_size_bins, index=KI_size_bins) # DataFrame to store p-values
for control_size in KI_size_bins:
    df_temp = pd.read_csv('%s/results/results_for_tables/PSM_Analysis_Results/ALL_%s_%s_%s.csv'%(pre_path,control_size,treat_effect_type,regression_type))
    DF_data[control_size] = df_temp['Effect Size'].tolist()
    DF_p_value[control_size] = df_temp['P value'].tolist()

# Plot the heatmap
fig,ax5=plt.subplots(nrows=1, ncols=1, figsize=(10*.75, 8*.75))
fig.subplots_adjust(hspace=.3,wspace=.3) # hspace,wspace
plot_p(DF_data,DF_p_value,KI_size_bins,ax5)

# Add the panel letter "E" for the subplot
note_size = 25
x_pos, y_pos = -0.2*.65, 1+.16*.5
ax5.annotate(chr(97 + 4).upper(), xy=(x_pos, y_pos), xycoords='axes fraction',
            xytext=(5, -5), textcoords='offset points', ha='left', va='top',
            fontsize=note_size*.75, fontfamily='Arial', color='black', fontweight='bold')

# The output path must be formatted with pre_path; the bare '%s/...' literal
# would be used verbatim as a directory name. Saving happens before show()
# so the file is written from the fully drawn figure regardless of backend.
fig.savefig('%s/results/figures/Fig2e.pdf' % pre_path, bbox_inches='tight',dpi=600,pad_inches=0.0)
plt.show()

## Figure 3

In [None]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from scipy.stats import linregress
pre_path = os.path.abspath(r"..")


def rank(J_):
    """
    Compute percentile ranks for the values in J_ (output in input order).

    The k-th smallest value (1-based) gets k*100/len(J_). When a value occurs
    more than once, every occurrence gets the rank of its highest position in
    the sorted order.
    """
    ordered = sorted(J_)
    count = len(J_)
    # dict() keeps the last pair seen for a repeated key, i.e. the highest rank
    rank_lookup = dict(
        (value, (index + 1) * 100 / count) for index, value in enumerate(ordered)
    )
    return [rank_lookup[item] for item in J_]


def rank_bin(J_,bin_rank_list,label_rank_list):
    """
    Assign each value in J_ to a labelled percentile-rank interval.

    rank() turns the values into percentile ranks; pd.cut then places every
    rank into the intervals delimited by bin_rank_list (lowest edge included)
    and names the intervals with label_rank_list.
    """
    percentile_ranks = rank(J_)
    binned = pd.cut(percentile_ranks, bin_rank_list, labels=label_rank_list, include_lowest=True)
    return binned
    

def binary_check(D_list):
    """
    Build a binary indicator list from D_list.

    Each position holds 1 if the corresponding entry of D_list is strictly
    greater than 0, and 0 otherwise.
    """
    size = len(D_list)
    indicator = [0] * size
    pos = 0
    while pos < size:
        if D_list[pos] > 0:
            indicator[pos] = 1
        pos += 1
    return indicator


def convert_to_decade(year):
    """
    Convert a year to its corresponding decade label.

    For example, 1987 becomes '1980s', 2001 becomes '2000s'. Whole-number
    years stored as floats or NumPy integers (e.g. 1987.0) are accepted too.

    Returns None when `year` is not a whole number (None, NaN, strings,
    booleans, fractional values), so such entries become missing values
    instead of the literal label 'Nones'.
    """
    # bool is a subclass of int but is never a meaningful year
    if isinstance(year, bool):
        return None
    try:
        year_int = int(year)
    except (TypeError, ValueError, OverflowError):
        # None / pd.NA (TypeError), NaN or non-numeric text (ValueError), inf (OverflowError)
        return None
    # Rejects fractional years (1987.5) and numeric strings ('1987')
    if year_int != year:
        return None
    return f"{(year_int // 10) * 10}s"

Loading data

In [None]:
# Set parameters
DC_type = 'Dopen_nok'  # 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_control.pickle' % (pre_path, KI_type, DC_type)

# Load the data
df = pd.read_pickle(file_path)

# setup the selection criteria
selected_doctypes = ['Journal', 'Conference']  # all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_pubyears = list(range(1950, 2022))    # to ensure that the latest papers have at least 1 year of citation history
selected_df = df[(df['DocType'].isin(selected_doctypes)) & (df['Year'].isin(selected_pubyears))]

# drop rows with NaN values in the columns [KI_type, DC_type];
# the explicit copy makes the column assignments below act on an independent frame
selected_df = selected_df.dropna(subset=[KI_type, DC_type]).copy()

# calculate the percentile ranks of KI and DC, and the KI quintile bins
bin_rank_list = [0,20,40,60,80,100]
label_rank_list = ['0-20%','20-40%','40-60%','60-80%','80-100%']
selected_df['KI_percentile'] = rank(list(selected_df[KI_type]))
selected_df['DC_percentile'] = rank(list(selected_df[DC_type]))
selected_df['KI_percentile_bin'] = rank_bin(list(selected_df[KI_type]), bin_rank_list, label_rank_list)

# convert other variables to proper formats
selected_df['Decade'] = selected_df['Year'].apply(convert_to_decade)
# rescale the KI_percentile to relative values by dividing by the mean of KI_percentile in the same decade
selected_df['Relative_KI_percentile'] = selected_df.groupby('Decade')['KI_percentile'].transform(lambda x: x / x.mean())

# team sizes above 10 are pooled into a single value 11
selected_df['Team_Size'] = selected_df['Team_Size'].apply(lambda x: 11 if x > 10 else int(x))

# Both distance encodings must be derived from the raw distances, so the
# fine-grained tag is computed BEFORE 'Team_Distance' is overwritten with the
# binary flag (binning the 0/1 flag would not reflect the distances).
raw_team_distance = selected_df['Team_Distance']
# 0 (onsite): within 100km, 200: 100-300km, 400: 300-500km, 600: 500-700km, 800: 700-900km, 1000: 900-1100km, 1200: beyond 1100km
selected_df['Team_Distance_tag'] = pd.cut(x=raw_team_distance, bins=[0, 100, 300, 500, 700, 900, 1100, float('inf')],
                                          labels=[0, 200, 400, 600, 800, 1000, 1200], right=True, include_lowest=True)
# 0 (onsite): within 100km, 1 (remote): beyond 100km
selected_df['Team_Distance'] = pd.cut(x=raw_team_distance, bins=[0, 100, float('inf')],
                                      labels=[0, 1], right=True, include_lowest=True)
# NOTE(review): astype(int) assumes every row has a Team_Distance value - confirm against the data
selected_df['Team_Distance'] = selected_df['Team_Distance'].astype(int)
selected_df['Team_Distance_tag'] = selected_df['Team_Distance_tag'].astype(int)


Plotting

In [None]:
# Plot Figure 3: conditioning on paper's team size, team distance, and team freshness
# (upper row: panels A-C, lower row: panels D-F)
nrows, ncols = 2, 3
fig, [axs_row1, axs_row2] = plt.subplots(nrows=nrows, ncols=ncols, figsize=(11.5, 7))
# Scale the 0.3 spacing by the column count relative to the 2-column base layout.
# The literal 2 keeps this cell independent of `ncols_base` from the Figure 1 cell.
panel_spacing = .3 * ncols / (2 * 1.25)
fig.subplots_adjust(hspace=panel_spacing, wspace=panel_spacing)  # hspace,wspace

condition_list = ['Team_Size', 'Team_Distance', 'Team_Freshness']
xlabel_list = ['Team size', 'Team distance (km)', 'Team freshness']
xticks_list = [np.arange(1, 12, 2), range(2), np.arange(0, 4, 1)]
xticklabels_list = [['1', '3', '5', '7', '9', '10+'], ['Onsite', 'Remote'], np.arange(0, 4, 1)]
legend_tag = ['full', False, False]
decade_order = ['1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s']
palette_upper = sns.color_palette('OrRd', 7)
palette_lower = sns.color_palette('Blues', 5)

for idx in range(len(condition_list)):
    condition = condition_list[idx]

    df_condition = selected_df[['KI_percentile_bin', 'Relative_KI_percentile', 'DC_percentile', condition, 'Decade']]
    df_condition = df_condition.dropna()  # drop rows with NaN values in the plotted columns

    # ---- Upper row: relative KI percentile per decade ----
    ax_upper = axs_row1[idx]  # Select the corresponding axis for each condition
    sns.lineplot(data=df_condition, x=condition, y="Relative_KI_percentile", hue='Decade',
                 hue_order=decade_order, palette=palette_upper, dashes=False,
                 legend=legend_tag[idx], ax=ax_upper)
    # Add a horizontal line at y=1
    ax_upper.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
    ax_upper.set_xlabel(xlabel_list[idx])
    ax_upper.set_xticks(xticks_list[idx])
    ax_upper.set_xticklabels(xticklabels_list[idx], horizontalalignment='center')
    ax_upper.set_ylabel("Relative KI percentile")
    if legend_tag[idx] == 'full':
        ax_upper.legend(title='', title_fontsize=10, loc='best', ncol=1, fontsize=7,
                        shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # For team distance, add an inset plot to display the detailed distance distribution
    if xlabel_list[idx] == 'Team distance (km)':
        ax_inset = inset_axes(ax_upper, width="76%", height="76%", bbox_to_anchor=(0.08, 0.06, 0.5, 0.5),
                              bbox_transform=ax_upper.transAxes, loc='lower left')
        # A separate frame is used here so that df_condition, which the lower
        # row still needs, is not silently rebound inside this branch.
        df_inset = selected_df[['KI_percentile_bin', 'Relative_KI_percentile', 'DC_percentile', condition, 'Team_Distance_tag', 'Decade']]
        df_inset = df_inset.dropna()
        sns.lineplot(data=df_inset, x='Team_Distance_tag', y="Relative_KI_percentile", hue='Decade',
                     hue_order=decade_order, palette=palette_upper, dashes=False,
                     legend=legend_tag[idx], ax=ax_inset)
        # Add a horizontal line at y=1
        ax_inset.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
        ax_inset.set_ylabel('')
        ax_inset.set_xlabel('')
        ax_inset.set_xticks(np.arange(0, 1300, 200))
        ax_inset.set_xticklabels(['0', '', '', '600', '', '', '1100+'], horizontalalignment='center')

        plt.setp(ax_inset.get_xticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='x', which='both', pad=2)
        plt.setp(ax_inset.get_yticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='y', which='both', pad=2)

    # Add the panel letter (A, B, C) for the upper row
    ax_upper.annotate(chr(97 + idx).upper(), xy=(-0.2, 1.16), xycoords='axes fraction',
                      xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                      fontsize=25, fontfamily='Arial', color='black', fontweight='bold')

    # ---- Lower row: disruption percentile per KI quintile ----
    ax_lower = axs_row2[idx]  # Select the corresponding axis for each condition
    sns.lineplot(data=df_condition, x=condition, y="DC_percentile", hue='KI_percentile_bin', hue_order=label_rank_list,
                 palette=palette_lower, dashes=False, legend=legend_tag[idx], ax=ax_lower)
    # add a linear fit line for each KI category
    for category_idx in range(len(label_rank_list)):
        category = label_rank_list[category_idx]
        subset = df_condition[df_condition['KI_percentile_bin'] == category]
        slope, intercept, r_value, p_value, std_err = linregress(subset[condition], subset['DC_percentile'])
        x_min, x_max = subset[condition].min(), subset[condition].max()
        # A straight line is fully described by its two end points; drawing it
        # through every data row would only inflate the figure and disturb the dash pattern.
        x_fit = np.array([x_min, x_max])
        ax_lower.plot(x_fit, intercept + slope * x_fit, linestyle='--', color=palette_lower[category_idx])
        text_pos = x_min + x_max*0.2
        ax_lower.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color=palette_lower[category_idx], fontsize=8)

    ax_lower.set_xlabel(xlabel_list[idx])
    ax_lower.set_xticks(xticks_list[idx])
    ax_lower.set_xticklabels(xticklabels_list[idx], horizontalalignment='center')
    ax_lower.set_ylabel("Disruption percentile")
    if legend_tag[idx] == 'full':
        ax_lower.legend(title="KI percentile", title_fontsize=8, loc='best', ncol=1, fontsize=7,
                        shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # add inset plot for the lower row of subplots (all KI categories pooled)
    ax_inset = inset_axes(ax_lower, width="50%", height="50%", bbox_to_anchor=(0.5, 0.005, 0.5, 0.5),
                          bbox_transform=ax_lower.transAxes, loc='lower right')
    sns.lineplot(data=df_condition, x=condition, y="DC_percentile", color="#245297",
                 dashes=False, legend=False, ax=ax_inset)
    # add fit line for the overall categories
    slope, intercept, r_value, p_value, std_err = linregress(df_condition[condition], df_condition['DC_percentile'])
    x_min, x_max = df_condition[condition].min(), df_condition[condition].max()
    x_fit = np.array([x_min, x_max])  # two end points are enough for a straight line
    ax_inset.plot(x_fit, intercept + slope * x_fit, linestyle='--')
    text_pos = x_min + x_max*0.2
    # the slope label position is tuned per panel
    if xlabel_list[idx] == 'Team size':
        ax_inset.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color="#245297", fontsize=6)
    elif xlabel_list[idx] == 'Team distance (km)':
        ax_inset.text(text_pos+0.1, intercept + slope*text_pos + 0.5, f's={slope:.2f}', color="#245297", fontsize=6)
    elif xlabel_list[idx] == 'Team freshness':
        ax_inset.text(x_max*0.1, intercept + slope*(x_max*0.8) + 1, f's={slope:.2f}', color="#245297", fontsize=6)

    ax_inset.set_xlabel('')
    ax_inset.set_xlim(x_min-.25, x_max+.25)
    ax_inset.set_xticks([])
    plt.setp(ax_inset.get_xticklabels(), fontsize='x-small')
    ax_inset.tick_params(axis='x', which='both', pad=2)

    ax_inset.set_ylabel('')
    plt.setp(ax_inset.get_yticklabels(), fontsize='x-small')
    ax_inset.tick_params(axis='y', which='both', pad=2)

    # Add the panel letter (D, E, F) for the lower row
    ax_lower.annotate(chr(97 + 3+idx).upper(), xy=(-0.2, 1.16), xycoords='axes fraction',
                      xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                      fontsize=25, fontfamily='Arial', color='black', fontweight='bold')

# The output path must be formatted with pre_path; the bare '%s/...' literal
# would be used verbatim as a directory name. Saving happens before show()
# so the file is written from the fully drawn figure regardless of backend.
fig.savefig('%s/results/figures/Fig3.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()