In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import sys
from scipy.ndimage import uniform_filter1d
import os 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [79]:

# Define phenotypes
phenotypes = ["Cancer", "B cell", "Th", "Tc", "Neutrophils"]
# phenotypes = ["Cancer", "B cell"]

# # Get input arguments
# if len(sys.argv) < 2:
#     print("You must provide an input file")
#     sys.exit(1)
# elif len(sys.argv) == 2:
#     print("Output file not specified. Using default: ./results/step1_1nn_output.tsv")
# else:
#     output_file = sys.argv[2]
output_file = "./results/step1_1nn_output.tsv"

path = os.getcwd()

# output_path = f"{path}/Data/distances/"
input_path = f"{path}/Data/celldata/"
spatial_data_file = f"{input_path}celldata_20240506.csv"

# Load input data
dumdat = pd.read_csv(spatial_data_file)[['Patient_ID', 'celltype','Location_Center_X', 'Location_Center_Y']]
dumdat

Unnamed: 0,Patient_ID,celltype,Location_Center_X,Location_Center_Y
0,LUAD_D001,Cancer,93,159
1,LUAD_D001,Tc,74,186
2,LUAD_D001,Tc,167,131
3,LUAD_D001,Cancer,134,116
4,LUAD_D001,Endothelial cell,142,122
...,...,...,...,...
1640312,LUAD_D416,Cl MAC,281,879
1640313,LUAD_D416,Unknown,385,907
1640314,LUAD_D416,Cancer,363,992
1640315,LUAD_D416,Cancer,357,994


In [81]:

# Function to compute 1-NN distances
def compute_distances(df, phenotype1, phenotype2):
    df1 = df[df['celltype'] == phenotype1][['Location_Center_X', 'Location_Center_Y']]
    df2 = df[df['celltype'] == phenotype2][['Location_Center_X', 'Location_Center_Y']]
    # print(df['Patient_ID'],phenotype1, phenotype2)
    # Check if there are enough samples for computing neighbors
    if len(df1) <= 1 or len(df2) <= 1:
        return np.array([])  # Return an empty array if either phenotype has no samples

    if phenotype1 != phenotype2:
        nbrs = NearestNeighbors(n_neighbors=1).fit(df2)
    else:
        nbrs = NearestNeighbors(n_neighbors=2).fit(df2)
    print(nbrs)
    distances, indices = nbrs.kneighbors(df1)
    if phenotype1 == phenotype2:
        distances = distances[:, 1]  # Use the second nearest neighbor

    return distances.flatten()


In [82]:

# Compute 1-NN distances for all combinations of samples and phenotypes
results = []
for sample_id in dumdat['Patient_ID'].unique():
    sample_data = dumdat[dumdat['Patient_ID'] == sample_id]
    for p1 in phenotypes:
        for p2 in phenotypes:
            # print(sample_data)
            dist = compute_distances(sample_data, p1, p2)
            temp_df = pd.DataFrame({'Patient_ID': sample_id, 'phenotype_from': p1, 'phenotype_to': p2, 'Distance': dist})
            results.append(temp_df)

# Combine all results into a single DataFrame
distances_df = pd.concat(results)
distances_df
# Adjust distances
# Check min distance
# distances_df['Distance'] = (distances_df['Distance'] + 1) 

NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=2)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestNeighbors(n_neighbors=1)
NearestN

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,Distance
0,LUAD_D001,Cancer,Cancer,14.142136
1,LUAD_D001,Cancer,Cancer,9.899495
2,LUAD_D001,Cancer,Cancer,9.219544
3,LUAD_D001,Cancer,Cancer,5.099020
4,LUAD_D001,Cancer,Cancer,7.280110
...,...,...,...,...
17,LUAD_D416,Neutrophils,Neutrophils,9.219544
18,LUAD_D416,Neutrophils,Neutrophils,9.219544
19,LUAD_D416,Neutrophils,Neutrophils,8.062258
20,LUAD_D416,Neutrophils,Neutrophils,116.417353


In [83]:
distances_df['Distance'].min()

0.0

In [84]:
# Define sliding window bins
window_size = 5
bins = np.arange(0, 301)
sliding_bins = np.array([bins[i:i + window_size] for i in range(len(bins) - window_size + 1)])

In [85]:

# Function to count rows within distance bins
def count_rows(x):
    counts = []
    for b in sliding_bins:
        count = np.sum((x >= b.min()) & (x <= b.max()))
        counts.append(count)
    return counts

In [86]:
# Group distances into bins and calculate counts
bin_counts = []
for name, group in distances_df.groupby(['Patient_ID', 'phenotype_from', 'phenotype_to']):
    distances = group['Distance']
    counts = count_rows(distances)
    bin_counts.append(pd.DataFrame({'Patient_ID': name[0], 'phenotype_from': name[1], 'phenotype_to': name[2], 'count': counts, 'bin': np.arange(len(counts))}))

# Combine bin counts into a single DataFrame
bin_counts_df = pd.concat(bin_counts)
bin_counts_df

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin
0,LUAD_D001,B cell,B cell,0,0
1,LUAD_D001,B cell,B cell,0,1
2,LUAD_D001,B cell,B cell,0,2
3,LUAD_D001,B cell,B cell,2,3
4,LUAD_D001,B cell,B cell,8,4
...,...,...,...,...,...
292,LUAD_D416,Th,Th,0,292
293,LUAD_D416,Th,Th,0,293
294,LUAD_D416,Th,Th,0,294
295,LUAD_D416,Th,Th,0,295


In [87]:
# Calculate the mean of each bin
bin_means = sliding_bins.mean(axis=1)
bin_counts_df['WinMean'] = bin_counts_df['bin'].map(dict(enumerate(bin_means)))
bin_counts_df

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin,WinMean
0,LUAD_D001,B cell,B cell,0,0,2.0
1,LUAD_D001,B cell,B cell,0,1,3.0
2,LUAD_D001,B cell,B cell,0,2,4.0
3,LUAD_D001,B cell,B cell,2,3,5.0
4,LUAD_D001,B cell,B cell,8,4,6.0
...,...,...,...,...,...,...
292,LUAD_D416,Th,Th,0,292,294.0
293,LUAD_D416,Th,Th,0,293,295.0
294,LUAD_D416,Th,Th,0,294,296.0
295,LUAD_D416,Th,Th,0,295,297.0


In [88]:
group

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,Distance
0,LUAD_D416,Th,Th,20.615528
1,LUAD_D416,Th,Th,5.830952
2,LUAD_D416,Th,Th,22.627417
3,LUAD_D416,Th,Th,38.209946
4,LUAD_D416,Th,Th,8.246211
...,...,...,...,...
462,LUAD_D416,Th,Th,7.071068
463,LUAD_D416,Th,Th,3.605551
464,LUAD_D416,Th,Th,3.605551
465,LUAD_D416,Th,Th,5.830952


In [89]:
# Function to compute area under the curve (AUC)
def compute_auc(x, y):
    order = np.argsort(x)
    auc = np.trapz(y[order], x[order])
    return auc

# Scale counts by AUC
scaled_counts = []
for name, group in bin_counts_df.groupby(['Patient_ID', 'phenotype_from', 'phenotype_to']):
    auc = compute_auc(group['WinMean'], group['count'])
    group['count_scaled'] = group['count'] / auc
    scaled_counts.append(group)

# Combine scaled counts into a single DataFrame
scaled_counts_df = pd.concat(scaled_counts)
scaled_counts_df

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin,WinMean,count_scaled
0,LUAD_D001,B cell,B cell,0,0,2.0,0.000000
1,LUAD_D001,B cell,B cell,0,1,3.0,0.000000
2,LUAD_D001,B cell,B cell,0,2,4.0,0.000000
3,LUAD_D001,B cell,B cell,2,3,5.0,0.015385
4,LUAD_D001,B cell,B cell,8,4,6.0,0.061538
...,...,...,...,...,...,...,...
292,LUAD_D416,Th,Th,0,292,294.0,0.000000
293,LUAD_D416,Th,Th,0,293,295.0,0.000000
294,LUAD_D416,Th,Th,0,294,296.0,0.000000
295,LUAD_D416,Th,Th,0,295,297.0,0.000000


In [90]:
scaled_counts_df.dropna()

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin,WinMean,count_scaled
0,LUAD_D001,B cell,B cell,0,0,2.0,0.000000
1,LUAD_D001,B cell,B cell,0,1,3.0,0.000000
2,LUAD_D001,B cell,B cell,0,2,4.0,0.000000
3,LUAD_D001,B cell,B cell,2,3,5.0,0.015385
4,LUAD_D001,B cell,B cell,8,4,6.0,0.061538
...,...,...,...,...,...,...,...
292,LUAD_D416,Th,Th,0,292,294.0,0.000000
293,LUAD_D416,Th,Th,0,293,295.0,0.000000
294,LUAD_D416,Th,Th,0,294,296.0,0.000000
295,LUAD_D416,Th,Th,0,295,297.0,0.000000


In [91]:
scaled_counts_df

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin,WinMean,count_scaled
0,LUAD_D001,B cell,B cell,0,0,2.0,0.000000
1,LUAD_D001,B cell,B cell,0,1,3.0,0.000000
2,LUAD_D001,B cell,B cell,0,2,4.0,0.000000
3,LUAD_D001,B cell,B cell,2,3,5.0,0.015385
4,LUAD_D001,B cell,B cell,8,4,6.0,0.061538
...,...,...,...,...,...,...,...
292,LUAD_D416,Th,Th,0,292,294.0,0.000000
293,LUAD_D416,Th,Th,0,293,295.0,0.000000
294,LUAD_D416,Th,Th,0,294,296.0,0.000000
295,LUAD_D416,Th,Th,0,295,297.0,0.000000


In [92]:
scaled_counts_df[scaled_counts_df.isna().any(axis=1)]

Unnamed: 0,Patient_ID,phenotype_from,phenotype_to,count,bin,WinMean,count_scaled
0,LUAD_D019,Neutrophils,Neutrophils,0,0,2.0,
1,LUAD_D019,Neutrophils,Neutrophils,0,1,3.0,
2,LUAD_D019,Neutrophils,Neutrophils,0,2,4.0,
3,LUAD_D019,Neutrophils,Neutrophils,0,3,5.0,
4,LUAD_D019,Neutrophils,Neutrophils,0,4,6.0,
...,...,...,...,...,...,...,...
292,LUAD_D411,Tc,B cell,0,292,294.0,
293,LUAD_D411,Tc,B cell,0,293,295.0,
294,LUAD_D411,Tc,B cell,0,294,296.0,
295,LUAD_D411,Tc,B cell,0,295,297.0,


In [93]:
# Save the output
scaled_counts_df['phenotype_combo'] = scaled_counts_df['phenotype_from'] + '_to_' + scaled_counts_df['phenotype_to']
output_file = "./results/step1_1nn_output.csv"

scaled_counts_df.dropna().to_csv(output_file, sep=',', index=False)
