In [None]:
#DATA PROCESSING

#Importing modules
import pandas as pd

#Reading in raw TSV data
#low_memory = False due to mixed data types in columns 9, 11, 12, 13, 15
df = pd.read_csv("/home/leelab/data/TCGA/kipan/kipan_clin_meth_20221210.tsv.gz", sep = "\t", low_memory = False)

#Organizing dataframe
#Keeping the columns at positions 0, 1, 2, 5, 8 plus every position from 17 up to 183217
#NOTE(review): the upper bound 183218 is hard-coded -- presumably the column count of this specific file; confirm before pointing the cell at another file
df1 = df.iloc[:,[0,1,2,5,8] + list(range(17, 183218))] #Choosing specific columns
df2 = df1.rename(columns={"Unnamed: 0":"sampleID"}) #Renaming first column

#Re-ordering the stage column to the front
col = df2.pop("stage")
df2.insert(1, col.name, col)

#Removing rows in which "stage" is NaN (re-binding the name rather than mutating in place)
#NOTE(review): this runs before normal samples are relabelled below, so a normal sample whose stage is NaN is dropped here -- confirm that is intended
df2 = df2.dropna(subset = ["stage"])

#Transposing the dataframe and using the sampleID values as the column names
df_flipped = df2.set_index("sampleID").T

#Changing the stage value to "normal" for all normal sampleID's
#One boolean mask over the "rcc" row replaces a Python loop that read and wrote one cell per column
is_normal = (df_flipped.loc["rcc"] == "normal").to_numpy()
df_flipped.loc["stage", is_normal] = "normal"

#Extracting columns with each "stage" value
stage_row = df_flipped.loc["stage"]
normal = df_flipped.columns[stage_row == "normal"]
stage_i = df_flipped.columns[stage_row == "stage i"]
stage_ii = df_flipped.columns[stage_row == "stage ii"]
stage_iii = df_flipped.columns[stage_row == "stage iii"]
stage_iv = df_flipped.columns[stage_row == "stage iv"]

#Reordering the columns: normal first, then stage i through stage iv
#Columns whose stage is none of these five values are not carried over
cols_ordered = list(normal) + list(stage_i) + list(stage_ii) + list(stage_iii) + list(stage_iv)
df_flipped1 = df_flipped[cols_ordered]

#Taking an explicit copy so that later modifications act on an independent dataframe
df_flipped2 = df_flipped1.copy()

In [None]:
#CLUSTERING

#Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz

#Reading in data
df = pd.read_csv("/home/leelab/jbregman/rcc_fcm/processing/processed.csv", index_col = 0, low_memory = False)

#Separating the "mean" columns (rows 4..150067 and columns 839..843, selected by position)
#NOTE(review): these offsets and the N counts in the labels below are hard-coded -- presumably they match the current processed.csv; confirm whenever that file is regenerated
df1 = df.iloc[list(range(4,150068)),[839,840,841,842,843]]
df1.columns = ["normal (N = 205)", "stage i (N = 344)", "stage ii (N = 74)", "stage iii (N = 137)", "stage iv (N = 79)"]

#Altering the mean values for each stage by making them relative to the normal value
#Subtracting the normal column from every column at once (the normal column itself becomes 0)
df2 = df1.sub(df1["normal (N = 205)"], axis = 0)

#Standardizing the data
#The row-wise standard deviation is taken from the original means (df1) and computed once,
#then every column of df2 is divided by it
row_std = df1.std(axis = 1)
df3 = df2.div(row_std, axis = 0)

#FUZZY C-MEANS (FCM) CLUSTERING
c = 9 #number of clusters
m = 1.1 #fuzziness parameter

#Seeding NumPy's global random generator before the call
np.random.seed(19)

#cmeans receives the transposed frame; the seven returned values are unpacked below
fcm_result = fuzz.cluster.cmeans(df3.T, c, m, error = 0.005, maxiter = 5000)
cntr, u, u0, d, jm, p, fpc = fcm_result

#Creating dataframe of the cluster probabilities for each cgID
#(u is transposed so that its rows line up with the rows of df3)
df_fuzz = pd.DataFrame(np.transpose(u), index = df3.index)

#Helper meant for DataFrame.apply(axis=1): reports which column holds a row's largest value
def find_max(row):
    """Return the index label at which `row` reaches its maximum (delegates to `row.idxmax()`)."""
    return row.idxmax()

#Recording, for every row, the column label (cluster number) holding the highest probability
#DataFrame.idxmax(axis=1) does the row-wise lookup in one vectorized call instead of a Python-level apply
df_fuzz['Membership'] = df_fuzz.idxmax(axis=1)

#Concatenating the dataframes together
df_merged = pd.concat([df3, df_fuzz], axis = 1)

#Sorting the rows so that membership is in increasing order
#(the labels are already the integers 0..c-1, so mapping each label to itself through a sort key is unnecessary)
df_sorted = df_merged.sort_values(by='Membership')

#NOTE(review): this writes under ".../clustering/STAGE/csv/" while the graphing cell reads from ".../clustering/stage/csv/" --
#on a case-sensitive filesystem those are different directories; confirm which one is intended
df_sorted.to_csv(f"/home/leelab/jbregman/rcc_fcm/clustering/STAGE/csv/NEW_c=s{c}.csv", index = True)

In [None]:
#GRAPHING

#Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import skfuzzy as fuzz
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

#Defining number of clusters
num_clusters = 8

#Reading in data
df = pd.read_csv(f"/home/leelab/jbregman/rcc_fcm/clustering/stage/csv/NEW_c=s{num_clusters}.csv", index_col = 0, low_memory = False) #Reading in CSV file

#Extracting probabilities: the num_clusters columns starting at position 5
probs = df.iloc[:, list(range(5, 5+num_clusters))]

#Highest probability per row (named so that Python's built-in max() is not shadowed)
max_probability = probs.max(axis=1)
df["max probability"] = max_probability #Adding a max probability column

#Removing columns with probability values: positions 0-4 plus the two columns that follow the probability columns are kept
df_sorted = df.iloc[:,[0,1,2,3,4,num_clusters + 5,num_clusters + 6]]
df_sorted1 = df_sorted[df_sorted["max probability"] >= 0.99] #Creating a probability cutoff

#DF used for creating charts (replacing cgID index with numbers)
df_charts = df_sorted1.reset_index(drop = True)

#Function to print the necessary dataframes and chart titles for the matrix
def create_dfs(num_clusters):
    """Print the slicing code and the chart title for each cluster of `df_charts`.

    Reads the module-level `df_charts` and looks at its "Membership" column.
    Each cluster's range runs from the first row carrying its label to the first
    row of the next cluster, so the printed ranges are only meaningful when the
    rows are grouped by cluster in increasing label order.

    A cluster with no rows gets an empty range (start == stop) and a size of 0
    rather than raising ValueError.

    Parameters:
        num_clusters: number of clusters; labels 0 .. num_clusters - 1 are reported.

    Returns:
        None. Everything is written to standard output.
    """
    membership = df_charts["Membership"].tolist()

    #Single pass: remember the first row at which each label shows up
    first_row = {}
    for position, label in enumerate(membership):
        first_row.setdefault(label, position)

    #Building the boundaries from the back so that a missing cluster inherits the start of the next one
    boundaries = [len(membership)]
    for cluster in reversed(range(num_clusters)):
        boundaries.insert(0, first_row.get(cluster, boundaries[0]))

    #Number of rows in each cluster = distance between consecutive boundaries
    length_list = [stop - start for start, stop in zip(boundaries, boundaries[1:])]

    #Using the boundaries & length list to print the code needed for the matrix
    for i in range(num_clusters):
        print(f"df_c{i} = df_charts.iloc[list(range({boundaries[i]},{boundaries[i+1]})),[0,1,2,3,4,6]]")
        print(f"df_c{i} = df_c{i}.reset_index(drop = True)")

    for i in range(num_clusters):
        print(f"'Cluster {i + 1}/{num_clusters} (m = {length_list[i]})',")

#Printing the slicing code and titles (informational only: the frames and titles below are built directly from the data)
create_dfs(num_clusters)

#Creating a list with one dataframe per cluster, selected through the "Membership" column
#rather than through row ranges typed in by hand
#Positions 0-4 and 6 are kept, i.e. everything except the "Membership" column at position 5
data_frames = [
    df_charts.loc[df_charts["Membership"] == cluster].iloc[:, [0,1,2,3,4,6]].reset_index(drop = True)
    for cluster in range(num_clusters)
]

#Keeping the individual df_c0, df_c1, ... names defined for any code that refers to them
globals().update({f"df_c{cluster}": frame for cluster, frame in enumerate(data_frames)})

#Customizing a title for each dataframe (m = number of rows in that cluster)
titles = [
    f"Cluster {cluster + 1}/{num_clusters} (m = {len(frame)})"
    for cluster, frame in enumerate(data_frames)
]
         
#Creating the matrix (rows,columns) & (width, height)
#Three charts per row; the number of rows grows with the number of clusters so none is silently left out
#(3 rows keeps the original 30 x 25 figure size)
n_cols = 3
n_rows = -(-len(data_frames) // n_cols) or 1 #ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize = (30, 25 * n_rows / 3), squeeze = False)
fig.suptitle(f"Stage Cluster (C={len(data_frames)}) Line Charts (p >= 0.99)", fontsize = 35) #Adding a title for the matrix

#Color map and its range are the same for every chart, so they are created once
#(this also keeps them defined for the color bar when there is nothing to plot)
cmap = plt.get_cmap('plasma')  #Adding a color map
normalize = Normalize(vmin = 0.99, vmax = 1) #Setting the range of the color map

#Looping through each "spot" in the matrix and each dataframe to create the figure
#The loop variable is deliberately not called df, so the dataframe loaded earlier is left untouched
charts_drawn = 0
for cluster_df, ax, title in zip(data_frames, axes.flat, titles):
    colors = [cmap(normalize(value)) for value in cluster_df['max probability']] #Using the probability values for the color map

    #Adding one line per row, using the first 5 columns as x positions and the row's first 5 values as y
    #Iterating over a NumPy block avoids building a pandas Series for every row
    stage_columns = cluster_df.columns[:5]
    stage_values = cluster_df.iloc[:, :5].to_numpy()
    for row_number, values in enumerate(stage_values):
        ax.plot(stage_columns, values, label = f'Row {row_number}', color = colors[row_number], alpha = 0.25)

    #Adding individual chart details
    ax.set_xlabel('Stage', fontsize=20)
    ax.set_ylabel('Mean Methylation', fontsize=20)
    ax.set_title(title, fontsize=25)
    ax.tick_params(axis = 'x', rotation=90, labelsize=15)
    ax.set_ylim(-3,3)
    charts_drawn += 1

#Hiding the spots of the matrix that did not receive a chart
for unused_ax in axes.ravel()[charts_drawn:]:
    unused_ax.set_visible(False)

#Creates one color bar for entire matrix
sm = ScalarMappable(cmap=cmap, norm=normalize)
sm.set_array([])
cbar = plt.colorbar(sm, label = 'Cluster Probability', cax = fig.add_axes([0.95, 0.15, 0.02, 0.7]))

plt.subplots_adjust(top = 0.9, wspace = 0.4, hspace = 0.7)
plt.show()