In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
import seaborn as sns
import matplotlib as mpl
from matplotlib.lines import Line2D
import yaml
from scipy import stats

import sys
sys.path.append('../resources/')
from ImagingUtilities import *

import warnings
warnings.filterwarnings('ignore')

from scipy.optimize import curve_fit
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_log_error, r2_score

# Apply the shared matplotlib settings stored in the project YAML file and
# echo every entry so the active configuration is visible in the cell output.
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k, rc_value in rcParamsDict["rcParams"].items():
        print(f"{k} {rc_value}")
        plt.rcParams[k] = rc_value
    # Every other top-level key is only printed, not applied.
    for k1 in set(rcParamsDict) - {"rcParams"}:
        print(f"{k1} {rcParamsDict[k1]}")

In [None]:
# Fixed colour (hex) assigned to each cell line; reused by every plot below.
line_palette = dict([
    ('CTL01A', '#DBB807'),
    ('CTL08A', '#0FB248'),
    ('CTL04E', '#FF0054'),
    ('CTL02A', '#7B00FF'),
    ('H9', '#72190E'),
    ('H1', '#994F88'),
    ('CTL05A', '#1965B0'),
    ('CTL07C', '#437DBF'),
    ('CTL06F', '#CAE0AB'),
    ('CTL09A', '#FFFF00'),
    ('KTD8.2', '#E65518'),
    ('UCSFi001-A', '#7BAFDE'),
])

# Load data

In [None]:
# Load the main quantification table and the table with the additional time points.
total_df = pd.read_csv('../../iPSC_imaging/quantifications/quantification.csv', index_col=0)
add_tp = pd.read_csv('../../iPSC_imaging/quantifications/quantification_addTP.csv', index_col=0)

# Drop exact duplicate rows of the main table. The previous statement built the
# de-duplicated frame but never assigned it, so the duplicates were silently kept.
total_df = total_df[~total_df.duplicated()]
total_df = pd.concat([total_df, add_tp])

# Start from an identity mapping of the line names found in the data, then
# rename the two lines that are stored under an alias.
donor_map_names = {i:j for i, j in zip(total_df['line'], total_df['line'])}
donor_map_names['CHD2WT'] = 'UCSFi001-A'
donor_map_names['CHD8WT'] = 'H9'
total_df['line'] = total_df['line'].map(donor_map_names)
total_df.shape

In [None]:
# Store the pixel size as a constant column (same value on every row).
# NOTE(review): units are not stated here - presumably micrometres per pixel; confirm.
total_df['pixel_size'] = 1.38

In [None]:
# Derive the two area columns used by every plot below from `total_area`.
# NOTE(review): the factor 1.38 equals the value stored in `pixel_size`; if
# `total_area` is a pixel count, an area conversion would need the squared
# pixel size - confirm the units of `total_area`.
total_df['Area (microm2)'] = total_df.total_area * 1.38
# NOTE(review): square micrometres to square millimetres is a factor of 1e6,
# while this divides by 1000 - confirm the intended unit of this column.
total_df['Area (mm2)'] = total_df['Area (microm2)'] / 1000

In [None]:
# Display the distinct line names present after the renaming step.
total_df['line'].unique()

In [None]:
# Sorted array of the distinct acquisition time points.
all_tp = np.sort(total_df['time_point'].unique())
all_tp

Here I'm adding a few quantifications:

1. `norm_factor`: the normalization factor, i.e. the mean percentage area across all lines at each time point post split (called `split_time`)
2. `perc_area_norm`: the normalized percentage area, i.e. the percentage area divided by the normalization factor (1.)
3. `mean_area_tp`: the mean area of each line at each time point post split (called `split_time`)
4. `area_error`: the percentage "error" of the total area computed with respect to the mean of that line at that time point
5. `std`: the standard deviation of each area with respect to (3.)
6. `cv`: the [coefficient of variation](https://en.wikipedia.org/wiki/Coefficient_of_variation), corresponding to the ratio between the standard deviation and the mean

In [None]:
# Key identifying each (line, time post split) group.
total_df['line_split'] = total_df['line'].astype('str') + '_' + total_df['split_time'].astype('str')

# Mean of every numeric column per `split_time`. `numeric_only` is now passed
# by keyword: the former positional call `.mean('perc_area')` did not select a
# column, it only set `numeric_only` to a truthy string.
mean_df_time_point = total_df.groupby(['split_time']).mean(numeric_only=True)
mean_df_time_point_dict = {i:j for i, j in zip(mean_df_time_point.index, mean_df_time_point.perc_area)}

# Mean area per (line, split_time), turned into a `line_split` -> mean lookup.
area_df_time_point = total_df.groupby(['line','split_time']).mean(numeric_only=True).reset_index()
area_df_time_point['line_split'] = area_df_time_point['line'].astype('str') + '_' + area_df_time_point['split_time'].astype('str')
area_df_time_point = {i:j for i, j in zip(area_df_time_point.line_split, area_df_time_point['Area (microm2)'])}

# Percentage area normalised by the all-lines mean at the same split_time.
total_df['norm_factor'] = total_df.split_time.map(mean_df_time_point_dict)
total_df['perc_area_norm'] = total_df['perc_area'] / total_df['norm_factor']

# Deviation of each measurement from the mean of its (line, split_time) group.
total_df['mean_area_tp'] = total_df.line_split.map(area_df_time_point)
# NOTE(review): the denominator is the measurement itself, not the group mean,
# so a zero area yields an infinite value - confirm this is the intended "error".
total_df['area_error'] = (total_df['mean_area_tp'] - total_df['Area (microm2)']) / total_df['Area (microm2)']
# NOTE(review): this is a per-row value, |x - group mean| / sqrt(total number of
# rows), not a per-group standard deviation - confirm before interpreting `cv`.
total_df['std'] = np.sqrt((total_df['Area (microm2)'] - total_df['mean_area_tp'])**2 / len(total_df))
total_df['cv'] = total_df['std'] / total_df['mean_area_tp']

In [None]:
# Key identifying each (line, passage number) combination, e.g. "<line>_<n_split>".
total_df['line_n_split'] = total_df['line'] + '_' + total_df['n_split'].astype('str')

In [None]:
# Kernel density estimate of the `cv` column.
sns.kdeplot(total_df['cv'])

In [None]:
# Kernel density estimate of `area_error`, with a vertical marker at 15.
ax = sns.kdeplot(total_df['area_error'])
ax.axvline(15)

In [None]:
# Same density restricted to the x range [-1, 30], with a vertical marker at 10.
ax = sns.kdeplot(total_df['area_error'])
ax.set_xlim(-1, 30)
ax.axvline(10)

In [None]:
# Percentage area against time post split, one colour per line.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(
    data=total_df,
    x='split_time',
    y='perc_area',
    hue='line',
    palette=line_palette,
    ax=ax,
)

In [None]:
# Remove rows that combine an early time point (split_time < 25) with a
# percentage area above 10. This rebinds `total_df`, so re-running earlier
# cells is needed to recover the removed rows.
total_df = total_df[~((total_df['split_time'] < 25) & (total_df['perc_area'] > 10))]

In [None]:
# Density of log10(area_error). `area_error` is negative whenever a measurement
# exceeds its group mean, and log10 of a negative number is NaN, so only the
# positive errors contribute to this plot.
ax = sns.kdeplot(np.log10(total_df['area_error']))

In [None]:
# Keep only rows whose `area_error` is below 5; rows where the comparison is
# False (including NaN and infinite errors) are removed. Rebinds `total_df`.
total_df = total_df[total_df['area_error'] < 5]
total_df.shape

In [None]:
# Index label of the row holding the highest `n_split` value within each line.
idx_max = total_df.groupby('line')['n_split'].idxmax()

# Filter the DataFrame using these indices
# NOTE(review): `drop` removes rows by index label; the frame is a concat of two
# CSVs without re-indexing, so if labels repeat, every row sharing a label is
# removed - confirm the index is unique.
filtered_df = total_df.drop(idx_max)
filtered_df

In [None]:
# log10 of the area; the small offset keeps log10 finite when the area is 0.
total_df['logArea'] = np.log10(total_df['Area (microm2)'] + 0.000001)

# Growth curves total area - by line
Here we fitted a polynomial regression function of order 3 (exploratory to look at what type of shapes we expect from the curves):

In [None]:
# Alphabetically sorted line names, used as the facet order in the grids below.
order = sorted(total_df.line.unique().tolist())

In [None]:
sns.set_theme(style="ticks")

# One facet per cell line, coloured with the line palette.
facet_options = dict(
    col="line",
    hue='line',
    palette=line_palette,
    col_wrap=4,
    height=5,
    col_order=order,
)
grid = sns.FacetGrid(total_df.sort_values(by='split_time'), **facet_options)

# Third-order polynomial regression of the area over time post split.
grid.map(sns.regplot, "split_time", "Area (mm2)", order=3)

grid.set_axis_labels("Time point post split", "Area (mm2)")

# Tighten the layout of the panels.
grid.fig.tight_layout(w_pad=1)

Without fitting any regression (the line goes through the mean at each time point, and the shaded band around it is seaborn's default error band, i.e. a 95% confidence interval of the mean):

In [None]:
sns.set_theme(style="ticks")

# One facet per cell line, coloured with the line palette.
grid = sns.FacetGrid(total_df.sort_values(by = 'split_time'), col="line", hue = 'line', palette=line_palette,
                     col_wrap=4, height=5, col_order = order)

# Mean area per time point (no regression fitted).
grid.map(sns.lineplot, "split_time", "Area (mm2)", markers = True)

# The plotted column is `Area (mm2)`; the previous label read
# "Total Area (pixels)", which did not match the data shown.
grid.set_axis_labels("Time point post split", "Area (mm2)")

# Tighten the layout of the panels.
grid.fig.tight_layout(w_pad=1)

We use the __area__ and average all the FOVs at each time point for each line. The plot is composed of:
* a solid blue line: the smoothed growth curve (Gaussian smoothing with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) from SciPy),
* a red band: the smoothed mean plus/minus half of the standard deviation,
* a dashed grey line: the original (unsmoothed) mean signal,
* points: the individual FOV measurements.

In [None]:
# One panel per line: raw FOV measurements, mean per time point, smoothed mean
# and a band of +/- half the standard deviation.
fig, ax = plt.subplots(4, 3, figsize=(30, 21), gridspec_kw={'hspace': 0.7})
ax = ax.flatten()

for ax_index, line in enumerate(total_df.line.unique()):
    sub = total_df[(total_df.line == line) & (total_df.n_split != 'day')].sort_values(by='datetime')
    area_by_time = sub.groupby('split_time')['Area (mm2)']
    mean_st = area_by_time.mean()
    std_st = area_by_time.std()
    y_pos = mean_st.index
    # Raw points must use the same column as the mean curve: the y-limits are
    # derived from the `Area (mm2)` mean, so plotting `Area (microm2)` (1000x
    # larger in this notebook) pushed every point outside the visible range.
    ydata = sub['Area (mm2)'].values
    xdata = sub.split_time.values.astype('int')
    farray = mean_st.values

    # Smoothing
    farray_smooth = gaussian_filter1d(farray, sigma=3)

    # Error band: smoothed mean +/- half of the standard deviation
    upper_err = gaussian_filter1d(farray + (std_st / 2).values, sigma=3)
    lower_err = gaussian_filter1d(farray - (std_st / 2).values, sigma=3)

    ax[ax_index].scatter(xdata, ydata)
    ax[ax_index].plot(y_pos, farray, '--', linewidth=0.7, color='k', alpha=0.45)
    ax[ax_index].plot(y_pos, farray_smooth, color='#2374AB')
    ax[ax_index].fill_between(y_pos, upper_err, lower_err, color='crimson', alpha=0.2)

    # Leave 25% headroom above the highest mean value
    ax[ax_index].set_ylim(0, np.max(farray) * 1.25)

    # Rotate x-axis labels for better readability
    ax[ax_index].tick_params(axis='x', rotation=90)

    # Set axis labels and title
    ax[ax_index].set_title(f'Line {line}')
    ax[ax_index].set_ylabel('Total area')
    ax[ax_index].set_xlabel('Time point after split')

plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()

# Growth curves total area - by line and split
Here we fitted a polynomial regression function of order 3:

In [None]:
sns.set_theme(style="ticks")

# Colour per passage number; every "<line>_<n_split>" key inherits the colour
# of its passage (the text after the last underscore).
split_palette = {'1': '#264653', '2': '#2a9d8f', '3': '#8ab17d', '4': '#e9c46a', '5': '#f4a261', '6': '#e76f51'}
line_split_palette = {
    key: split_palette[key.split('_')[-1]]
    for key in total_df.line_n_split.unique()
}

# Legend proxies: one line handle per passage colour.
custom_handles = [Line2D([0], [0], color=passage_color, lw=2) for passage_color in split_palette.values()]

grid = sns.FacetGrid(
    total_df.sort_values(by='split_time'),
    col="line",
    hue='line_n_split',
    palette=line_split_palette,
    col_wrap=4,
    height=5,
    col_order=order,
)
grid.map(sns.lineplot, "split_time", "Area (mm2)", markers=True)

# Replace the automatic legend with one keyed by passage number.
grid.add_legend()
if grid._legend:
    grid._legend.remove()

legend = grid.fig.legend(custom_handles, split_palette.keys(), ncol=2, frameon=False, bbox_to_anchor=(1.2, 1), fontsize=25)
legend.set_title('Passage number', prop={'size': 30})

# Enlarge the panel titles and the tick labels of both axes.
for ax in grid.axes.flat:
    ax.set_title(ax.get_title(), fontsize=35)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)

grid.set_axis_labels("Hours after splitting", "Total area (mm2)", fontsize=25)
grid.fig.tight_layout(w_pad=1)
grid.fig.savefig('./figures/raw_GC_iPSC_dividedSplit.svg', dpi=300, bbox_inches='tight')

I need to filter out the "line" - "number of split" combinations that do not have enough data points to fit an order 3 polynomial regression:

In [None]:
sns.set_theme(style="ticks")

# Keep only the line/passage combinations with more than 5 measurements.
boolean_sel = pd.Series(total_df.groupby(['line_n_split'])['split_time'].count() > 5)
boolean_sel = boolean_sel[boolean_sel]
filtered_total = total_df[total_df.line_n_split.isin(boolean_sel.index)]

# The facets are keyed by `line_n_split`, so the column order must list those
# keys. Passing the line-only `order` list matched no facet value and left
# every panel empty.
line_n_split_order = sorted(filtered_total.line_n_split.unique().tolist())

# One facet per line/passage combination, coloured by line.
grid = sns.FacetGrid(filtered_total.sort_values(by = 'split_time'), col="line_n_split", hue = 'line', palette=line_palette,
                     col_wrap=4, height=5, col_order = line_n_split_order)

# Third-order polynomial regression of the area over time post split.
grid.map(sns.regplot, "split_time", "Area (microm2)", order = 3)

# Adjust the arrangement of the plots
grid.fig.tight_layout(w_pad=1)

We use the __area__ and average all the FOVs at each time point for each line at each split. The plot is composed of:
* a solid blue line: the smoothed growth curve (Gaussian smoothing with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) from SciPy),
* a red band: the smoothed mean plus/minus half of the standard deviation,
* a dashed grey line: the original (unsmoothed) mean signal,
* points: the individual FOV measurements.

In [None]:
# One panel per (line, passage) with more than 5 distinct time points: raw
# measurements, mean per time point, smoothed mean and a +/- std/2 band.
# The per-panel summaries are collected in `summary_dfs_dict`.
# The grid has 9 x 5 = 45 panels; indexing fails if more curves are drawn.
fig, ax = plt.subplots(9,5, figsize = (5*10, 7*9), gridspec_kw={'hspace': 0.7})
ax = ax.flatten().T
ax_index = 0
summary_dfs_dict = {}

for l in order:

    sub = total_df[total_df.line == l]
    sub = sub.sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    splits = sub.n_split.unique().tolist()
    splits.sort()

    for split in splits:

        subsub = sub[sub.n_split == split]

        if len(subsub.split_time.unique()) > 5:

            ydata = subsub['Area (microm2)'].values
            xdata = subsub.split_time.values.astype('int')


            # user defined function (imported from ImagingUtilities);
            # called here with final_output='mean'. The returned frame is read
            # below for its `mean`, `stds` and `split_time` columns.
            summary_df = preprocess(subsub, original_v='Area (microm2)', final_output='mean')

            summary_dfs_dict[f'{l}_split_{split}'] = summary_df

            farray = np.array(summary_df['mean'])

            y_pos = summary_df.split_time.values

            # Smoothing
            farray_smooth = gaussian_filter1d(farray, sigma=3)

            # Error band: smoothed mean +/- half of the `stds` column
            upper_err = gaussian_filter1d(farray + (summary_df['stds'] / 2), sigma=3)
            lower_err = gaussian_filter1d(farray - (summary_df['stds'] / 2), sigma=3)

            ax[ax_index].scatter(xdata, ydata)
            ax[ax_index].plot(y_pos, farray, '--', linewidth=0.7, color='k', alpha=0.45)
            ax[ax_index].plot(y_pos, farray_smooth, color = '#2374AB')
            ax[ax_index].fill_between(y_pos, upper_err, lower_err, color='crimson', alpha=0.2)
            #ax[ax_index].errorbar(y_pos, farray, yerr=summary_df['stds'], fmt='none', color='crimson', alpha=0.5)
            # Upper y-limit: the maximum mean plus 25%
            ax[ax_index].set_ylim(0, np.max(farray)+(np.max((farray)*25)/100))
            ax[ax_index].xaxis.set_tick_params(rotation=90)
            ax[ax_index].set_title(f'{l}_split_{split}')
            ax[ax_index].set_ylabel('Total area')
            ax[ax_index].set_xlabel('Time point')
            ax_index += 1

        else:
            # Too few distinct time points for this passage: report and skip.
            print(f'Skipped split {subsub.n_split.values[0]} of line {subsub.line.values[0]}')

plt.show()

# Growth curves log total area

We use the __logarithm of the area__ and sum all the FOVs at each time point for each line at each split. The plot is composed of:
* a solid blue line: the smoothed growth curve (Gaussian smoothing with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) from SciPy),
* a red band: the smoothed signal plus/minus half of the standard deviation,
* a dashed grey line: the original (unsmoothed) signal.

In [None]:
# One panel per (line, passage): summed log-area per time point, its smoothed
# version (sigma=1) and a +/- std/2 band. No minimum number of time points is
# required here. The grid has 10 x 5 = 50 panels.
fig, ax = plt.subplots(10,5, figsize = (5*10, 7*9), gridspec_kw={'hspace': 0.7})
ax = ax.flatten().T
ax_index = 0

for l in order:

    sub = total_df[total_df.line == l]
    sub = sub.sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    splits = sub.n_split.unique().tolist()
    splits.sort()

    for split in splits:

        subsub = sub[sub.n_split == split]


        # user defined function (imported from ImagingUtilities),
        # with `area_sum` return the sum of the areas of all the field of view captured for that line at that time point.
        # The returned frame is read below for its `area_sum`, `stds` and `datetime` columns.
        summary_df = preprocess(subsub, original_v='logArea', final_output='area_sum')

        farray = np.array(summary_df['area_sum'])

        # The x axis of this figure is the acquisition datetime, not `split_time`.
        y_pos = summary_df.datetime.values

        # Smoothing
        farray_smooth = gaussian_filter1d(farray, sigma=1)

        # Error band: smoothed signal +/- half of the `stds` column
        upper_err = gaussian_filter1d(farray + (summary_df['stds'] / 2), sigma=1)
        lower_err = gaussian_filter1d(farray - (summary_df['stds'] / 2), sigma=1)

        ax[ax_index].plot(y_pos, farray, '--', linewidth=0.7, color='k', alpha=0.45)
        ax[ax_index].plot(y_pos, farray_smooth, color = '#2374AB')
        ax[ax_index].fill_between(y_pos, upper_err, lower_err, color='crimson', alpha=0.2)
        # Upper y-limit: the maximum value plus 25%
        ax[ax_index].set_ylim(0, np.max(farray)+(np.max((farray)*25)/100))
        ax[ax_index].xaxis.set_tick_params(rotation=90)
        ax[ax_index].set_title(f'{l}_split_{split}')
        ax[ax_index].set_ylabel('Log total area')
        ax[ax_index].set_xlabel('Time point')
        ax_index += 1

plt.show()

In [None]:
# Distinct line names, in order of first appearance in `total_df`.
lines = total_df.line.unique()

# Discrete derivative of the curves - per line per split

We use the total area and average all the FOVs at each time point for each line at each split. Then we smooth it with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) and compute the [diff](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.diff.html) of the smoothed curve divided by the diff of the time points, i.e. the first discrete derivative.

In [None]:
# Per (line, passage): smooth the mean area, take its finite-difference
# derivative with respect to `split_time`, store the table and plot it.
discrete_deriv_curves = {}

fig, ax = plt.subplots(10,5, figsize = (5*10, 7*9), gridspec_kw={'hspace': 0.7})
ax = ax.flatten().T
ax_index = 0

for l in total_df.line.unique():

    sub = total_df[total_df.line == l].sort_values(by='datetime')
    sub = sub[sub.n_split != 'day']

    for split in sub.n_split.unique():

        curve_key = f'{l}_split_{split}'
        subsub = sub[sub.n_split == split]

        # user defined helper (ImagingUtilities), called with final_output='mean'
        summary_df = preprocess(subsub, original_v='Area (microm2)', final_output='mean')
        summary_df = summary_df.sort_values('split_time')

        # Smoothed mean, then change in area per unit of split_time; the first
        # row has no predecessor, so its derivative is NaN.
        summary_df['smoothed'] = gaussian_filter1d(summary_df['mean'], 3)
        summary_df['derivative'] = summary_df['smoothed'].diff() / summary_df['split_time'].diff()

        discrete_deriv_curves[curve_key] = summary_df

        farray = np.array(summary_df['derivative'])
        y_pos = summary_df.split_time.values

        panel = ax[ax_index]
        panel.errorbar(y_pos, farray, marker='o')
        panel.xaxis.set_tick_params(rotation=90)
        panel.set_title(curve_key)
        panel.set_ylabel('Discrete derivative')
        panel.set_xlabel('Time point')
        ax_index += 1


plt.show()

# Discrete derivative of the curves - per line

We collected the results for each line and each split and we can use it to converge to a single result for each line using as replicates the splits.

In [None]:
# Stack the per-curve tables; the dictionary key ("<line>_split_<n>") ends up
# in the `level_0` column and is split back into its line and passage parts.
deriv_df = pd.concat(discrete_deriv_curves.values(), keys = discrete_deriv_curves.keys()).reset_index()
deriv_df['line'] = [curve_key.split('_')[0] for curve_key in deriv_df['level_0']]
deriv_df['split'] = [curve_key.split('_')[-1] for curve_key in deriv_df['level_0']]

In [None]:
# One panel per line: smoothed discrete derivative over time, one curve per passage.
fig, ax = plt.subplots(4,3, figsize = (5*4, 7*3), gridspec_kw={'hspace': 0.7})
ax = ax.flatten().T
ax_index = 0

for line in deriv_df.line.unique():

    # Work on a copy so that adding a column does not write into a slice of `deriv_df`.
    sub = deriv_df[deriv_df.line == line].copy()

    # The first point of every passage has a NaN derivative (it comes from
    # `diff()`); a NaN inside the Gaussian kernel window turns every
    # neighbouring output into NaN, so those rows are removed before smoothing.
    sub = sub.dropna(subset=['derivative'])
    sub = sub.sort_values('split_time')

    sub['smoothed'] = gaussian_filter1d(sub['derivative'], sigma = 3)

    sns.lineplot(data = sub, y = 'smoothed', x = 'split_time', hue = 'split', ax = ax[ax_index], errorbar='sd', markers = True, palette=split_palette)
    ax[ax_index].xaxis.set_tick_params(rotation=90)
    ax[ax_index].set_title(f'{line}')
    ax[ax_index].set_ylabel('Discrete derivative')
    ax[ax_index].set_xlabel('Time point')
    ax_index += 1

In [None]:
# One panel per line: smoothed discrete derivative over time, with the passages
# pooled as replicates (standard-deviation error bars).
fig, ax = plt.subplots(4,3, figsize = (5*4, 7*3)) #, gridspec_kw={'hspace': 0.7})
ax = ax.flatten().T
ax_index = 0

for line in deriv_df.line.unique():

    # Work on a copy so that adding a column does not write into a slice of `deriv_df`.
    sub = deriv_df[deriv_df.line == line].copy()

    # The first point of every passage has a NaN derivative (it comes from
    # `diff()`); a NaN inside the Gaussian kernel window turns every
    # neighbouring output into NaN, so those rows are removed before smoothing.
    sub = sub.dropna(subset=['derivative'])
    sub = sub.sort_values('split_time')

    sub['smoothed'] = gaussian_filter1d(sub['derivative'], sigma = 3)

    sns.lineplot(data = sub, y = 'smoothed', x = 'split_time', ax = ax[ax_index], errorbar='sd', markers = True, err_style = 'bars')
    ax[ax_index].xaxis.set_tick_params(rotation=90)
    ax[ax_index].set_title(f'{line}')
    ax[ax_index].set_ylabel('Discrete derivative')
    ax[ax_index].set_xlabel('Time point')
    ax_index += 1

plt.tight_layout()

# Cumulative  of the areas - per line per split

We use the total area and average all the FOVs at each time point for each line at each split. Then we smooth it with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) and compute the cumulative sum over time with the [`cumsum()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.cumsum.html) function.

In [None]:
# One panel per (line, passage) with more than 5 distinct time points:
# cumulative sum over time of the mean area per time point.
fig, ax = plt.subplots(9,5, figsize = (5*10, 7*9))
ax = ax.flatten().T
ax_index = 0

order_line_n = sorted(total_df.line_n_split.unique().tolist())

for line_n in order_line_n:

    sub = total_df[total_df.line_n_split == line_n].sort_values('split_time')

    # Skip passages with too few distinct time points.
    if len(sub.split_time.unique()) <= 5:
        continue

    y = sub.groupby('split_time')['Area (mm2)'].mean().cumsum().values
    x = sub.split_time.unique()

    panel = ax[ax_index]
    sns.lineplot(y = y, x = x, ax = panel, errorbar='sd', markers = True, err_style='bars')
    panel.xaxis.set_tick_params(rotation=90)
    panel.set_title(f'{line_n}')
    panel.set_ylabel('Cumulative of mean total area')
    panel.set_xlabel('Time point')
    ax_index += 1

plt.tight_layout()

In [None]:
# Single figure with one curve per (line, passage): cumulative sum of the
# smoothed mean area, coloured by line. The Line2D handles are kept in
# `all_lines` (keyed "<line>_<split>") for the highlight plot further down.
sns.set(style="white", palette="Paired", color_codes=True)
fig, ax = plt.subplots(figsize=(7,5))
labels = []
lc = []
handles = []
all_lines = {}

# Copy of the data without rows whose n_split equals the string '1';
# it is not used further in this cell.
total_df_no_first = total_df[total_df.n_split != '1'].copy()

for l in total_df.line.unique():

    color = line_palette[l]

    sub = total_df[total_df.line == l]
    sub = sub.sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    for split in sub.n_split.unique():

        subsub = sub[sub.n_split == split]

        # user defined function (imported from ImagingUtilities),
        # called here with final_output='mean'; the returned frame is read
        # below for its `mean` and `split_time` columns.
        summary_df = preprocess(subsub, original_v='Area (mm2)', final_output='mean')

        summary_df = summary_df.sort_values('split_time')

        # Light smoothing (sigma=1) before accumulating over time
        summary_df['smoothed'] = gaussian_filter1d(summary_df['mean'], 1)
        summary_df['cumulative'] = summary_df['smoothed'].cumsum()

        farray = np.array(summary_df['cumulative'])

        y_pos = summary_df.split_time.values

        line, = ax.plot(y_pos, farray, color = color, marker = '.')
        ax.xaxis.set_tick_params(rotation=90)
        all_lines[f'{l}_{split}'] = line

    # One legend entry per line, using the handle of the last passage drawn.
    labels.append(l)
    handles.append(line)
    lc.append(color)

plt.legend(handles, labels, bbox_to_anchor = (1,1))

In [None]:
# Distinct line names, passed to the highlight plot in the next cell.
lines = total_df.line.unique()

In [None]:
# `highlight_growth_curves` is a project helper (imported from ImagingUtilities);
# it receives the Line2D handles collected in `all_lines` and the line names.
highlight_growth_curves(all_lines, xlabel = 'Hours from split', ylabel = 'Cumulative growth', lines = lines, fontsize = 20)
#plt.savefig('growth_curve_per_line.pdf', dpi = 300)

In [None]:
# Figure with 5 x 9 = 45 panels for the per-passage exponential fits.
fig, ax = plt.subplots(5,9, figsize = (5*12, 8*5))
ax = ax.flatten().T
ax_index = 0

# Fit results per `line_n_split` key, filled by the loop below.
fitted_param = {}

def exp_model(t, a, b):
    """Exponential growth model ``a * exp(b * t)``.

    Parameters
    ----------
    t : scalar or array-like
        Time value(s) at which the model is evaluated.
    a : float
        Multiplicative amplitude (model value at ``t == 0``).
    b : float
        Exponential rate.

    Returns
    -------
    Same shape as ``t``: the model evaluated at every time value.
    """
    growth = np.exp(b * t)
    return a * growth

# Fit the exponential model to the cumulative mean area of every
# (line, passage) with more than 5 distinct time points, store the fit
# results in `fitted_param` and draw data + fitted curve in one panel each.
for line_n in order_line_n:

    # Every key gets an entry; it stays empty when the passage is skipped or the fit fails.
    fitted_param[line_n] = {}
    sub = total_df[total_df.line_n_split == line_n]
    sub = sub.sort_values('split_time')

    if len(sub.split_time.unique()) > 5:
        # Calculate the cumulative sum of the mean total area for each split_time
        y = sub.groupby('split_time')['Area (mm2)'].mean().cumsum().values
        x = np.array(sub.split_time.unique())
        # Constant hue (the line name) so the line palette colours the panel.
        hue = [line_n.split('_')[0]]*len(y)

        try:
            popt, pcov = curve_fit(exp_model, x, y, p0=(max(y), 0.1))

            fitted_param[line_n]['a'] = popt[0]
            fitted_param[line_n]['rate'] = popt[1]
            fitted_param[line_n]['mean_cum'] = y
            fitted_param[line_n]['split_time'] = x

            a, b = popt

            # Generate fitted y values
            y_fitted = exp_model(x, a, b)

            # Goodness of fit of the exponential curve
            fitted_param[line_n]['y_fitted'] = y_fitted
            fitted_param[line_n]['MSLE'] = mean_squared_log_error(y, y_fitted)
            fitted_param[line_n]['r2'] = r2_score(y, y_fitted)

            sns.lineplot(y = y_fitted, x = x, ax = ax[ax_index], markers = True, hue = hue, palette=line_palette, linewidth = 5, legend = None)
            sns.scatterplot(y = y, x = x, ax = ax[ax_index], markers = True, hue = hue, palette=line_palette, s = 150, legend=None)
            ax[ax_index].xaxis.set_tick_params(rotation=90)
            ax[ax_index].set_title(f'{line_n}', fontsize = 40)
            ax[ax_index].set_ylabel('Cumulative area (mm2)', fontsize = 35)
            ax[ax_index].set_xlabel('Time point', fontsize = 35)
            # Resize the tick labels through tick_params: re-setting the labels
            # from get_*ticklabels() before the figure is drawn freezes whatever
            # text exists at that moment and triggers a FixedFormatter warning.
            ax[ax_index].tick_params(axis='both', labelsize=30)
            ax_index += 1

        except RuntimeError as e:
            # curve_fit did not converge: report and leave the panel unused.
            print(f"Fitting failed for line_n {line_n}: {e}")

plt.tight_layout()
plt.savefig('./figures/Fitted_cum_area_per_split.svg', dpi = 300, bbox_inches = 'tight')

In [None]:
# Single figure with one fitted exponential curve per (line, passage) that has
# more than 5 distinct time points, coloured by line. The Line2D handles are
# kept in `all_lines` (keyed "<line>_<split>") for the highlight plot below.
sns.set(style="white", palette="Paired", color_codes=True)
fig, ax = plt.subplots(figsize=(7,5))
labels = []
lc = []
handles = []
all_lines = {}

# Copy of the data without rows whose n_split equals the string '1';
# it is not used further in this cell.
total_df_no_first = total_df[total_df.n_split != '1'].copy()

for l in total_df.line.unique():

    color = line_palette[l]

    sub = total_df[total_df.line == l]
    sub = sub.sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    # Handle of the last curve drawn for this line (None if nothing was drawn).
    line_handle = None

    for split in sub.n_split.unique():

        subsub = sub[sub.n_split == split]

        # Skip passages with too few distinct time points.
        if len(subsub.split_time.unique()) <= 5:
            continue

        # user defined function (imported from ImagingUtilities),
        # called here with final_output='mean'; the returned frame is read
        # below for its `mean` and `split_time` columns.
        summary_df = preprocess(subsub, original_v='Area (mm2)', final_output='mean')

        summary_df = summary_df.sort_values('split_time')
        summary_df['cumulative'] = summary_df['mean'].cumsum()

        farray = np.array(summary_df['cumulative'].values)
        y_pos = summary_df.split_time.values

        # Same guard as the per-panel fitting cell: a non-converging fit is
        # reported and skipped instead of aborting the whole figure.
        try:
            popt, pcov = curve_fit(exp_model, y_pos, farray, p0=(max(farray), 0.1))
        except RuntimeError as e:
            print(f"Fitting failed for line {l}, split {split}: {e}")
            continue

        a, b = popt

        y_fitted = exp_model(y_pos, a, b)

        line, = ax.plot(y_pos, y_fitted, color = color, marker = '.')
        ax.xaxis.set_tick_params(rotation=90)
        all_lines[f'{l}_{split}'] = line
        line_handle = line

    # One legend entry per line, and only when a curve was drawn for it.
    # Previously these appends ran once per passage (duplicated legend entries)
    # and could pick up a stale or undefined `line` when a passage was skipped.
    if line_handle is not None:
        labels.append(l)
        handles.append(line_handle)
        lc.append(color)

plt.legend(handles, labels, bbox_to_anchor = (1,1))

In [None]:
# Distinct line names, passed to the highlight plot in the next cell.
lines = total_df.line.unique()

In [None]:
# `highlight_growth_curves` is a project helper (imported from ImagingUtilities);
# it receives the fitted-curve handles collected in `all_lines`. The current
# pyplot figure is then written to an SVG file.
highlight_growth_curves(all_lines, xlabel = 'Hours from split', ylabel = 'Cumulative area (mm2)', lines = lines, fontsize = 20)
plt.savefig('./figures/cumulative_growth_curve_per_line.svg', dpi = 300)

In [None]:
# One row per `line_n_split` key with the stored fit results; the key is split
# into its line and passage parts, and keys without a fit (no `a`) are removed.
fitted_df = pd.DataFrame.from_dict(fitted_param).T
fitted_df['Line'] = [fit_key.split('_')[0] for fit_key in fitted_df.index]
fitted_df['split'] = [fit_key.split('_')[1] for fit_key in fitted_df.index]
fitted_df = fitted_df[fitted_df.a.notna()]

In [None]:
# Keep only the fits with r2 > 0.9. `.copy()` makes this an independent frame, so the
# later column assignment on `fitted_df_filtered` does not write into a slice of
# `fitted_df` (chained-assignment / SettingWithCopy hazard).
fitted_df_filtered = fitted_df[fitted_df['r2'] > 0.9].copy()
fitted_df_filtered

In [None]:
# Bar plot of the fitted growth rate per passage.
fig, ax = plt.subplots()
sns.barplot(
    # `order` below is given as strings; cast `split` to str so the categories still
    # match if a later cell has converted the column to integers (safe on re-run).
    data = fitted_df_filtered.assign(split = fitted_df_filtered['split'].astype(str)),
    x = 'split', y = 'rate', order = ['1', '2', '3', '4', '5', '6'], ax = ax)
ax.set_ylabel('Growth rate', fontsize = 20)
ax.set_xlabel('Passage', fontsize = 20)
# tick_params resizes the labels without pinning their text, unlike
# set_*ticklabels(get_*ticklabels()) which freezes the current tick strings.
ax.tick_params(axis = 'both', labelsize = 15)

In [None]:
# Growth rate across passages, one coloured trace per cell line.
fig, ax = plt.subplots(figsize = (10,4))
# Work on a derived frame with a numeric passage number instead of overwriting the
# 'split' column of the shared `fitted_df_filtered`; this keeps the cell idempotent
# and leaves the string-based `order` of the bar plot cell valid.
rate_by_passage = fitted_df_filtered.assign(split = fitted_df_filtered['split'].astype('int'))
sns.lineplot(data = rate_by_passage, x = 'split', y = 'rate', hue = 'Line',
              ax = ax, palette=line_palette)
ax.set_ylabel('Growth rate', fontsize = 20)
ax.set_xlabel('Passage', fontsize = 20)
# Resize tick labels without freezing their text (numeric x axis).
ax.tick_params(axis = 'both', labelsize = 15)
ax.legend(bbox_to_anchor = (1,1))

In [None]:
# Distribution of the fitted growth rate for each cell line.
fig, ax = plt.subplots()
sns.boxplot(data = fitted_df_filtered, x = 'Line', y = 'rate', ax = ax, palette=line_palette)
ax.set_ylabel('Rate of area growth', fontsize = 20)
# The x axis shows cell lines, not passages (the previous 'Passage' label was a copy-paste slip).
ax.set_xlabel('Line', fontsize = 20)
# Resize (and rotate, for x) the tick labels without pinning their text.
ax.tick_params(axis = 'y', labelsize = 15)
ax.tick_params(axis = 'x', labelsize = 15, labelrotation = 90)

In [None]:
# Export the filtered fits, ordered by increasing rate.
rates_sorted = fitted_df_filtered.sort_values(by = 'rate')
rates_sorted.to_csv('../../data/csv/iPSC_fitted_exp_area_sum_per_split.csv')

In [None]:
# Display the filtered fits ordered by increasing rate.
fitted_df_filtered.sort_values('rate')

In [None]:
# Mean fitted rate per cell line (all fits, unfiltered), ascending.
fitted_df.groupby('Line')['rate'].mean().sort_values().to_frame()

In [None]:
# Unpack the per-fit arrays stored in `fitted_df` into one long table:
# one row per (fit key, time point).
data_tot = []

for line_n in fitted_df.index:
    sub = fitted_df.loc[line_n]
    n_points = len(sub['mean_cum'])
    data_tot.append(
        pd.DataFrame({
            'mean_cum': sub['mean_cum'],
            'split_time': sub['split_time'],
            'line_n': [line_n] * n_points,
        })
    )

data_tot = pd.concat(data_tot)

# The fit key has the form '<line>_<split>'.
data_tot['Line'] = [fit_key.split('_')[0] for fit_key in data_tot['line_n']]
data_tot['split'] = [fit_key.split('_')[1] for fit_key in data_tot['line_n']]

In [None]:
# Fit one exponential curve per cell line, pooling the time points of all its splits,
# and draw each fit (line) against the data (scatter) in its own panel.
fig, ax = plt.subplots(4,3, figsize = (20, 20))
ax = ax.flatten()
ax_index = 0
fitted_param_line = {}  # line -> fitted parameters, data, and goodness-of-fit metrics

for line in order:

    # An entry is created for every line; it stays empty when no fit is performed.
    fitted_param_line[line] = {}
    sub = data_tot[data_tot.Line == line]
    sub = sub.sort_values('split_time')

    # Only fit lines observed at more than 5 distinct time points.
    if len(sub.split_time.unique()) > 5:
        # Cumulative sum of 'mean_cum' over increasing split_time
        y = sub['mean_cum'].cumsum().values
        x = np.array(sub.split_time)

        try:
            popt, pcov = curve_fit(exp_model, x, y, p0=(max(y), 0.1))

            fitted_param_line[line]['intercept'] = popt[0]
            fitted_param_line[line]['rate'] = popt[1]
            fitted_param_line[line]['mean_cum'] = y
            fitted_param_line[line]['split_time'] = x

            a, b = popt

            # Generate fitted y values
            y_fitted = exp_model(x, a, b)

            fitted_param_line[line]['y_fitted'] = y_fitted
            fitted_param_line[line]['MSLE'] = mean_squared_log_error(y, y_fitted)
            fitted_param_line[line]['r2'] = r2_score(y, y_fitted)

            panel = ax[ax_index]
            sns.lineplot(y = y_fitted, x = x, ax = panel, markers = True)
            sns.scatterplot(y = y, x = x, ax = panel, markers = True)
            panel.set_title(f'{line}', fontsize = 30)
            panel.set_ylabel('Increase in area ', fontsize = 20)
            panel.set_xlabel('Time point', fontsize = 20)
            # Resize/rotate tick labels without freezing their text on the numeric axes.
            panel.tick_params(axis = 'x', labelsize = 15, labelrotation = 90)
            panel.tick_params(axis = 'y', labelsize = 15)
            ax_index += 1

        except RuntimeError as e:
            # Report the line being fitted here (the message previously printed
            # `line_n`, a stale loop variable left over from an earlier cell).
            print(f"Fitting failed for line {line}: {e}")

plt.tight_layout()

In [None]:
# One row per cell line; lines without a fit have no 'intercept' and are dropped.
fitted_df_line = pd.DataFrame(fitted_param_line).T
fitted_df_line = fitted_df_line[fitted_df_line.intercept.notna()]
fitted_df_line.sort_values(by = 'rate')

In [None]:
# Average r2 over all per-split fits.
fitted_df['r2'].mean()

In [None]:
# Average r2 over the per-split fits that passed the r2 filter.
fitted_df_filtered['r2'].mean()

In [None]:
# Average r2 over the per-line fits.
fitted_df_line['r2'].mean()

# Cumulative sum of the discrete derivative of the curves - per line per split

We use the logarithm of the area and then sum all the FOVs for a specific time point in each line at each split. We then smooth the resulting curve with [`gaussian_filter1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.gaussian_filter1d.html) and compute the [`diff`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.diff.html), corresponding to the first discrete derivative. Finally, we use the [`cumsum()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.cumsum.html) function to obtain the cumulative sum over the discrete differential of the growth.

In [None]:
# Grid of panels, one per (line, split) pair, each showing the cumulative sum of the
# smoothed discrete derivative of the mean area.
fig, ax = plt.subplots(10, 5, figsize = (50, 63), gridspec_kw={'hspace': 0.7})
ax = ax.ravel()
ax_index = 0

for l in total_df.line.unique():

    sub = total_df[total_df.line == l].sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    for split in sub.n_split.unique():

        panel = ax[ax_index]
        subsub = sub[sub.n_split == split]

        # `preprocess` is a user-defined helper; its 'mean' and 'split_time'
        # columns are the ones consumed below.
        summary_df = preprocess(
            subsub, original_v='Area (microm2)', final_output='mean'
        ).sort_values('split_time')

        # Gaussian smoothing (sigma = 3), then finite-difference slope per unit of
        # split_time, then running total of that slope.
        summary_df['smoothed'] = gaussian_filter1d(summary_df['mean'], 3)
        delta_area = summary_df['smoothed'].diff()
        delta_time = summary_df['split_time'].diff()
        summary_df['derivative'] = delta_area / delta_time
        summary_df['cumulative'] = summary_df['derivative'].cumsum()

        farray = summary_df['cumulative'].to_numpy()
        y_pos = summary_df.split_time.values

        panel.errorbar(y_pos, farray, marker = 'o')
        panel.xaxis.set_tick_params(rotation=90)
        panel.set_title(f'{l}_split_{split}')
        panel.set_ylabel('Cumulative growth')
        panel.set_xlabel('Time point')
        ax_index += 1

plt.show()

In [None]:
# List the columns available in the combined quantification table.
total_df.columns

In [None]:
# Overlay the cumulative smoothed-derivative curve of every (line, split) pair on one
# axis, and keep each per-pair summary table for the linear fits further down.
sns.set(style="white", palette="Paired", color_codes=True)
fig, ax = plt.subplots(figsize=(7, 5))
labels, lc, handles = [], [], []
all_lines = {}            # '<line>_<split>' -> Line2D
cumulative_dict_dfs = {}  # '<line>_<split>' -> summary table

total_df_no_first = total_df[total_df.n_split != '1'].copy()

for l in total_df.line.unique():

    color = line_palette[l]

    sub = total_df[total_df.line == l].sort_values(by = 'datetime')
    sub = sub[sub.n_split != 'day']

    for split in sub.n_split.unique():

        subsub = sub[sub.n_split == split]

        # Nothing to plot when the split has no time points.
        if len(subsub.split_time.unique()) == 0:
            continue

        # `preprocess` is a user-defined helper; its 'mean' and 'split_time'
        # columns are the ones consumed below.
        summary_df = preprocess(
            subsub, original_v='Area (microm2)', final_output='mean'
        ).sort_values('split_time')

        # Smooth, differentiate with respect to split_time, then accumulate.
        summary_df['smoothed'] = gaussian_filter1d(summary_df['mean'], 3)
        delta_area = summary_df['smoothed'].diff()
        delta_time = summary_df['split_time'].diff()
        summary_df['derivative'] = delta_area / delta_time
        summary_df['cumulative'] = summary_df['derivative'].cumsum()

        cumulative_dict_dfs[f'{l}_{split}'] = summary_df

        farray = summary_df['cumulative'].to_numpy()
        y_pos = summary_df.split_time.values

        line, = ax.plot(y_pos, farray, color = color, marker = '.')
        ax.xaxis.set_tick_params(rotation=90)
        all_lines[f'{l}_{split}'] = line

        # One legend entry per drawn curve.
        labels.append(l)
        handles.append(line)
        lc.append(color)

plt.legend(handles, labels, bbox_to_anchor = (1,1))

In [None]:
# Distinct cell-line names, in order of first appearance in the table.
lines = total_df['line'].unique()

In [None]:
# Render the derivative-based curves with the user-defined highlighting helper,
# then write the figure to disk.
highlight_kwargs = dict(
    xlabel = 'Hours from split',
    ylabel = 'Cumulative growth',
    lines = lines,
    fontsize = 20,
)
highlight_growth_curves(all_lines, **highlight_kwargs)
plt.savefig('./figures/cumulative_growth_curve_per_line.svg', dpi = 300)

## Fit linear model
Here I fit a linear regression model that takes into account the cumulative sums of all the splits for each line. We take the slope of the fit as the growth rate of the line.

In [None]:
# Stack the per-(line, split) summary tables; passing the mapping directly makes its
# keys the outer index level, which reset_index() exposes as the 'level_0' column.
cumulative_df = pd.concat(cumulative_dict_dfs).reset_index()
# Keys look like '<line>_<split>'; keep the part before the first underscore.
cumulative_df['line'] = cumulative_df['level_0'].map(lambda key: key.split('_')[0])

In [None]:
# Inspect the stacked per-(line, split) cumulative table.
cumulative_df

In [None]:
# One regression panel per cell line, wrapped four panels to a row.
cumulative_sorted = cumulative_df.sort_values(by = 'split_time')
grid = sns.FacetGrid(
    cumulative_sorted,
    col="line",
    palette=line_palette,
    col_wrap=4,
    height=5,
)

# First-order (straight-line) regression of the cumulative growth on time since split.
grid.map(sns.regplot, "split_time", "cumulative", order = 1)

# Tighten the layout, leaving a little horizontal padding between panels.
grid.fig.tight_layout(w_pad=1)

In [None]:
# Ordinary least-squares line through all (split_time, cumulative) points of each cell
# line, pooling its splits; missing cumulative values are treated as 0.
linregress_fields = ('slope', 'intercept', 'rvalue', 'pvalue', 'stderr')

fitted_model = {}
for l in cumulative_df.line.unique():

    sub = cumulative_df[cumulative_df.line == l].sort_values(by = 'datetime')

    key = f'{l}'
    fitted_model[key] = {}
    regression = stats.linregress(sub['split_time'], sub['cumulative'].fillna(0))
    # The result unpacks as (slope, intercept, rvalue, pvalue, stderr).
    for field, value in zip(linregress_fields, regression):
        fitted_model[key][field] = value

In [None]:
# One row per cell line with its regression statistics, shown by increasing slope.
fitted_model_df = pd.DataFrame(fitted_model).T.reset_index()
fitted_model_df['line'] = fitted_model_df['index'].map(lambda name: name.split('_')[0])
fitted_model_df.sort_values('slope')

In [None]:
# Average correlation coefficient across the per-line linear fits.
fitted_model_df['rvalue'].mean()

In [None]:
# Export the per-line linear fits, ordered by increasing slope.
slopes_sorted = fitted_model_df.sort_values(by = 'slope')
slopes_sorted.to_csv('../../data/csv/iPSC_fitted_lm_grouped.csv')