# Data preparation

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy.stats
#%matplotlib widget
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
print("Current working directory:", os.getcwd())
base_dir = '.' 

dataframes = {}

files = ['Part1_Results.csv', 'Part2_Results.csv', 'Part3_Results.csv']

for file in files:
    file_path = os.path.join(base_dir, file)
    if os.path.exists(file_path):
        # Extract name without extension for dictionary key
        name = file.replace('.csv', '')
        dataframes[name] = pd.read_csv(file_path, sep=';', index_col=0)
        print(f"Loaded {name}: {dataframes[name].shape}")
    else:
        print(f"Warning: {file} not found!")

# To access the individual data we have to use
# dataframes['Part1_Results']
# dataframes['Part2_Results']

Current working directory: /Users/ghalijaidi/Desktop/NSSP Projet/Neuroscience-
Loaded Part1_Results: (232, 9)
Loaded Part2_Results: (232, 16)
Loaded Part3_Results: (134, 16)


In [3]:
print(dataframes['Part1_Results'].info())
print(dataframes['Part2_Results'].info())
print(dataframes['Part3_Results'].info())

<class 'pandas.core.frame.DataFrame'>
Index: 232 entries, 0 to 231
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cell_id       232 non-null    object 
 1   cell_type     232 non-null    object 
 2   firing_rate   232 non-null    float64
 3   ap_threshold  232 non-null    float64
 4   ap_duration   230 non-null    float64
 5   mean_vm       232 non-null    float64
 6   std_vm        232 non-null    float64
 7   fft_low       232 non-null    float64
 8   fft_high      232 non-null    float64
dtypes: float64(7), object(2)
memory usage: 18.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 232 entries, 0 to 231
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cell_id            232 non-null    object 
 1   cell_type          232 non-null    object 
 2   numb_events        232 non-null    int64  
 3   wp_avg             232 no

In [4]:
# Check alignment of cell_id between Part1 and Part2
for i in range(len(dataframes['Part1_Results']['cell_id'])):
    if dataframes['Part1_Results']['cell_id'].iloc[i] != dataframes['Part2_Results']['cell_id'].iloc[i]:
        print("Mismatch between part 1 and 2 in cell_id at index", i)

In [5]:
# Check alignment of cell_id between Part1 and Part3
for i in range(len(dataframes['Part3_Results']['cell_id'])):
    if dataframes['Part1_Results']['cell_id'].iloc[i] != dataframes['Part3_Results']['cell_id'].iloc[i]:
        print("Mismatch between part 1 and 3 in cell_id at index", i)
        


Mismatch between part 1 and 3 in cell_id at index 0
Mismatch between part 1 and 3 in cell_id at index 1
Mismatch between part 1 and 3 in cell_id at index 2
Mismatch between part 1 and 3 in cell_id at index 3
Mismatch between part 1 and 3 in cell_id at index 4
Mismatch between part 1 and 3 in cell_id at index 5
Mismatch between part 1 and 3 in cell_id at index 6
Mismatch between part 1 and 3 in cell_id at index 7
Mismatch between part 1 and 3 in cell_id at index 8
Mismatch between part 1 and 3 in cell_id at index 9
Mismatch between part 1 and 3 in cell_id at index 10
Mismatch between part 1 and 3 in cell_id at index 11
Mismatch between part 1 and 3 in cell_id at index 12
Mismatch between part 1 and 3 in cell_id at index 13
Mismatch between part 1 and 3 in cell_id at index 14
Mismatch between part 1 and 3 in cell_id at index 15
Mismatch between part 1 and 3 in cell_id at index 16
Mismatch between part 1 and 3 in cell_id at index 17
Mismatch between part 1 and 3 in cell_id at index 18
Mis

In [6]:
#Checking whether the cells are only misaligned or entirely different.

part1_ids = set(dataframes['Part1_Results']['cell_id'])
part3_ids = set(dataframes['Part3_Results']['cell_id'])

missing_in_part1 = [cid for cid in part3_ids if cid not in part1_ids]

if missing_in_part1:
    print(f"{len(missing_in_part1)} cell_id(s) from Part3 not found in Part1. Examples: {missing_in_part1[:10]}")
else:
    print("All Part3 cell_ids are present in Part1.")        
        

8 cell_id(s) from Part3 not found in Part1. Examples: ['SC914_1', 'TK390_1', 'SC915_1', 'SC903_1', 'TK358_3', 'SC909_1', 'SC913_1', 'SC907_1']


What we conclude from this is that Part 3 recordings concern mostly a subset of part 1&2 cells. On top of that, 8 of the cells present in part 3 are entirely absent from part 1 and 2. Other than that, the misalignments of the indeces do not mean that all cell are entirely new, the dataframe simply shows a different order. 



Let's 

In [7]:
y1 = dataframes['Part1_Results'][['cell_type']].copy()
y2 = dataframes['Part2_Results'][['cell_type']].copy()
y3 = dataframes['Part3_Results'][['cell_type']].copy()



y1.head()
y2.head()
y3.head()

label_encoder = LabelEncoder()

y1_encoded = label_encoder.fit_transform(y1)
y2_encoded = label_encoder.transform(y2)
y3_encoded = label_encoder.transform(y3)

# See the mapping
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {label} -> {i}")

print(f"\nEncoded y1 shape: {y1_encoded.shape}")
print(f"\nEncoded y2 shape: {y2_encoded.shape}")
print(f"\nEncoded y3 shape: {y3_encoded.shape}")



Label mapping:
  EXC -> 0
  PV -> 1
  SST -> 2
  VIP -> 3

Encoded y1 shape: (232,)

Encoded y2 shape: (232,)

Encoded y3 shape: (134,)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [8]:
# We check if there is class imbalance 
value_counts = dataframes['Part1_Results']['cell_type'].value_counts()
percentages = dataframes['Part1_Results']['cell_type'].value_counts(normalize=True) * 100

print("Cell Type Distribution")

for cell_type in value_counts.index:
    count = value_counts[cell_type]
    pct = percentages[cell_type]
    print(f"{cell_type:<10} {count:<10} {pct:.2f}%")


print(f"Total cells: {len(dataframes['Part1_Results'])}")

Cell Type Distribution
EXC        86         37.07%
SST        72         31.03%
PV         49         21.12%
VIP        25         10.78%
Total cells: 232


In [9]:
#Same check for part 2
value_counts = dataframes['Part2_Results']['cell_type'].value_counts()
percentages = dataframes['Part2_Results']['cell_type'].value_counts(normalize=True) * 100

print("Cell Type Distribution")

for cell_type in value_counts.index:
    count = value_counts[cell_type]
    pct = percentages[cell_type]
    print(f"{cell_type:<10} {count:<10} {pct:.2f}%")


print(f"Total cells: {len(dataframes['Part2_Results'])}")

Cell Type Distribution
EXC        86         37.07%
SST        72         31.03%
PV         49         21.12%
VIP        25         10.78%
Total cells: 232


In [10]:
#Same check for part 3 : 
value_counts = dataframes['Part3_Results']['cell_type'].value_counts()
percentages = dataframes['Part3_Results']['cell_type'].value_counts(normalize=True) * 100

print("Cell Type Distribution")

for cell_type in value_counts.index:
    count = value_counts[cell_type]
    pct = percentages[cell_type]
    print(f"{cell_type:<10} {count:<10} {pct:.2f}%")

print(f"Total cells: {len(dataframes['Part3_Results'])}")

Cell Type Distribution
EXC        67         50.00%
SST        37         27.61%
PV         22         16.42%
VIP        8          5.97%
Total cells: 134


Part 3 shows a very different distribution of cell types compared to part 1 and 2. 

## Data splitting

Let's say we want to build 3 models first, one for each dataset, and then one big supermodel which would include all features across datasets merged on cell_id. The goal is to compare the accuracy of all our different models.

In [17]:
# We start by training on Part 1 features 
X_1 = dataframes['Part1_Results'].copy()
X_1.drop(columns=['cell_id', 'cell_type'], inplace=True)

print(f"X_1 shape: {X_1.shape}")
print(f"y1_encoded shape: {y1_encoded.shape}")
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_1, y1_encoded, 
    test_size=0.2, 
    stratify=y1_encoded,  #ensures percentages of cell types are preserved
    random_state=42  
)
print(f"Training set size: {len(X_train1)}")
print(f"Test set size: {len(X_test1)}")

X_2 = dataframes['Part2_Results'].copy()
X_2.drop(columns=['cell_id', 'cell_type'], inplace=True)

print(f"X_2 shape: {X_2.shape}")
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_2, y2_encoded, 
    test_size=0.2, 
    stratify=y2_encoded,  #ensures percentages of cell types are preserved
    random_state=42  
)
print(f"Training set size: {len(X_train2)}")
print(f"Test set size: {len(X_test2)}")

X_3 = dataframes['Part3_Results'].copy()
X_3.drop(columns=['cell_id', 'cell_type'], inplace=True)
print(f"X_3 shape: {X_3.shape}")
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_3, y3_encoded, 
    test_size=0.2, 
    stratify=y3_encoded,  #ensures percentages of cell types are preserved
    random_state=42  
)
print(f"Training set size: {len(X_train3)}")
print(f"Test set size: {len(X_test3)}")

X_1 shape: (232, 7)
y1_encoded shape: (232,)
Training set size: 185
Test set size: 47
X_2 shape: (232, 14)
Training set size: 185
Test set size: 47
X_3 shape: (134, 14)
Training set size: 107
Test set size: 27
