In [None]:
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats


In [None]:

# Notebook-wide plot styling.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 10)

# Read the raw CSV text. The file is parsed by hand because its first rows
# carry per-column metadata instead of a single header line.
with open("brain_networks.csv", "r") as f:
    csv_data = f.read()

lines = csv_data.strip().split('\n')

# Rows 0-2 are the metadata rows (network, node, hemisphere); the first field
# of each row is dropped (presumably a row label -- confirm against the file).
network_row, node_row, hemi_row = (lines[k].split(',')[1:] for k in range(3))

# Data rows start at index 4; index 3 is skipped (presumably a blank
# separator row -- confirm against the file). The first field of each data
# row is kept as a string id, the remaining fields are parsed as floats.
data_rows = []
for line in lines[4:]:
    row_id, *raw_values = line.split(',')
    data_rows.append([row_id, *map(float, raw_values)])

# One synthetic name per data column, shared by the data frame and the
# metadata table so the two can be joined on it.
column_names = [f'col_{i}' for i in range(len(network_row))]

df = pd.DataFrame(data_rows)
df.columns = ['row_id'] + column_names

# One metadata record per data column.
metadata = pd.DataFrame({
    'column': column_names,
    'network': list(map(int, network_row)),
    'node': list(map(int, node_row)),
    'hemi': hemi_row,
})

banner = "=" * 80
print(banner)
print("EXPLORATORY DATA ANALYSIS - NETWORK NODE DATA")
print(banner)


In [None]:

# Section 1: size and dtype overview of the parsed data frame.
print("\n1. DATASET OVERVIEW")
print("-" * 80)

n_rows = len(df)
n_features = len(df.columns) - 1  # every column except 'row_id'

print(f"Number of rows (observations): {n_rows}")
print(f"Number of columns (features): {n_features}")
print(f"Total data points: {n_rows * n_features}")
print(f"\nData shape: {df.shape}")
print(f"\nData types:\n{df.dtypes.value_counts()}")


In [None]:

# Section 2: how the data columns are distributed over networks, nodes and
# hemispheres, according to the metadata table.
print("\n2. METADATA SUMMARY")
print("-" * 80)

networks = metadata['network']
hemispheres = metadata['hemi']
nodes_per_network = metadata.groupby('network')['node'].nunique().to_dict()

print(f"Unique networks: {networks.nunique()} - {sorted(networks.unique())}")
print(f"Unique nodes per network: {nodes_per_network}")
print(f"Hemispheres: {hemispheres.unique()}")
print(f"Hemisphere distribution: {hemispheres.value_counts().to_dict()}")

print("\nNetwork-Node-Hemisphere combinations:")
# Number of columns for each distinct (network, node, hemi) triple.
combo_counts = metadata.groupby(['network', 'node', 'hemi']).size()
print(f"Total combinations: {len(combo_counts)}")


In [None]:

# Section 3: summary statistics over all measurements pooled together.
print("\n3. STATISTICAL SUMMARY")
print("-" * 80)

# Every column except the identifier holds measurements.
data_cols = [name for name in df.columns if name != 'row_id']
# Flatten the measurement columns into a single 1-D array (reused by later cells).
data_values = df[data_cols].to_numpy().flatten()

# Compute the extremes and quartiles once; several lines below reuse them.
lowest = np.min(data_values)
highest = np.max(data_values)
q1 = np.percentile(data_values, 25)
q3 = np.percentile(data_values, 75)

print(f"Mean: {np.mean(data_values):.4f}")
print(f"Median: {np.median(data_values):.4f}")
print(f"Std Dev: {np.std(data_values):.4f}")  # np.std default: population std (ddof=0)
print(f"Min: {lowest:.4f}")
print(f"Max: {highest:.4f}")
print(f"Range: {highest - lowest:.4f}")
print(f"Q1 (25%): {q1:.4f}")
print(f"Q3 (75%): {q3:.4f}")
print(f"IQR: {q3 - q1:.4f}")


In [None]:


# Section 4: count missing entries over the whole frame (row_id included).
print("\n4. MISSING VALUES")
print("-" * 80)
missing = df.isna().to_numpy().sum()
missing_pct = missing / df.size * 100
print(f"Total missing values: {missing}")
print(f"Percentage missing: {missing_pct:.2f}%")

# visualizations: shared 3x3 figure that the following cells fill panel by panel
fig = plt.figure(figsize=(18, 12))


In [None]:

# Panel 1 of 9: histogram of all pooled values with mean/median markers.
# The axes are requested from `fig` explicitly instead of via plt.subplot():
# under the default inline backend pyplot's "current figure" is closed at the
# end of every cell, so plt.* calls here would draw on a fresh, unrelated figure.
ax1 = fig.add_subplot(3, 3, 1)

mean_value = np.mean(data_values)
median_value = np.median(data_values)

ax1.hist(data_values, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
ax1.set_xlabel('Value')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of All Values')
ax1.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
ax1.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
ax1.legend()

# Render the shared figure as this cell's output (a bare expression is a
# no-op when the code runs outside a notebook).
fig


In [None]:

# Panel 2 of 9: box plot of all pooled values.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend, so plt.subplot() here would
# start a new, unrelated figure.
ax2 = fig.add_subplot(3, 3, 2)
# Vertical orientation is the default; the explicit `vert=` keyword is
# omitted because newer Matplotlib releases deprecate it.
ax2.boxplot(data_values)
ax2.set_ylabel('Value')
ax2.set_title('Box Plot of All Values')
ax2.grid(axis='y', alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 3 of 9: one box per network, pooling every column of that network.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend.
ax3 = fig.add_subplot(3, 3, 3)

network_data = []
network_labels = []
for net in sorted(metadata['network'].unique()):
    # Select the network's columns by name; this avoids translating names to
    # positions and hard-coding the +1 offset of the 'row_id' column.
    net_cols = metadata.loc[metadata['network'] == net, 'column'].tolist()
    net_values = df[net_cols].to_numpy().flatten()
    network_data.append(net_values)
    network_labels.append(f'Net {net}')

# Tick labels are set separately instead of through boxplot(labels=...):
# that keyword was renamed to `tick_labels` in Matplotlib 3.9, and
# set_xticklabels works on both older and newer versions.
ax3.boxplot(network_data)
ax3.set_xticklabels(network_labels, rotation=45)
ax3.set_xlabel('Network')
ax3.set_ylabel('Value')
ax3.set_title('Distribution by Network')
ax3.grid(axis='y', alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 4 of 9: left vs right hemisphere, pooling every column of each side.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend.
ax4 = fig.add_subplot(3, 3, 4)

# Select columns by name; no positional lookup or 'row_id' offset needed.
lh_cols = metadata.loc[metadata['hemi'] == 'lh', 'column'].tolist()
rh_cols = metadata.loc[metadata['hemi'] == 'rh', 'column'].tolist()
# lh_values / rh_values are reused by the group-statistics cell further down.
lh_values = df[lh_cols].to_numpy().flatten()
rh_values = df[rh_cols].to_numpy().flatten()

# Tick labels are set separately instead of through boxplot(labels=...):
# that keyword was renamed to `tick_labels` in Matplotlib 3.9, and
# set_xticklabels works on both older and newer versions.
ax4.boxplot([lh_values, rh_values])
ax4.set_xticklabels(['Left (lh)', 'Right (rh)'])
ax4.set_ylabel('Value')
ax4.set_title('Distribution by Hemisphere')
ax4.grid(axis='y', alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 5 of 9: mean of each data column, in column order.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend.
ax5 = fig.add_subplot(3, 3, 5)
col_means = df[data_cols].mean()
ax5.plot(col_means.values, marker='o', markersize=3, linewidth=1)
ax5.set_xlabel('Column Index')
ax5.set_ylabel('Mean Value')
ax5.set_title('Mean Value per Column')
ax5.grid(alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 6 of 9: pairwise correlation of the leading data columns.
# Axes come from `fig` explicitly and are handed to seaborn via `ax=`:
# pyplot's "current figure" does not survive across cells under the default
# inline backend, so an implicit target would be a fresh, unrelated figure.
ax6 = fig.add_subplot(3, 3, 6)
# Slicing already clamps to the list length, so no min() is needed.
sample_cols = data_cols[:20]
corr = df[sample_cols].corr()
sns.heatmap(corr, cmap='coolwarm', center=0, square=True,
            cbar_kws={'label': 'Correlation'}, ax=ax6)
# Report the number of columns actually shown (may be fewer than 20).
ax6.set_title(f'Correlation Heatmap (First {len(sample_cols)} Columns)')

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 7 of 9: per-row mean against per-row standard deviation.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend.
ax7 = fig.add_subplot(3, 3, 7)
row_means = df[data_cols].mean(axis=1)
row_stds = df[data_cols].std(axis=1)  # pandas default: sample std (ddof=1)
ax7.scatter(row_means, row_stds, alpha=0.6, s=100)
ax7.set_xlabel('Row Mean')
ax7.set_ylabel('Row Std Dev')
ax7.set_title('Row Statistics: Mean vs Std Dev')
ax7.grid(alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:

# Panel 8 of 9: number of data columns belonging to each network.
# Axes come from `fig` explicitly: pyplot's "current figure" does not survive
# across cells under the default inline backend.
ax8 = fig.add_subplot(3, 3, 8)
network_counts = metadata['network'].value_counts().sort_index()
ax8.bar(network_counts.index, network_counts.values, color='teal', alpha=0.7, edgecolor='black')
# Network ids are integers: pin one tick per bar so the automatic locator
# cannot pick fractional tick positions.
ax8.set_xticks(network_counts.index)
ax8.set_xlabel('Network')
ax8.set_ylabel('Number of Columns')
ax8.set_title('Columns per Network')
ax8.grid(axis='y', alpha=0.3)

# Render the shared figure as this cell's output (no-op outside a notebook).
fig


In [None]:


# Panel 9 of 9: normal Q-Q plot of all pooled values, then lay out and save
# the complete figure.
# Everything is addressed through `fig` / `ax9` rather than pyplot's implicit
# "current figure": under the default inline backend that state is reset
# between cells, so plt.tight_layout()/plt.savefig() would act on whatever
# figure this cell happened to create instead of the shared 3x3 figure.
# `stats` (scipy.stats) is imported in the notebook's top import cell.
ax9 = fig.add_subplot(3, 3, 9)
stats.probplot(data_values, dist="norm", plot=ax9)
ax9.set_title('Q-Q Plot (Normal Distribution)')
ax9.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('network_eda.png', dpi=300, bbox_inches='tight')
print("\n" + "="*80)
print("Visualization saved as 'network_eda.png'")
print("="*80)
plt.show()

# Render the shared figure as this cell's output; needed in a notebook when
# the figure is no longer tracked by pyplot (no-op outside a notebook).
fig


In [None]:


# Section 5: mean / std / min / max for each network and each hemisphere.
def _format_stats(values):
    """Return 'Mean=…, Std=…, Min=…, Max=…' for an array, two decimals each."""
    return (f"Mean={np.mean(values):.2f}, Std={np.std(values):.2f}, "
            f"Min={np.min(values):.2f}, Max={np.max(values):.2f}")


print("\n5. DETAILED STATISTICS BY GROUP")
print("-" * 80)

print("\nStatistics by Network:")
for net in sorted(metadata['network'].unique()):
    # Pool every column of this network into one flat array.
    net_cols = metadata.loc[metadata['network'] == net, 'column'].tolist()
    net_values = df[net_cols].to_numpy().flatten()
    print(f"  Network {net}: {_format_stats(net_values)}")

# lh_values / rh_values were computed in the hemisphere box-plot cell.
print("\nStatistics by Hemisphere:")
print(f"  Left (lh):  {_format_stats(lh_values)}")
print(f"  Right (rh): {_format_stats(rh_values)}")

closing_rule = "=" * 80
print("\n" + closing_rule)
print("ANALYSIS COMPLETE")
print(closing_rule)