# Some machine learning examples

## Load libraries

In [1]:
import numpy as np
from nilearn import datasets
from nilearn import input_data
from nilearn import plotting

## Load datasets

In [2]:
import os

# Load fMRI data.
# The dataset cache directory can be overridden with the NILEARN_DATA
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original local path is used unchanged.
data_path = os.environ.get(
    "NILEARN_DATA", r"F:\_Sorrow\PhD\FMRI\Datasets\nilearn_data")
dataset = datasets.fetch_development_fmri(n_subjects=1, data_dir=data_path)
# Only one subject is requested, so take the first (and only) entry of each list
func_filename = dataset.func[0]
confounds_filename = dataset.confounds[0]

# Load atlas data (ROI coordinates returned by fetch_coords_power_2011)
power = datasets.fetch_coords_power_2011()

## Extract signals from parcellation

In [3]:
# Stack the x, y and z columns of the atlas into one array with a row per ROI
# and the three coordinates as columns
coords = np.vstack([power.rois[axis] for axis in ('x', 'y', 'z')]).T

# Masker that places a sphere of radius 5 around every seed coordinate and
# detrends + standardizes the extracted signals
spheres_masker = input_data.NiftiSpheresMasker(
    seeds=coords,
    radius=5,
    detrend=True,
    standardize=True,
)

# NOTE(review): confounds_filename is defined when the dataset is loaded but is
# not passed to fit_transform below, so no confound regression is applied here
# - confirm this is intended.
# Extract one time series per ROI from the functional image
timeseries = spheres_masker.fit_transform(func_filename)

# Display the shape of the extracted signal array
timeseries.shape

(168, 264)

## Run k-means clustering on timeseries

The `KMeans` algorithm clusters data by trying to separate samples into $n$ groups of equal variance, minimizing a criterion known as the inertia, or within-cluster sum of squares. This algorithm requires the number of clusters to be specified. It scales well to large numbers of samples and has been used across a wide range of application areas in many different fields ([more](https://scikit-learn.org/stable/modules/clustering.html#k-means)).


In [4]:
from sklearn.cluster import KMeans

# Create a KMeans estimator with 5 clusters.
# random_state fixes the centroid initialization so the labels are identical on
# every run; n_init is set explicitly because scikit-learn's default for it
# changed between releases.
clustering = KMeans(n_clusters=5, n_init=10, random_state=0)

# Cluster the rows of `timeseries` (each row is one sample) using .fit
clustering.fit(timeseries)



In [5]:
# Display the cluster label assigned to each sample (row) of the fitted data
clustering.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 4, 2,
       4, 2, 2, 2, 4, 2, 1, 0, 0, 4, 3, 4, 3, 3, 4, 4, 4, 3, 4, 2, 2, 4,
       4, 4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 0, 0, 4, 4, 3, 3,
       0, 3, 3, 3, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 3, 3, 3, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 3, 0, 4, 1, 1, 1, 1, 3, 3, 1, 1, 2, 3, 3,
       1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])

In [11]:
# Cluster brain regions: transposing `timeseries` turns each of its columns
# (one per ROI) into a sample.
# A fresh, seeded estimator is bound here instead of re-fitting the existing
# object in place, so this cell yields the same labels on every run no matter
# how the previous estimator was configured.
clustering = KMeans(n_clusters=5, n_init=10, random_state=0)
clustering.fit(timeseries.T)

KMeans(n_clusters=5)

In [12]:
# Display the cluster label assigned to each brain region (ROI)
clustering.labels_

array([2, 2, 3, 0, 2, 3, 3, 1, 2, 2, 0, 4, 1, 3, 4, 4, 4, 2, 1, 2, 1, 4,
       4, 4, 3, 1, 4, 4, 4, 1, 0, 1, 1, 0, 0, 4, 2, 0, 4, 0, 4, 3, 4, 4,
       3, 3, 4, 3, 2, 4, 4, 3, 2, 3, 3, 3, 2, 3, 3, 3, 0, 3, 4, 2, 3, 2,
       3, 3, 3, 4, 3, 1, 0, 1, 1, 0, 4, 2, 0, 0, 2, 2, 0, 0, 3, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 2, 2, 0, 3, 3,
       0, 0, 3, 0, 0, 0, 4, 0, 1, 2, 4, 3, 2, 4, 1, 2, 3, 2, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 2, 2, 1, 2, 2, 4, 1, 4, 4, 1, 4, 1, 1, 4, 4, 1, 1,
       1, 3, 1, 1, 3, 4, 1, 1, 4, 1, 2, 4, 4, 1, 1, 4, 1, 1, 1, 0, 3, 3,
       0, 0, 1, 3, 3, 2, 2, 3, 3, 1, 1, 3, 3, 4, 1, 0, 4, 0, 0, 0, 3, 0,
       1, 3, 0, 0, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 0,
       0, 2, 4, 1, 4, 2, 2, 2, 4, 3, 3, 4, 4, 3, 1, 1, 3, 1, 1, 3, 3, 2,
       2, 2, 4, 2, 1, 2, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4],
      dtype=int32)

## Run PCA on timeseries

Principal component analysis (PCA) is a linear dimensionality-reduction technique that uses the singular value decomposition (SVD) of the data to project it onto a lower-dimensional space.

In [6]:
from sklearn.decomposition import PCA

# Build a 10-component PCA and fit it on the extracted signals in one step
decomposition = PCA(n_components=10).fit(timeseries)

# Shape of the component matrix: (n_components, n_features); each row is the
# weight vector of one principal component
print("Components shape:", decomposition.components_.shape)

# Fraction of the total variance explained by each component
# (an array with one value per component, 10 in total)
print("Explained variance ratio:", decomposition.explained_variance_ratio_)
# In the recorded output these ratios sum to roughly 0.50, i.e. the 10
# components capture only about half of the variance, so 10 components are not
# enough to represent the original data.

Components shape: (10, 264)
Explained variance ratio: [0.13364732 0.07679486 0.05781019 0.04639002 0.0377177  0.03489377
 0.03200072 0.02996505 0.02536517 0.02175923]


In [None]:
# fMRI data are usually very high-dimensional (every voxel is a feature) and come with a lot of noise.
# PCA can extract the few most important components and drop those that contribute little and are probably noise, giving cleaner time series.

# PCA can first compress the data into a few dozen main components before functional-connectivity or other statistical analyses are run.

# PCA can reveal whole-brain activity patterns (spatial patterns) under different tasks or at rest.
# For example, you can see which group of voxels varies together over time; these patterns may correspond to different brain networks.

# The reduced-dimension principal components can be used as input features for regression, classification, or connectivity analysis.
# For example, computing correlations between extracted components is more stable than computing voxel-wise correlations directly.

# PCA can compress complex voxel data into 2-3 dimensions, which makes it easy to plot trajectories of brain activity over time or spatial patterns.