In [2]:
import GEOparse
import numpy as np
import pandas as pd

# Download the dataset: fetch GEO series GSE10245 through GEOparse (this
# performs network access).
gse = GEOparse.get_GEO("GSE10245")

# Extract expression data: pivot every sample's 'VALUE' column into a single
# table and keep only the underlying array (`.values` drops the labels).
# NOTE(review): presumably rows are probes and columns are samples -- confirm
# the orientation before treating rows/columns as observations downstream.
data = gse.pivot_samples('VALUE').values
# Preview the top-left 5x5 corner of the matrix as a quick sanity check.
print("Expression Data:\n", data[:5, :5])

# Initialize Parameters
def initialize_parameters(data):
    """Compute starting estimates from a two-dimensional data matrix.

    Parameters
    ----------
    data : array-like exposing a two-element ``shape``
        The ``(n, p)`` matrix to summarise.

    Returns
    -------
    tuple
        ``(mean_matrix, row_cov, col_cov)`` where ``mean_matrix`` holds the
        mean of every column, ``row_cov`` is the sample covariance with the
        rows of ``data`` treated as variables, and ``col_cov`` is the sample
        covariance with the columns treated as variables.

    Raises
    ------
    ValueError
        If ``data.shape`` does not unpack into exactly two dimensions.
    """
    # The sizes themselves are not needed; the unpack only enforces 2-D input.
    _num_rows, _num_cols = data.shape

    column_means = np.mean(data, axis=0)
    covariance_between_rows = np.cov(data, rowvar=True)
    covariance_between_columns = np.cov(data, rowvar=False)

    return column_means, covariance_between_rows, covariance_between_columns

# Compute the starting estimates once and show each of them with its heading.
initial_estimates = initialize_parameters(data)
mean_matrix, row_cov, col_cov = initial_estimates
initial_headings = (
    "Initial Mean Matrix:\n",
    "Initial Row Covariance Matrix:\n",
    "Initial Column Covariance Matrix:\n",
)
for heading, estimate in zip(initial_headings, initial_estimates):
    print(heading, estimate)

# EM Algorithm
def em_algorithm(data, max_iter=100, tol=1e-6):
    """Estimate a column mean plus row and column covariances of a matrix.

    The data matrix is centred by its column means and the two covariance
    factors are then refined by alternating updates::

        col_cov = S.T @ pinv(row_cov) @ S / n      # (p, p)
        row_cov = S @ pinv(col_cov) @ S.T / p      # (n, n)

    where ``S`` is the centred ``(n, p)`` data.

    Parameters
    ----------
    data : array-like, shape (n, p)
        Observed matrix; converted with ``np.asarray(..., dtype=float)``.
        Needs at least two rows and two columns.
    max_iter : int, default 100
        Maximum number of alternating sweeps.
    tol : float, default 1e-6
        Iteration stops once the Frobenius norm of the change in *both*
        covariance estimates falls below this value.

    Returns
    -------
    tuple
        ``(mean_matrix, row_cov, col_cov)`` with shapes ``(p,)``, ``(n, n)``
        and ``(p, p)``.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional or has fewer than two rows or
        columns.

    Notes
    -----
    * Only a single matrix is observed and it is centred by its column
      means, so the centred data has rank at most ``n - 1`` and the
      ``(n, n)`` row covariance can never be full rank.  The Moore-Penrose
      pseudo-inverse is therefore used instead of ``np.linalg.inv``.
    * Only the product of the two factors is determined (multiplying one by
      ``c`` and dividing the other by ``c`` is equivalent), so after every
      sweep the pair is rescaled such that ``trace(row_cov) == n``.  The
      Kronecker product of the two factors is unaffected by this rescaling.
    * The ``(n, n)`` row covariance is held densely in memory.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or min(data.shape) < 2:
        raise ValueError(
            "data must be a 2-D array with at least two rows and two columns"
        )
    n, p = data.shape

    _, row_cov, col_cov = initialize_parameters(data)

    # The column means do not depend on the covariance estimates, so they are
    # computed (and the data centred) once instead of on every iteration.
    mean_matrix = np.mean(data, axis=0)
    centered = data - mean_matrix

    for _ in range(max_iter):
        # Column covariance given the current row covariance.
        new_col_cov = centered.T @ np.linalg.pinv(row_cov) @ centered / n
        # Row covariance given the freshly updated column covariance.
        new_row_cov = centered @ np.linalg.pinv(new_col_cov) @ centered.T / p

        # Pin the arbitrary scale shared between the two factors; skipped for
        # degenerate (all-zero or non-finite) estimates.
        scale = np.trace(new_row_cov) / n
        if np.isfinite(scale) and scale > 0:
            new_row_cov = new_row_cov / scale
            new_col_cov = new_col_cov * scale

        converged = (
            np.linalg.norm(new_row_cov - row_cov) < tol
            and np.linalg.norm(new_col_cov - col_cov) < tol
        )
        row_cov, col_cov = new_row_cov, new_col_cov
        if converged:
            break

    return mean_matrix, row_cov, col_cov

# Run the iterative fit and show each resulting estimate with its heading.
final_estimates = em_algorithm(data)
mean_matrix, row_cov, col_cov = final_estimates
final_headings = (
    "Final Mean Matrix:\n",
    "Final Row Covariance Matrix:\n",
    "Final Column Covariance Matrix:\n",
)
for heading, estimate in zip(final_headings, final_estimates):
    print(heading, estimate)

02-Jul-2024 15:20:46 DEBUG utils - Directory ./ already exists. Skipping.
02-Jul-2024 15:20:46 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10245/soft/GSE10245_family.soft.gz to ./GSE10245_family.soft.gz
100%|██████████| 37.1M/37.1M [00:06<00:00, 6.05MB/s]
02-Jul-2024 15:20:54 DEBUG downloader - Size validation passed
02-Jul-2024 15:20:54 DEBUG downloader - Moving /tmp/tmpl5oqcnww to /local_scratch/zabel/tsne/diss/GSE10245_family.soft.gz
02-Jul-2024 15:20:54 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10245/soft/GSE10245_family.soft.gz
02-Jul-2024 15:20:54 INFO GEOparse - Parsing ./GSE10245_family.soft.gz: 
02-Jul-2024 15:20:54 DEBUG GEOparse - DATABASE: GeoMiame
02-Jul-2024 15:20:54 DEBUG GEOparse - SERIES: GSE10245
02-Jul-2024 15:20:54 DEBUG GEOparse - PLATFORM: GPL570
  return read_csv(StringIO(data), index_col=None, sep="\t")
02-Jul-2024 15:20:56 DEBUG GEOparse - SAMPLE: GSM258551
02-Jul-2024 15:20:56

Expression Data:
 [[ 9.12990455  9.84334874  9.7306612   9.03216455 10.28179284]
 [ 8.03402159  7.97333202  8.8340448   7.72396539  9.04080026]
 [ 3.56451954  4.99485242  5.06601813  4.95857959  4.95183472]
 [ 4.74649031  5.1973056   5.23461843  6.07817997  5.20563189]
 [ 2.32069777  2.2485205   2.25950441  2.26278701  2.20753107]]
Initial Mean Matrix:
 [4.72672995 4.71763363 4.72578372 4.71626304 4.73650073 4.74106281
 4.73704479 4.72382097 4.7323169  4.73584457 4.72778338 4.73561321
 4.710631   4.73853388 4.75964537 4.74495289 4.72632923 4.72760608
 4.73334677 4.72068823 4.72718019 4.74942961 4.72277333 4.72475267
 4.75142975 4.72824177 4.70367336 4.73197235 4.73787767 4.73162003
 4.71937762 4.74847405 4.73041881 4.74277583 4.75395935 4.75378439
 4.72411318 4.75146962 4.74429228 4.75219168 4.72846827 4.72723604
 4.74465499 4.75444345 4.74674357 4.72228699 4.72141935 4.72910873
 4.74094606 4.74032365 4.72270475 4.7383405  4.74697681 4.72129613
 4.71976304 4.72542842 4.74909035 4.71492