In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import functions
%matplotlib inline

In [None]:
# Load the gene expression dataset GSE22820.
# The CSV location can be overridden through the GSE22820_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original absolute path is used, so existing runs are unchanged.
import os

path = os.environ.get(
    'GSE22820_CSV',
    '/Users/minhnguyen/IronHack2023-2024/Bootcamp/Labs/5_Mid_bootcamp_project_venv/data/Breast_GSE22820.csv',
)
df = pd.read_csv(path)
df.head(10)

In [None]:
from sklearn.preprocessing import QuantileTransformer
def quantileTransformer1(df:pd.DataFrame, qt = None):
    """
    Quantile-transform the feature columns of a DataFrame towards a normal distribution.

    The first two columns are treated as metadata and copied through unchanged;
    every column from position 2 onwards is passed to the transformer. The input
    frame itself is never modified.

    Parameters:
    - df (pd.DataFrame): Input DataFrame; columns from position 2 onwards must be numeric.
    - qt (QuantileTransformer or None, optional): An already fitted transformer to apply
      (any object with a compatible ``transform`` method works), or None to fit a new
      QuantileTransformer on ``df``. Default is None.

    Returns:
    - Tuple: A tuple containing two elements:
        1. pd.DataFrame: Copy of ``df`` whose feature columns hold the transformed values.
        2. QuantileTransformer: The fitted or provided transformer object.
    """
    df_transformed = df.copy()
    # Select the feature columns once instead of re-slicing for fit/transform/assign.
    feature_columns = df_transformed.columns[2:]

    if qt is None:
        # scikit-learn's default n_quantiles is 1000; with fewer samples it emits a
        # warning and falls back to n_samples. Capping it here gives the same fit
        # without the warning.
        qt = QuantileTransformer(
            n_quantiles=max(1, min(1000, len(df_transformed))),
            output_distribution='normal',
        )
        qt.fit(df_transformed[feature_columns])

    # Apply the (new or provided) transformation to the feature columns only
    df_transformed[feature_columns] = qt.transform(df_transformed[feature_columns])
    return df_transformed, qt

In [None]:
# Transform the data with quantileTransformer1. No transformer is passed in,
# so a new one is fitted on df and returned as qt.
df_transformed, qt = quantileTransformer1(df)

In [None]:
# Inspect the quantile-transformed frame
df_transformed

In [None]:
# Inspect the transformer object returned by quantileTransformer1
qt

In [None]:
# Melt df_transformed into long format for the boxplot: 'samples' and 'type'
# stay as identifier columns, every other column becomes a
# ('gene', 'expression') row.
melted_df_transformed = df_transformed.melt(id_vars=['samples', 'type'], var_name='gene', value_name='expression')
melted_df_transformed

In [None]:
# Boxplot of the quantile-transformed expression values: one box per sample,
# coloured by sample type.

fig, ax = plt.subplots(figsize=(20, 8))  # wide canvas; adjust if needed
sns.boxplot(
    data=melted_df_transformed,
    x='samples',
    y='expression',
    hue='type',
    width=0.3,
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=90)  # vertical sample labels stay readable
ax.legend(loc='upper right')

# Render the figure
fig.tight_layout()
plt.show()

In [None]:
import scipy.stats as st
def getListDifferentialGenes1(df:pd.DataFrame, psig=0.05, tumor_value = 'tumoral', tumor_column = 'type'):
    """
    Identify differentially expressed genes between a specified tumor group and the rest of the samples in a DataFrame.

    Every column from position 2 onwards is treated as a gene. For each gene a
    two-sided Welch t-test (unequal variances) compares the rows where
    ``tumor_column == tumor_value`` against all remaining rows. The p-values are
    used as returned by the test; no multiple-testing correction is applied.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing gene expression data; the first two columns are skipped.
    - psig (float, optional): Significance threshold; genes with p-value strictly below it are returned. Default is 0.05.
    - tumor_value (str, optional): The specific tumor group for which differential expression is assessed. Default is 'tumoral'.
    - tumor_column (str, optional): The column defining the tumor groups. Default is 'type'.

    Returns:
    - list: Gene names (as str, in column order) whose p-value is below ``psig``.
      Genes whose p-value is NaN are never included.
    """
    gene_columns = df.columns[2:]
    if len(gene_columns) == 0:
        # Nothing to test
        return []

    is_tumor = df[tumor_column] == tumor_value
    tumor_values = df.loc[is_tumor, gene_columns].to_numpy(dtype=float)
    other_values = df.loc[~is_tumor, gene_columns].to_numpy(dtype=float)

    # One vectorised call tests every gene (axis=0 -> one test per column)
    # instead of one scipy call per gene in a Python loop.
    _, pvalues = st.ttest_ind(
        tumor_values,
        other_values,
        axis=0,
        equal_var=False,
        alternative='two-sided',
    )

    # NaN p-values compare False against psig and are therefore skipped.
    return [str(gene) for gene, pvalue in zip(gene_columns, pvalues) if pvalue < psig]

In [None]:
# Identify differentially expressed genes: samples whose 'type' is
# 'primary_breast_cancer' are compared against all other samples.

differential_expressed_genes = getListDifferentialGenes1(df_transformed, tumor_value = 'primary_breast_cancer', tumor_column = 'type')

In [None]:
# Inspect the list of gene names returned by getListDifferentialGenes1
differential_expressed_genes


In [None]:
# Number of genes flagged as differentially expressed
len(differential_expressed_genes)

In [None]:
# Keep the two identifier columns plus only the differentially expressed genes.
# The trailing .copy() makes the result independent of df_transformed.
df_tranformed_DEG = df_transformed[['samples', 'type'] + differential_expressed_genes].copy()

In [None]:
# Inspect the frame restricted to the differentially expressed genes
df_tranformed_DEG

In [None]:
# Train the model on all differentially expressed genes, without first
# removing correlated variables.
# NOTE(review): trainLogisticModel lives in the local `functions` module and is
# not visible here; the unpacked names suggest it returns (model, confusion
# matrix, scores) — confirm against that module.
model, cm, scores = functions.trainLogisticModel(df_tranformed_DEG, ColumnToPredict='type', tumor_value = 'primary_breast_cancer')

In [None]:
# Inspect the model object returned by functions.trainLogisticModel
model

In [None]:
# Predict the label of every row in df_tranformed_DEG from its gene columns
# (the 'samples' and 'type' columns are dropped before predicting).
# NOTE(review): y_test is the 'type' column of the full frame, i.e. the same
# frame that was handed to trainLogisticModel. Whether any of these rows were
# held out during training is decided inside that function — confirm before
# reading the resulting metrics as test performance.
y_pred = model.predict(X = df_tranformed_DEG.drop(columns = ['samples', 'type']))
y_test = df_tranformed_DEG['type']

In [None]:
# Pass the model, its predictions and the labels to the helper from the local
# `functions` module, with 'primary_breast_cancer' as the positive label.
# NOTE(review): what LogisticModelView displays is not visible from this notebook.
functions.LogisticModelView(model, y_pred, y_test, pos_label = 'primary_breast_cancer')

In [None]:
# Pairwise correlation between the gene columns only.
# .iloc[:, 2:] skips the two leading identifier columns ('samples', 'type').
# A plain `df[2:]` slices rows, not columns, which would drop the first two
# samples and leave the string columns in the frame handed to .corr().
r_values = df_tranformed_DEG.iloc[:, 2:].corr()


In [None]:
# Collect the gene pairs whose correlation in r_values exceeds 0.95 and build
# exclude_list / included_columns from them. This cell is written as flat,
# top-level code so it can run on its own: the list is initialised here and
# the result is shown as the cell output instead of being returned.
# NOTE(review): the comparison is signed (> 0.95), so strongly negative
# correlations are not flagged — confirm that is intended.

# Mask of off-diagonal entries above the threshold
mask = (r_values.to_numpy() > 0.95) & (r_values.index.to_numpy() != r_values.columns.to_numpy()[:, None])

# Positions where the mask is True
correlated_columns, correlated_rows = np.where(mask)

# One [gene, correlation, partner gene] entry per flagged matrix cell
correlated_genes_list = []
for col, index in zip(r_values.columns[correlated_columns], r_values.index[correlated_rows]):
    value = r_values.at[index, col]
    correlated_genes_list.append([col, value, index])

exclude_list = []
included_columns = []

for col in correlated_genes_list:
    # Compare the gene name (col[0]) with the kept names; comparing the whole
    # [gene, value, partner] entry against a list of names is always unequal.
    if col[0] not in included_columns:
        exclude_list.append(col[0])
        included_columns.append(col[2])

exclude_list

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Toy example for finding highly correlated column pairs.
# B is 2*A, C is constant, D decreases by one as A increases by one.
data = {'A': [1, 2, 3, 4],
        'B': [2, 4, 6, 8],
        'C': [1, 1, 1, 1],
        'D': [5, 4, 3, 2]}

# Use a dedicated name so the gene expression frame `df` loaded at the top of
# the notebook is not overwritten by this toy data.
df_demo = pd.DataFrame(data)

# Calculate the correlation matrix
corr_matrix = df_demo.corr()

# Set the threshold for correlation
threshold = 0.8  # You can adjust this threshold as needed

# Flag strongly correlated entries. Self-pairs are skipped by the `!=` check in
# the loop below; filtering on |r| < 1 instead would also hide distinct columns
# that are perfectly correlated (here A/B and A/D).
highly_correlated_pairs = corr_matrix.abs() > threshold

# Print the highly correlated pairs
print("Highly correlated pairs:")
for col in df_demo.columns:
    correlated_cols = highly_correlated_pairs[col][highly_correlated_pairs[col]].index.tolist()
    for correlated_col in correlated_cols:
        if correlated_col != col:
            print(f"{col} and {correlated_col}: {corr_matrix.loc[col, correlated_col]:.2f}")

# Visualize the correlation matrix using seaborn
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Toy input for testing Tim's correlation function:
# ten rows, eight columns, where x5..x8 repeat the values of x1..x4.
d = {
    'x1': [1, 4, 4, 5, 6, 0, 0, 8, 2, 4],
    'x2': [0, 0, 8, 2, 4, 2, 8, 8, 10, 12],
    'x3': [2, 8, 8, 10, 12, -1, -4, -4, -4, -5],
    'x4': [-1, -4, -4, -4, -5, 1, 4, 4, 5, 6],
    'x5': [1, 4, 4, 5, 6, 0, 0, 8, 2, 4],
    'x6': [0, 0, 8, 2, 4, 2, 8, 8, 10, 12],
    'x7': [2, 8, 8, 10, 12, -1, -4, -4, -4, -5],
    'x8': [-1, -4, -4, -4, -5, 1, 4, 4, 5, 6],
}

df_test = pd.DataFrame.from_dict(d)
df_test

In [None]:
# Calculate the pairwise correlation matrix of the toy frame df_test
corr_matrix = df_test.corr()
corr_matrix

In [None]:
# Flag column pairs with |r| > 0.5. The diagonal is masked out by position:
# filtering on |r| < 1 instead would also hide distinct columns that are
# perfectly correlated, such as the duplicated columns of df_test.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
highly_correlated_pairs = (corr_matrix.abs() > 0.5) & off_diagonal

# Correlated partners of every column, keyed by column name, so that all
# columns can be inspected rather than only the last one visited.
correlated_cols = {
    col: highly_correlated_pairs.index[highly_correlated_pairs[col]].tolist()
    for col in df_test.columns
}
correlated_cols

In [None]:
# Test of Tim's pair-collection / exclusion logic on the toy correlation
# matrix (author's note: it does not seem to work as expected).

correlated_genes_list = []
# Off-diagonal cells with correlation > 0.5. The comparison is signed, so
# strongly negative correlations are not flagged.
mask = (corr_matrix.to_numpy() > 0.5) & (corr_matrix.index.to_numpy() != corr_matrix.columns.to_numpy()[:, None])
correlated_columns, correlated_rows = np.where(mask)
# Each entry is [column label, correlation value, index label]
for col, index in zip(corr_matrix.columns[correlated_columns], corr_matrix.index[correlated_rows]):
    value = corr_matrix.at[index, col]
    correlated_genes_list.append([col, value, index])

exclude_list = []
included_columns = []

# NOTE(review): col[0] is appended to exclude_list every time an entry starts
# with a label that is not yet in included_columns, so the same label can be
# listed more than once. Nothing checks whether col[2] is already in
# exclude_list, nor whether two labels in included_columns are correlated with
# each other — verify this is the intended behaviour.
for col in correlated_genes_list:
    if col[0] not in included_columns:
        exclude_list.append(col[0])
        included_columns.append(col[2])
display(correlated_genes_list)
display(exclude_list)
display(included_columns)

In [None]:
# First [column label, correlation value, index label] entry
correlated_genes_list[0]

In [None]:
# Step-by-step trace of the exclusion loop: print each entry and show how both
# lists grow whenever an entry is accepted.
exclude_list_1 = []
included_columns_1 = []
for col in correlated_genes_list:
    print(col)     # whole [column label, correlation value, index label] entry
    print(col[0])  # label that may be excluded
    print(col[2])  # its correlated partner
    if col[0] not in included_columns_1:
        # Prints True by construction: this line is only reached when the test above holds
        print( col[0] not in included_columns_1)
        exclude_list_1.append(col[0])
        included_columns_1.append(col[2])
        display(exclude_list_1)
        display(included_columns_1)


In [None]:
# Set the threshold for correlation
threshold = 0.8  # You can adjust this threshold as needed

# Find highly correlated pairs. The diagonal (each column with itself) is
# masked out by position; filtering on |r| < 1 instead would also hide distinct
# columns that are perfectly correlated.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
highly_correlated_pairs = (corr_matrix.abs() > threshold) & off_diagonal
highly_correlated_pairs

In [None]:
# Correlated partners of every column, keyed by column name. The columns are
# taken from highly_correlated_pairs itself, so the cell does not depend on a
# loop variable left over from whichever cell happened to run before it.
{
    column: highly_correlated_pairs.index[highly_correlated_pairs[column]].tolist()
    for column in highly_correlated_pairs.columns
}

In [None]:
# Labels flagged as highly correlated with column 'x1'
cor_list = highly_correlated_pairs.index[highly_correlated_pairs['x1']].tolist()
cor_list