# Contents

#### 1. Prepare to work with data (import packages and set working directory)
#### 2. Read in and process data (create separate dataframes for gene expression data and phenotype data)
#### 3. Combine dataframes (merged on gene name, with genotype as index)

## 1 - Prepare to work with data

### 1.1 - Import Packages

In [1]:
import os  # <- package used to work with system filepaths
import pandas as pd  # <- package used to import and organize data
import numpy as np  # <- package used to import and organize data
import math
import seaborn as sns  # <- package used to plot graphs
from matplotlib import pyplot as plt  # <- another package used to plot graphs
from itertools import cycle  # <- package used to iterate down rows
from ipywidgets import widgets  # <- widget tool to generate button
from IPython.display import display  # <- displays button
from tkinter import Tk, filedialog  # <- Tkinter is a GUI package
from tqdm.notebook import tqdm
import pingouin as pg
from pingouin import ttest
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
import requests
# Cap how many columns pandas shows when a wide DataFrame is displayed.
MAX_DISPLAY_COLUMNS = 50
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

# Progress marker: confirms the import/configuration cell ran to completion.
print("done step 1")

done step 1


  return warn(


### 1.2 - Set the working directory to the folder that contains the CSV files with experiment data


In [2]:
# Folder that holds the experiment CSV files; the read_csv calls in later
# cells use bare file names, so they resolve against this directory.
data_folder = 'C:\\Users\\chris\\Desktop\\Rankin_Lab\\valid_data'

# The return value is neither stored nor the last expression of the cell,
# so this call has no visible effect; it is kept as in the original cell.
os.getcwd()
os.chdir(data_folder)

## 2 - Read in and tidy data


### 2.1.A - Read in and combine CeNGEN data into one dataframe (for dopaminergic (dopamine-synthesizing) neurons)

In [7]:
# Read the three CeNGEN expression tables (file names indicate one table per
# dopaminergic neuron type: ADE, PDE, CEP). Paths are relative to the working
# directory set in section 1.2.
df1 = pd.read_csv('GenesExpressed_in_ADE-thrsUnfiltered.csv')
df2 = pd.read_csv('GenesExpressed_in_PDE-thrsUnfiltered.csv')
df3 = pd.read_csv('GenesExpressed_in_CEP-thrsUnfiltered.csv')

# Outer merges on 'Gene name' keep a gene if it appears in ANY of the three
# tables; a gene missing from one table gets NaN in that table's columns.
ExpressionData_DopaminergicNeurons = (
    df1.merge(df2, on='Gene name', how='outer')
       .merge(df3, on='Gene name', how='outer')
)

# Keep only the gene name and the three expression-level columns, in this order.
# reindex(columns=...) discards every column that is not listed, so the
# suffixed leftovers from the merges ('Unnamed: 0_x', 'Gene ID_x', ...) do not
# need a separate drop() call -- which would raise KeyError if any of those
# helper columns were absent from the input files.
new_column_order = ['Gene name', 'ADE_exp_level', 'PDE_exp_level', 'CEP_exp_level']
ExpressionData_DopaminergicNeurons = ExpressionData_DopaminergicNeurons.reindex(columns=new_column_order)

# Rows with missing values are deliberately kept at this stage; uncomment to
# restrict the table to genes present in all three tables.
# ExpressionData_DopaminergicNeurons = ExpressionData_DopaminergicNeurons.dropna()

# Save the merged DataFrame as a CSV file in the working directory (the same
# folder the inputs were read from), without writing the row index.
ExpressionData_DopaminergicNeurons.to_csv('ExpressionData_DopaminergicNeurons.csv', index=False)


print(ExpressionData_DopaminergicNeurons.shape)
print(ExpressionData_DopaminergicNeurons.head())

(18475, 4)
  Gene name  ADE_exp_level  PDE_exp_level  CEP_exp_level
0    flp-33       98000.31       16699.05       1082.030
1    unc-54       77625.40       80205.59     133461.200
2  F09E10.7       59164.51      116352.90      84678.650
3     flp-9       40632.57         953.48        695.145
4   F59F3.6       27777.23       26628.37      16785.460


### 2.1.B - Read in and combine CeNGEN data into one dataframe (for all neurons with dopamine-receptors)

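### 2.2 - Calculate the mean and standard deviation of gene expression level within each neuron type, then calculate a standardized score for the expression level of each gene in that neuron type (labelled "t-score" below; computed as (expression − mean) / SD, i.e. a z-score)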

In [9]:
# Calculate t-score for each gene in each neuron type.
# The score is (expression - column mean) / column standard deviation, i.e. a
# standard (z-) score; the "t_score" naming is kept so that the column names
# and the saved CSV stay compatible with the cells that read them.
t_scores = {}
for neuron_type in ['ADE', 'PDE', 'CEP']:
    expression = ExpressionData_DopaminergicNeurons[f'{neuron_type}_exp_level']

    # Mean and standard deviation of expression for the current neuron type.
    # pandas skips NaN by default in both, and std() is the sample SD (ddof=1).
    mean_expression = expression.mean()
    sd_expression = expression.std()

    # Vectorized over the whole column instead of a per-row iterrows() loop;
    # the arithmetic per gene is unchanged. Stored as a list, one score per
    # row, in row order (NaN where the gene has no expression value).
    t_scores[neuron_type] = ((expression - mean_expression) / sd_expression).tolist()

# Create a new DataFrame with gene names and corresponding t-scores for each neuron type
ExpressionData_DopaminergicNeurons_T = pd.DataFrame({'Gene name': ExpressionData_DopaminergicNeurons['Gene name']})
for neuron_type in ['ADE', 'PDE', 'CEP']:
    ExpressionData_DopaminergicNeurons_T[f'{neuron_type}_t_score'] = t_scores[neuron_type]


# Saved into the working directory (set in section 1.2), which is where
# section 3 reads 'ExpressionData_DopaminergicNeurons_T.csv' back from.
ExpressionData_DopaminergicNeurons_T.to_csv('ExpressionData_DopaminergicNeurons_T.csv', index=False)
print(ExpressionData_DopaminergicNeurons_T.head())


  Gene name  ADE_t_score  PDE_t_score  CEP_t_score
0    flp-33    86.882787    14.631959     0.851163
1    unc-54    68.809301    70.458295   110.468799
2  F09E10.7    52.433639   102.234107    70.074016
3     flp-9    35.994952     0.790589     0.530800
4   F59F3.6    24.591672    23.360470    13.854515


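### 2.2.2 - Standardize t-scores by calculating t-scores of t-scores (note: the scores from 2.2 already have mean 0 and SD 1 within each neuron type, so this step leaves them unchanged, as the output below shows)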

In [5]:
# Re-standardize each t-score column: (score - column mean) / column SD.
# NOTE: the input columns were already standardized in section 2.2, so their
# mean is ~0 and their SD is ~1 and this transformation returns the same
# values (up to floating-point rounding).
t_scores2 = {}
for neuron_type in ['ADE', 'PDE', 'CEP']:
    scores = ExpressionData_DopaminergicNeurons_T[f'{neuron_type}_t_score']
    mean_t_score = scores.mean()
    sd_t_score = scores.std()

    # Vectorized over the whole column instead of a per-row iterrows() loop;
    # stored as a list with one value per row, in row order.
    t_scores2[neuron_type] = ((scores - mean_t_score) / sd_t_score).tolist()

# New DataFrame: gene names plus one re-standardized column per neuron type.
ExpressionData_DopaminergicNeurons_T_2 = pd.DataFrame({'Gene name': ExpressionData_DopaminergicNeurons_T['Gene name']})
for neuron_type in ['ADE', 'PDE', 'CEP']:
    ExpressionData_DopaminergicNeurons_T_2[f'{neuron_type}_t_score2'] = t_scores2[neuron_type]


# Left disabled: this path is the same file written in section 2.2, so enabling
# the line would overwrite that CSV with the '_t_score2' columns.
# ExpressionData_DopaminergicNeurons_T_2.to_csv('C:\\Users\\chris\\Desktop\\Rankin_Lab\\valid_data\\ExpressionData_DopaminergicNeurons_T.csv', index=False)
print(ExpressionData_DopaminergicNeurons_T_2.head())

  Gene name  ADE_t_score2  PDE_t_score2  CEP_t_score2
0    flp-33     86.882787     14.631959      0.851163
1    unc-54     68.809301     70.458295    110.468799
2  F09E10.7     52.433639    102.234107     70.074016
3     flp-9     35.994952      0.790589      0.530800
4   F59F3.6     24.591672     23.360470     13.854515


## 3 - Combine DataFrames

### 3.X - Use pd.DataFrame.merge to combine gene expression and phenotype dataframes only for dopaminergic neurons

In [13]:
# Load the phenotype table and the per-neuron t-score table written in
# section 2.2 (both paths are relative to the working directory).
Phenotype_df = pd.read_csv('phenotype_heatmap_dataframe.csv')
ExpressionData_DopaminergicNeurons_df = pd.read_csv('ExpressionData_DopaminergicNeurons_T.csv')

# Step 1: Add an extra column with just the gene name of each genotype:
# everything before the first '(' -- e.g. 'cat-2(e1112)' -> 'cat-2'.
Phenotype_df['Gene name'] = Phenotype_df['Genotype'].str.split('(').str[0]

# Step 2: Merge the two dataframes by gene name (default inner merge, so only
#         genes present in both tables are kept)
# Step 3: Remove the gene name column, which was only needed as the merge key
# Step 4: Remove all rows (genotypes) with any empty data for any phenotype or
#         expression level measure
# Step 5: Set genotype as index
# Each step returns a new DataFrame (no inplace mutation), so re-running this
# cell always starts from the freshly read inputs.
MergedDopaminergicNeurons_df = (
    pd.merge(Phenotype_df, ExpressionData_DopaminergicNeurons_df, on='Gene name')
    .drop(columns=['Gene name'])
    .dropna()
    .set_index('Genotype')
)

# Save the merged DataFrame as a CSV file in the working directory (the same
# folder the inputs above were read from), keeping the Genotype index.
MergedDopaminergicNeurons_df.to_csv('MergedDopaminergicNeurons_df.csv', index=True)

print(MergedDopaminergicNeurons_df)

                     length   midline     width  morphwidth      area  \
Genotype                                                                
cat-2(e1112)      -0.196739  0.034824  0.820231    0.642903  0.220447   
catp-7(tm4438)    -0.153820 -0.593807 -1.044253   -1.314471 -0.877247   
catp-7(tm8556)    -0.222941 -0.540317 -1.118052   -1.812675 -0.986798   
cpr-4(ok3413)     -1.864113 -1.906107 -0.997679   -1.620419 -2.039700   
cpr-5(ok2344)     -2.861594 -2.770649 -0.587235   -1.388041 -2.622915   
cpr-6(tm12210)    -0.242207 -0.668439 -0.973411   -0.796693 -0.632821   
cpr-8(ok2956)     -1.949782 -2.444358 -2.381636   -2.525663 -2.678514   
djr-1.1(tm918)    -2.127705 -2.688475 -2.934645   -3.169808 -2.780524   
djr-1.2(tm817)    -0.763554 -1.326812 -1.995792   -1.266041 -1.006309   
djr-1.2(tm951)    -0.505632 -1.525355 -2.796270   -1.757783 -1.404218   
exp-2(sa26ad1426) -0.762081 -0.772575 -0.363025    0.182574 -0.452432   
kvs-5(tm6152)     -0.745869 -1.237666 -1.422290   -

### 3.X - Use pd.DataFrame.merge to combine gene expression and phenotype dataframes for dopamine-receptor-expressing neurons

### 3.X - Use Pandas xlookup equivalent to sort combined DataFrame by gene name

### 3.X - Delete rows for which there is not both gene expression and phenotype data

### 3.X - use StandardScaler to normalize the distribution of each column (phenotype measure or neuron cell type) to mean = 0 and SD = 1; this will standardize the data neatly before performing clustering

In [None]:
from sklearn.preprocessing import StandardScaler

# Toy example: four samples with two features each, used only to show how
# StandardScaler behaves before applying it to the real merged data.
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()

# Fit on the toy data and print what fit() returns, then the learned
# per-feature means.
fitted_scaler = scaler.fit(data)
print(fitted_scaler)
print(scaler.mean_)

# Transform the data the scaler was fitted on, then a new unseen sample
# using the same fitted parameters.
for samples in (data, [[2, 2]]):
    print(scaler.transform(samples))