# Contents

#### 1. Prepare to work with data (import packages and set working directory)
#### 2. Read in and process data (create separate dataframes for gene expression data and phenotype data)
#### 3. Combine dataframes (with gene name as index)

## 1 - Prepare to work with data

### 1.1 - Import Packages

In [20]:
import os  # <- standard-library module used to work with system filepaths
import pandas as pd  # <- package used to import and organize tabular data
import numpy as np  # <- package used for numerical arrays
import math  # <- standard-library math functions
import seaborn as sns  # <- package used to plot graphs
from matplotlib import pyplot as plt  # <- another package used to plot graphs
from itertools import cycle  # <- iterator that repeats a sequence endlessly
from ipywidgets import widgets  # <- widget tool to generate button
from IPython.display import display  # <- renders objects (e.g. the button) in the cell output
from tkinter import Tk, filedialog  # <- Tkinter is a GUI package
from tqdm.notebook import tqdm  # <- progress bar for notebook loops
import pingouin as pg  # <- statistics package
from pingouin import ttest  # <- t-test from pingouin (also reachable as pg.ttest)
from scipy.stats import ttest_ind  # <- independent two-sample t-test
from scipy.stats import ttest_1samp  # <- one-sample t-test
import requests  # <- package used to make HTTP requests
pd.set_option('display.max_columns', 50)  # <- show up to 50 columns when a DataFrame is displayed
print("done step 1")  # <- confirmation that the imports above ran

done step 1


### 1.2 - Set the working directory to the folder containing the CSV files with experiment data


In [12]:
# Folder that holds the CeNGEN CSV files read by the cells below. The default
# is the original author's location; set the VALID_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
data_dir = os.environ.get('VALID_DATA_DIR', 'C:\\Users\\chris\\Desktop\\Rankin_Lab\\valid_data')
os.chdir(data_dir)
os.getcwd()  # last expression, so the cell shows the directory now in use

## 2 - Read in and tidy data


### 2.1.A - Read in and combine CeNGEN data into one dataframe (for dopaminergic (dopamine-synthesizing) neurons)

In [19]:
# Build one table of CeNGEN expression levels for the dopaminergic neuron
# types: one row per gene, one '<neuron>_exp_level' column per neuron type.
neuron_types = ['ADE', 'PDE', 'CEP']

# Read only the two columns that are kept from each file, so no stray index or
# 'Gene ID' columns (and no '_x'/'_y' merge suffixes) need dropping afterwards.
expression_tables = [
    pd.read_csv(
        f'GenesExpressed_in_{neuron_type}-thrsUnfiltered.csv',
        usecols=['Gene name', f'{neuron_type}_exp_level'],
    )
    for neuron_type in neuron_types
]

# Outer merge on gene name: a gene missing from one neuron type's file keeps
# NaN in that column. These rows are deliberately not dropped here.
merged_expression = expression_tables[0]
for expression_table in expression_tables[1:]:
    merged_expression = merged_expression.merge(expression_table, on='Gene name', how='outer')

# Fix the column order regardless of the column order inside the CSV files.
column_order = ['Gene name'] + [f'{neuron_type}_exp_level' for neuron_type in neuron_types]
ExpressionData_DopaminergicNeurons = merged_expression[column_order]

# Save the merged DataFrame next to the input files. The working directory is
# the data folder (the inputs above are read relative to it). index=True keeps
# the row index as the first CSV column, matching the previously written file.
ExpressionData_DopaminergicNeurons.to_csv('ExpressionData_DopaminergicNeurons.csv', index=True)

print(ExpressionData_DopaminergicNeurons.shape)
print(ExpressionData_DopaminergicNeurons.head())

(18475, 4)
  Gene name  ADE_exp_level  PDE_exp_level  CEP_exp_level
0    flp-33       98000.31       16699.05       1082.030
1    unc-54       77625.40       80205.59     133461.200
2  F09E10.7       59164.51      116352.90      84678.650
3     flp-9       40632.57         953.48        695.145
4   F59F3.6       27777.23       26628.37      16785.460


### 2.1.B - Read in and combine CeNGEN data into one dataframe (for all neurons with dopamine-receptors)

### 2.2 - Calculate the mean and standard deviation of gene expression within each neuron type, then calculate a t-score, (expression level - mean) / standard deviation, for each gene in that neuron type

In [22]:
# Calculate t-score for each gene in each neuron type
t_scores = {}
for neuron_type in ['ADE', 'PDE', 'CEP']:
    # Calculate mean expression level and standard deviation for the current neuron type
    mean_expression = ExpressionData_DopaminergicNeurons[f'{neuron_type}_exp_level'].mean()
    sd_expression = ExpressionData_DopaminergicNeurons[f'{neuron_type}_exp_level'].std()

    # Initialize list to store t-scores
    t_scores[neuron_type] = []
    
    # Calculate t-score for each gene in the current neuron type
    for index, row in ExpressionData_DopaminergicNeurons.iterrows():
        t_score = (row[f'{neuron_type}_exp_level'] - mean_expression) / sd_expression
        t_scores[neuron_type].append(t_score)

# Create a new DataFrame with gene names and corresponding t-scores for each neuron type
ExpressionData_DopaminergicNeurons_T = pd.DataFrame({'Gene name': ExpressionData_DopaminergicNeurons['Gene name']})
for neuron_type in ['ADE', 'PDE', 'CEP']:
    ExpressionData_DopaminergicNeurons_T[f'{neuron_type}_t_score'] = t_scores[neuron_type]

print(ExpressionData_DopaminergicNeurons_T.head())


  Gene name  ADE_t_score  PDE_t_score  CEP_t_score
0    flp-33    86.882787    14.631959     0.851163
1    unc-54    68.809301    70.458295   110.468799
2  F09E10.7    52.433639   102.234107    70.074016
3     flp-9    35.994952     0.790589     0.530800
4   F59F3.6    24.591672    23.360470    13.854515


### 2.2.2 (Maybe 3.X) - Standardize t-scores (just as phenotype t-scores were)

## 3 - Combine DataFrames

### 3.X - Use pd.DataFrame.merge to combine gene expression and phenotype dataframes only for dopaminergic neurons

### 3.X - Use pd.DataFrame.merge to combine gene expression and phenotype dataframes for dopamine-receptor-expressing neurons

### 3.X - Use Pandas xlookup equivalent to sort combined DataFrame by gene name

### 3.X - Delete rows that lack either gene expression data or phenotype data