This notebook contains the initial trimming of the EDTA -/+ 03JUN2022 (26APR22 - RUN) elut files

In [1]:
from matplotlib.pyplot import figure, show, cm

import pandas as pd
import numpy as np
import itertools as it
import scipy
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy import stats
import re
import scipy.cluster.hierarchy as sch

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
mpl.rc('pdf', fonttype=42)
import seaborn as sns

import random

from collections import Counter
import operator

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve

Because a fraction did not filter well, fraction 33 (C9) was not run in the control (EDTA-) condition. The matching fraction therefore needs to be removed from the treatment (EDTA+) condition so that the two DFs have the same number of fraction columns, which is required in order to run analyses using both DFs.

In [2]:
#sfisch6: Load the control (EDTA-) and treatment (EDTA+) elut files into data frames
# read_table splits on tabs by default; the first column of each file becomes the row index.

EDTA_minus_df = pd.read_table(
    '/stor/MS/processed/UIC_core/QEHF/MSC977/HEK293_EDTA_minus_SEC_control/reviewed/HEK293_EDTA_minus_SEC_control_20220626.elut',
    index_col=0,
)
EDTA_plus_df = pd.read_table(
    '/stor/MS/processed/UIC_core/QEHF/MSC977/HEK293_EDTA_plus_SEC_treatment/reviewed/HEK293_EDTA_plus_SEC_treatment_20220626.elut',
    index_col=0,
)

Looking at the DFs

In [4]:
# Preview the first five rows of the control (EDTA-) frame.
EDTA_minus_df.head(n=5)

Unnamed: 0,HEK293_EDTA_minus_SEC_control_12a_20220603,HEK293_EDTA_minus_SEC_control_13a_20220603,HEK293_EDTA_minus_SEC_control_14a_20220603,HEK293_EDTA_minus_SEC_control_15a_20220603,HEK293_EDTA_minus_SEC_control_16a_20220603,HEK293_EDTA_minus_SEC_control_17a_20220603,HEK293_EDTA_minus_SEC_control_18a_20220603,HEK293_EDTA_minus_SEC_control_19a_20220603,HEK293_EDTA_minus_SEC_control_20a_20220603,HEK293_EDTA_minus_SEC_control_21a_20220603,...,HEK293_EDTA_minus_SEC_control_41a_20220603,HEK293_EDTA_minus_SEC_control_42a_20220603,HEK293_EDTA_minus_SEC_control_43a_20220603,HEK293_EDTA_minus_SEC_control_44a_20220603,HEK293_EDTA_minus_SEC_control_45a_20220603,HEK293_EDTA_minus_SEC_control_46a_20220603,HEK293_EDTA_minus_SEC_control_47a_20220603,HEK293_EDTA_minus_SEC_control_48a_20220603,HEK293_EDTA_minus_SEC_control_49a_20220603,HEK293_EDTA_minus_SEC_control_50a_20220603
A0A024RBG1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6I1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6I4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6Q5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6S4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
# Preview the first five rows of the treatment (EDTA+) frame.
EDTA_plus_df.head(n=5)

Unnamed: 0,HEK293_EDTA_plus_SEC_treatment_12a_20220603,HEK293_EDTA_plus_SEC_treatment_13a_20220603,HEK293_EDTA_plus_SEC_treatment_14a_20220603,HEK293_EDTA_plus_SEC_treatment_15a_20220603,HEK293_EDTA_plus_SEC_treatment_16a_20220603,HEK293_EDTA_plus_SEC_treatment_17a_20220603,HEK293_EDTA_plus_SEC_treatment_18a_20220603,HEK293_EDTA_plus_SEC_treatment_19a_20220603,HEK293_EDTA_plus_SEC_treatment_20a_20220603,HEK293_EDTA_plus_SEC_treatment_21a_20220603,...,HEK293_EDTA_plus_SEC_treatment_41a_20220603,HEK293_EDTA_plus_SEC_treatment_42a_20220603,HEK293_EDTA_plus_SEC_treatment_43a_20220603,HEK293_EDTA_plus_SEC_treatment_44a_20220603,HEK293_EDTA_plus_SEC_treatment_45a_20220603,HEK293_EDTA_plus_SEC_treatment_46a_20220603,HEK293_EDTA_plus_SEC_treatment_47a_20220603,HEK293_EDTA_plus_SEC_treatment_48a_20220603,HEK293_EDTA_plus_SEC_treatment_49a_20220603,HEK293_EDTA_plus_SEC_treatment_50a_20220603
A0A024RBG1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6N4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A075B6T7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A087WSZ9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
#sfisch6: Making sure I have the correct column location; .iloc indexes as [rows, columns]
# Position 21 (0-based) is the column that a later cell removes; the Name shown in the
# output lets the position be checked against the intended fraction.
EDTA_plus_df.iloc[:, 21]

A0A024RBG1    0.0
A0A075B6N4    0.0
A0A075B6S4    0.0
A0A075B6T7    0.0
A0A087WSZ9    0.0
             ... 
Q9Y6X8        0.0
Q9Y6X9        0.0
Q9Y6Y0        0.0
Q9Y6Y1        0.0
Q9Y6Y8        2.0
Name: HEK293_EDTA_plus_SEC_treatment_33a_20220603, Length: 11866, dtype: float64

In [9]:
# Show the label of the column at position 21 to confirm it is the fraction 33 column.
EDTA_plus_df.columns[21]

'HEK293_EDTA_plus_SEC_treatment_33a_20220603'

In [10]:
# Remove fraction 33 (C9) from the treatment (EDTA+) frame so that it has the same
# fraction columns as the control (EDTA-) frame.
# The column is dropped by label rather than by position (21): a positional drop would
# silently remove the wrong fraction if the input's column order or count ever changed,
# whereas a label-based drop raises a KeyError when the expected column is missing.
# drop() returns a new frame, so EDTA_plus_df itself is left untouched.
EDTA_plus_trimmed_df = EDTA_plus_df.drop(
    columns=['HEK293_EDTA_plus_SEC_treatment_33a_20220603']
)

In [12]:
# After the drop, position 21 should hold the column that followed the removed one (34a).
EDTA_plus_trimmed_df.columns[21]

'HEK293_EDTA_plus_SEC_treatment_34a_20220603'

In [13]:
# Position 20 should be unchanged by the drop and still hold the column before it (32a).
EDTA_plus_trimmed_df.columns[20]

'HEK293_EDTA_plus_SEC_treatment_32a_20220603'

In [15]:
#sfisch6: Write the trimmed data frame to a tab-separated .elut file so it can be imported later
# The file name is relative, so it is written to the current working directory.

EDTA_plus_trimmed_df.to_csv(
    path_or_buf="HEK293_EDTA_plus_SEC_treatment_20220626_trimmed.elut",
    sep="\t",
)

In [16]:
# Number of fraction columns left in the trimmed treatment (EDTA+) frame.
EDTA_plus_trimmed_df.shape[1]

38

In [17]:
# Number of fraction columns in the control (EDTA-) frame, for comparison with the trimmed treatment frame.
EDTA_minus_df.shape[1]

38

In [None]:
#sfisch6: TODO - still need to upload some of the other control/0M replicates:
#sfisch6: the 0M urea replicate, the phospho replicates, and RNA diffrac (-)