## Data Cleaner Script for WGCNA Application

### Install Packages and Modules

In [2]:
%%sh
pip install gffutils # https://pythonhosted.org/gffutils/contents.html
pip install pyfaidx # https://pythonhosted.org/pyfaidx/
pip install biopython


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip


#### Import modules

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import gffutils
import pyfaidx

#### Python Analysis

In [4]:
# Notebook specific behavior setup
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Show at most 10 columns when a DataFrame is displayed; wider frames are truncated with "..."
pd.options.display.max_columns = 10

# Draw matplotlib figures inside the notebook, rendered as SVG, using the ggplot style sheet
%matplotlib inline 
%config InlineBackend.figure_format = 'svg'
plt.style.use("ggplot")

#### Convert xlsx to csv

In [5]:
# One-off conversion: load the Excel workbook, then save it as CSV
# (header row kept, no index column written)
read_file = pd.read_excel(r'NC_Skin.xlsx')
read_file.to_csv(
    r'NC_Skin.csv',
    header=True,
    index=None,
)

#### Load Data

In [6]:
# Path of the CSV file that pd.read_csv loads in the next cell
data = "NC_Skin.csv"

In [7]:
# Build one general dataframe shared by every strategy so it can be cleaned and checked first,
# before a separate dataframe is created for each strategy
df = pd.read_csv(data)
df

Unnamed: 0.1,Unnamed: 0,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,M_HSI2,M_HSI3,M_LSI1,M_LSI2,M_LSI3
0,Vitvi01g01505,149.727759,189.807879,287.808498,133.552004,...,91.333722,100.279880,107.574104,90.424714,116.289493
1,Vitvi01g01506,176.430799,166.694743,146.833101,185.785677,...,167.637844,235.843420,134.026752,249.219334,204.016655
2,Vitvi06g01795,0.000000,0.000000,0.781027,0.593564,...,1.156123,3.714070,5.290530,8.821923,0.000000
3,Vitvi01g01502,795.369115,727.013206,725.574206,722.961517,...,596.559500,633.248869,798.869982,626.356556,634.491796
4,Vitvi04g01846,2.861040,0.000000,7.029244,4.748516,...,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
26380,Vitvi07g02900,0.000000,47.627069,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000
26381,Vitvi07g02901,2.861040,0.000000,0.000000,5.342080,...,0.000000,0.000000,0.000000,0.000000,0.000000
26382,Vitvi07g02904,207.902239,219.924997,306.553149,292.627281,...,0.000000,0.000000,0.000000,0.000000,0.000000
26383,Vitvi06g00228,49.591360,39.222292,47.642655,42.143077,...,33.527569,53.854009,38.797218,30.876732,24.481999


In [8]:
# Summarise the frame: non-null count and dtype of every column (reveals missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26385 entries, 0 to 26384
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  26385 non-null  object 
 1   BM2_LSI1    26385 non-null  float64
 2   BM2_LSI2    26385 non-null  float64
 3   BM2_LSI3    26385 non-null  float64
 4   BM2_HSI1    26385 non-null  float64
 5   BM2_HSI2    26385 non-null  float64
 6   BM2_HSI3    26385 non-null  float64
 7   BM3_LSI1    26385 non-null  float64
 8   BM3_LSI2    26385 non-null  float64
 9   BM3_LSI3    26385 non-null  float64
 10  BM3_HSI1    26385 non-null  float64
 11  BM3_HSI2    26385 non-null  float64
 12  BM3_HSI3    26385 non-null  float64
 13  R1_LSI1     26385 non-null  float64
 14  R1_LSI2     26385 non-null  float64
 15  R1_LSI3     26385 non-null  float64
 16  R1_HSI1     26385 non-null  float64
 17  R1_HSI2     26385 non-null  float64
 18  R1_HSI3     26385 non-null  float64
 19  R2_LSI1     26385 non-nul

In [9]:
# Give the unnamed first column (the gene identifiers) an explicit name
gene_id_label = {'Unnamed: 0': 'Gene ID'}
df = df.rename(columns=gene_id_label)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26385 entries, 0 to 26384
Data columns (total 36 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Gene ID   26385 non-null  object 
 1   BM2_LSI1  26385 non-null  float64
 2   BM2_LSI2  26385 non-null  float64
 3   BM2_LSI3  26385 non-null  float64
 4   BM2_HSI1  26385 non-null  float64
 5   BM2_HSI2  26385 non-null  float64
 6   BM2_HSI3  26385 non-null  float64
 7   BM3_LSI1  26385 non-null  float64
 8   BM3_LSI2  26385 non-null  float64
 9   BM3_LSI3  26385 non-null  float64
 10  BM3_HSI1  26385 non-null  float64
 11  BM3_HSI2  26385 non-null  float64
 12  BM3_HSI3  26385 non-null  float64
 13  R1_LSI1   26385 non-null  float64
 14  R1_LSI2   26385 non-null  float64
 15  R1_LSI3   26385 non-null  float64
 16  R1_HSI1   26385 non-null  float64
 17  R1_HSI2   26385 non-null  float64
 18  R1_HSI3   26385 non-null  float64
 19  R2_LSI1   26385 non-null  float64
 20  R2_LSI2   26385 non-null  fl

Unnamed: 0,Gene ID,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,M_HSI2,M_HSI3,M_LSI1,M_LSI2,M_LSI3
0,Vitvi01g01505,149.727759,189.807879,287.808498,133.552004,...,91.333722,100.279880,107.574104,90.424714,116.289493
1,Vitvi01g01506,176.430799,166.694743,146.833101,185.785677,...,167.637844,235.843420,134.026752,249.219334,204.016655
2,Vitvi06g01795,0.000000,0.000000,0.781027,0.593564,...,1.156123,3.714070,5.290530,8.821923,0.000000
3,Vitvi01g01502,795.369115,727.013206,725.574206,722.961517,...,596.559500,633.248869,798.869982,626.356556,634.491796
4,Vitvi04g01846,2.861040,0.000000,7.029244,4.748516,...,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
26380,Vitvi07g02900,0.000000,47.627069,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000
26381,Vitvi07g02901,2.861040,0.000000,0.000000,5.342080,...,0.000000,0.000000,0.000000,0.000000,0.000000
26382,Vitvi07g02904,207.902239,219.924997,306.553149,292.627281,...,0.000000,0.000000,0.000000,0.000000,0.000000
26383,Vitvi06g00228,49.591360,39.222292,47.642655,42.143077,...,33.527569,53.854009,38.797218,30.876732,24.481999


In [10]:
# Reorder columns: gene identifier first, then the sample columns grouped by prefix
column_order = [
    'Gene ID',
    'BM2_LSI1', 'BM2_LSI2', 'BM2_LSI3', 'BM2_HSI1', 'BM2_HSI2', 'BM2_HSI3',
    'BM3_LSI1', 'BM3_LSI2', 'BM3_LSI3', 'BM3_HSI1', 'BM3_HSI2', 'BM3_HSI3',
    'R1_LSI1', 'R1_LSI2', 'R1_LSI3', 'R1_HSI1', 'R1_HSI2', 'R1_HSI3',
    'M_HSI1', 'M_HSI2', 'M_HSI3', 'M_LSI1', 'M_LSI2', 'M_LSI3',
    'R2_LSI1', 'R2_LSI2', 'R2_LSI3', 'R2_HSI1', 'R2_HSI2',
    'PR_HSI1', 'PR_HSI2', 'PR_HSI3', 'PR_HSI4', 'PR_LSI1', 'PR_LSI2',
]
df = pd.DataFrame(df[column_order])
df.info()





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26385 entries, 0 to 26384
Data columns (total 36 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Gene ID   26385 non-null  object 
 1   BM2_LSI1  26385 non-null  float64
 2   BM2_LSI2  26385 non-null  float64
 3   BM2_LSI3  26385 non-null  float64
 4   BM2_HSI1  26385 non-null  float64
 5   BM2_HSI2  26385 non-null  float64
 6   BM2_HSI3  26385 non-null  float64
 7   BM3_LSI1  26385 non-null  float64
 8   BM3_LSI2  26385 non-null  float64
 9   BM3_LSI3  26385 non-null  float64
 10  BM3_HSI1  26385 non-null  float64
 11  BM3_HSI2  26385 non-null  float64
 12  BM3_HSI3  26385 non-null  float64
 13  R1_LSI1   26385 non-null  float64
 14  R1_LSI2   26385 non-null  float64
 15  R1_LSI3   26385 non-null  float64
 16  R1_HSI1   26385 non-null  float64
 17  R1_HSI2   26385 non-null  float64
 18  R1_HSI3   26385 non-null  float64
 19  M_HSI1    26385 non-null  float64
 20  M_HSI2    26385 non-null  fl

In [13]:
# Threshold that the summed values of each replicate group must exceed.
# NOTE(review): 4.70043972 is numerically close to log2(26) -- confirm the intended meaning.
MIN_GROUP_SUM = 4.70043972

# Positional layout of the sample columns of `df` (column 0 holds 'Gene ID'), given as
# (first column position, replicates in the first group, replicates in the second group).
# The positions rely on the column order set by the "reorder columns" cell.
CONDITION_LAYOUT = (
    (1, 3, 3),   # BM2_LSI1-3, BM2_HSI1-3
    (7, 3, 3),   # BM3_LSI1-3, BM3_HSI1-3
    (13, 3, 3),  # R1_LSI1-3,  R1_HSI1-3
    (19, 3, 3),  # M_HSI1-3,   M_LSI1-3
    (25, 3, 2),  # R2_LSI1-3,  R2_HSI1-2
    (30, 4, 2),  # PR_HSI1-4,  PR_LSI1-2
)


def _ordered_sum(block):
    """Add the columns of a 2-D array strictly left to right.

    Plain left-to-right addition reproduces the floating-point result of the
    scalar expression ``a + b + c`` exactly, so values that sit close to the
    threshold are classified the same way as by a row-by-row loop.
    """
    total = block[:, 0]
    for column in range(1, block.shape[1]):
        total = total + block[:, column]
    return total


def expression_filter_mask(frame, layout=CONDITION_LAYOUT, min_sum=MIN_GROUP_SUM):
    """Return a boolean array marking the rows of `frame` that pass the filter.

    A row passes when, for every entry of `layout`, all of that entry's
    columns are non-zero and the sum of each of its two replicate groups is
    strictly greater than `min_sum`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose numeric sample columns sit at the positions given in `layout`.
    layout : iterable of (start, n_first, n_second)
        Column position of the first replicate and the sizes of the two
        consecutive replicate groups.
    min_sum : float
        Exclusive lower bound for the sum of each replicate group.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row of `frame`.
    """
    keep = np.ones(len(frame), dtype=bool)
    for start, n_first, n_second in layout:
        block = frame.iloc[:, start:start + n_first + n_second].to_numpy(dtype=float)
        keep &= (block != 0).all(axis=1)
        keep &= _ordered_sum(block[:, :n_first]) > min_sum
        keep &= _ordered_sum(block[:, n_first:]) > min_sum
    return keep


# One vectorised pass replaces the per-row DataFrame.drop calls, each of which
# copied the whole frame (quadratic in the number of genes).
passes_filter = expression_filter_mask(df)
df_passed = df.loc[passes_filter]
df_declined = df.loc[~passes_filter]

df_passed.shape
df_declined.shape
df.shape

(14886, 36)

(11499, 36)

(26385, 36)

In [14]:
# Preview the retained genes, then report for each subset whether any cell is exactly zero
df_passed.head()
(df_passed == 0).any().any()
(df_declined == 0).any().any()

Unnamed: 0,Gene ID,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,PR_HSI2,PR_HSI3,PR_HSI4,PR_LSI1,PR_LSI2
0,Vitvi01g01505,149.727759,189.807879,287.808498,133.552004,...,86.267777,96.269348,72.378576,107.600259,84.33166
1,Vitvi01g01506,176.430799,166.694743,146.833101,185.785677,...,151.804538,140.138924,146.365565,155.422596,161.059647
3,Vitvi01g01502,795.369115,727.013206,725.574206,722.961517,...,583.812167,311.352131,477.162465,560.584064,560.598169
6,Vitvi13g01113,997.549274,916.120687,877.874496,971.071462,...,1108.774998,1286.8409,1140.364678,1200.872023,1242.855127
7,Vitvi13g01110,522.616637,561.719259,515.087394,551.421387,...,424.651462,449.053856,391.380449,323.686375,313.824376


False

True

In [15]:
# Save both subsets as CSV files without the index column
for outputfile, subset in (
    ('NC_Skin_Passed.csv', df_passed),
    ('NC_Skin_Declined.csv', df_declined),
):
    subset.to_csv(outputfile, index=False)

In [16]:
# Re-read each CSV written above and save it as an Excel workbook (header kept, no index column)
for csv_name, xlsx_name in (
    (r'NC_Skin_Passed.csv', r'NC_Skin_Passed.xlsx'),
    (r'NC_Skin_Declined.csv', r'NC_Skin_Declined.xlsx'),
):
    read_file = pd.read_csv(csv_name)
    read_file.to_excel(xlsx_name, index=None, header=True)

*Notebook Created By: Christian Mandelli, Oregon State University*