## Data Cleaner Script for WGCNA Application

### Install Packages and Modules

In [1]:
%%sh
pip install gffutils # https://pythonhosted.org/gffutils/contents.html
pip install pyfaidx # https://pythonhosted.org/pyfaidx/
pip install biopython


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: pip install --upgrade pip


#### Import modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import gffutils
import pyfaidx

#### Python Analysis

In [3]:
# Notebook specific behavior setup
from IPython.core.interactiveshell import InteractiveShell
# "all": display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show at most 10 columns when a DataFrame is displayed; wider frames are elided with "...".
pd.options.display.max_columns = 10

# Render matplotlib figures inline, as SVG, using the "ggplot" style sheet.
%matplotlib inline 
%config InlineBackend.figure_format = 'svg'
plt.style.use("ggplot")

#### Convert xlsx to csv

In [4]:
# Read the Excel workbook and write it back out as CSV (header row kept, no row-index column).
read_file = pd.read_excel (r'NC_Seed.xlsx')
read_file.to_csv (r'NC_Seed.csv', index = None, header=True)

#### Load Data

In [5]:
# Path of the CSV file that is loaded into the working dataframe.
data = "NC_Seed.csv"

In [6]:
# Create a first, general dataframe common to every strategy so that it can be cleaned
# and checked before a separate dataframe is created for each strategy.
df = pd.read_csv(data)
df

Unnamed: 0.1,Unnamed: 0,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,PR_LSI3,M_LSI1,M_LSI2,M_HSI2,M_HSI3
0,Vitvi01g01505,97.600679,128.660895,86.693475,95.578843,...,117.701408,101.980749,0.000000,55.602018,94.693573
1,Vitvi01g01506,36.871367,42.476563,47.092752,37.689595,...,43.061491,61.188449,0.000000,31.388236,53.265135
2,Vitvi06g01795,0.000000,0.000000,0.000000,0.739012,...,2.870766,0.000000,0.000000,1.345210,0.000000
3,Vitvi01g01502,201.708069,261.015404,517.485125,487.501367,...,215.307453,326.338397,150.501312,137.211431,345.236985
4,Vitvi04g01846,6.506712,1.846807,13.913768,15.765582,...,7.176915,0.000000,0.000000,78.470590,13.809479
...,...,...,...,...,...,...,...,...,...,...,...
26887,Vitvi07g02900,0.000000,0.000000,1.605435,0.000000,...,5.741532,0.000000,0.000000,1.345210,7.891131
26888,Vitvi07g02901,0.000000,1.846807,0.000000,0.492674,...,0.000000,0.000000,0.000000,0.000000,0.000000
26889,Vitvi07g02904,0.000000,0.000000,0.000000,0.985349,...,0.000000,0.000000,0.000000,1.345210,0.000000
26890,Vitvi06g00228,28.195752,18.468071,24.616666,15.765582,...,17.224596,0.000000,0.000000,9.416471,0.000000


In [7]:
# Check the non-null count and dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26892 entries, 0 to 26891
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  26892 non-null  object 
 1   BM2_LSI1    26892 non-null  float64
 2   BM2_LSI2    26892 non-null  float64
 3   BM2_LSI3    26892 non-null  float64
 4   BM2_HSI1    26892 non-null  float64
 5   BM2_HSI2    26892 non-null  float64
 6   BM2_HSI3    26892 non-null  float64
 7   BM3_LSI1    26892 non-null  float64
 8   BM3_LSI2    26892 non-null  float64
 9   BM3_LSI3    26892 non-null  float64
 10  BM3_HSI1    26892 non-null  float64
 11  BM3_HSI2    26892 non-null  float64
 12  BM3_HSI3    26892 non-null  float64
 13  R1_LSI1     26892 non-null  float64
 14  R1_LSI2     26892 non-null  float64
 15  R1_HSI1     26892 non-null  float64
 16  R1_HSI2     26892 non-null  float64
 17  R1_HSI3     26892 non-null  float64
 18  R2_LSI1     26892 non-null  float64
 19  R2_LSI2     26892 non-nul

In [9]:
# Give the first column (loaded as 'Unnamed: 0') a meaningful name.
# Rebinding `df` to the renamed frame keeps the cell safe to re-run:
# once the column is called 'Gene ID' the mapping simply matches nothing.
df = df.rename(columns={'Unnamed: 0': 'Gene ID'})
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26892 entries, 0 to 26891
Data columns (total 33 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Gene ID   26892 non-null  object 
 1   BM2_LSI1  26892 non-null  float64
 2   BM2_LSI2  26892 non-null  float64
 3   BM2_LSI3  26892 non-null  float64
 4   BM2_HSI1  26892 non-null  float64
 5   BM2_HSI2  26892 non-null  float64
 6   BM2_HSI3  26892 non-null  float64
 7   BM3_LSI1  26892 non-null  float64
 8   BM3_LSI2  26892 non-null  float64
 9   BM3_LSI3  26892 non-null  float64
 10  BM3_HSI1  26892 non-null  float64
 11  BM3_HSI2  26892 non-null  float64
 12  BM3_HSI3  26892 non-null  float64
 13  R1_LSI1   26892 non-null  float64
 14  R1_LSI2   26892 non-null  float64
 15  R1_HSI1   26892 non-null  float64
 16  R1_HSI2   26892 non-null  float64
 17  R1_HSI3   26892 non-null  float64
 18  R2_LSI1   26892 non-null  float64
 19  R2_LSI2   26892 non-null  float64
 20  R2_LSI3   26892 non-null  fl

Unnamed: 0,Gene ID,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,PR_LSI3,M_LSI1,M_LSI2,M_HSI2,M_HSI3
0,Vitvi01g01505,97.600679,128.660895,86.693475,95.578843,...,117.701408,101.980749,0.000000,55.602018,94.693573
1,Vitvi01g01506,36.871367,42.476563,47.092752,37.689595,...,43.061491,61.188449,0.000000,31.388236,53.265135
2,Vitvi06g01795,0.000000,0.000000,0.000000,0.739012,...,2.870766,0.000000,0.000000,1.345210,0.000000
3,Vitvi01g01502,201.708069,261.015404,517.485125,487.501367,...,215.307453,326.338397,150.501312,137.211431,345.236985
4,Vitvi04g01846,6.506712,1.846807,13.913768,15.765582,...,7.176915,0.000000,0.000000,78.470590,13.809479
...,...,...,...,...,...,...,...,...,...,...,...
26887,Vitvi07g02900,0.000000,0.000000,1.605435,0.000000,...,5.741532,0.000000,0.000000,1.345210,7.891131
26888,Vitvi07g02901,0.000000,1.846807,0.000000,0.492674,...,0.000000,0.000000,0.000000,0.000000,0.000000
26889,Vitvi07g02904,0.000000,0.000000,0.000000,0.985349,...,0.000000,0.000000,0.000000,1.345210,0.000000
26890,Vitvi06g00228,28.195752,18.468071,24.616666,15.765582,...,17.224596,0.000000,0.000000,9.416471,0.000000


In [11]:
# Reorder the columns: gene identifier first, then the samples grouped by condition.
column_order = [
    'Gene ID',
    'BM2_LSI1', 'BM2_LSI2', 'BM2_LSI3', 'BM2_HSI1', 'BM2_HSI2', 'BM2_HSI3',
    'BM3_LSI1', 'BM3_LSI2', 'BM3_LSI3', 'BM3_HSI1', 'BM3_HSI2', 'BM3_HSI3',
    'R2_LSI1', 'R2_LSI2', 'R2_LSI3', 'R2_HSI1', 'R2_HSI2', 'R2_HSI3',
    'R1_LSI1', 'R1_LSI2', 'R1_HSI1', 'R1_HSI2', 'R1_HSI3',
    'PR_HSI1', 'PR_HSI2', 'PR_LSI2', 'PR_LSI1', 'PR_LSI3',
    'M_LSI1', 'M_LSI2', 'M_HSI2', 'M_HSI3',
]
# Selecting with a list of labels yields a new frame whose columns follow that list.
df = df.loc[:, column_order]
df.info()





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26892 entries, 0 to 26891
Data columns (total 33 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Gene ID   26892 non-null  object 
 1   BM2_LSI1  26892 non-null  float64
 2   BM2_LSI2  26892 non-null  float64
 3   BM2_LSI3  26892 non-null  float64
 4   BM2_HSI1  26892 non-null  float64
 5   BM2_HSI2  26892 non-null  float64
 6   BM2_HSI3  26892 non-null  float64
 7   BM3_LSI1  26892 non-null  float64
 8   BM3_LSI2  26892 non-null  float64
 9   BM3_LSI3  26892 non-null  float64
 10  BM3_HSI1  26892 non-null  float64
 11  BM3_HSI2  26892 non-null  float64
 12  BM3_HSI3  26892 non-null  float64
 13  R2_LSI1   26892 non-null  float64
 14  R2_LSI2   26892 non-null  float64
 15  R2_LSI3   26892 non-null  float64
 16  R2_HSI1   26892 non-null  float64
 17  R2_HSI2   26892 non-null  float64
 18  R2_HSI3   26892 non-null  float64
 19  R1_LSI1   26892 non-null  float64
 20  R1_LSI2   26892 non-null  fl

In [14]:
# Expression filter: split the genes in `df` into a "passed" and a "declined" table.
#
# A gene passes only when
#   * every sample value is non-zero, and
#   * for every replicate group (same condition and stage, e.g. "BM2_LSI",
#     "R1_HSI", "M_LSI") the replicate values sum to more than MIN_GROUP_SUM.
#
# The whole table is evaluated with one boolean mask instead of dropping rows one
# at a time, which copied the full frame once per gene.

# Minimum sum of the replicates of one group.
# NOTE(review): the value equals log2(26) to 8 decimals; its derivation is not
# documented in this notebook - confirm that this is the intended cut-off and scale.
MIN_GROUP_SUM = 4.70043972


def split_by_expression(expr, threshold=MIN_GROUP_SUM):
    """Partition an expression table into rows that pass / fail the filter.

    Parameters
    ----------
    expr : pandas.DataFrame
        Column 0 holds the gene identifier; every other column is one sample,
        named ``<condition>_<stage><replicate number>`` (e.g. ``BM2_LSI1``).
    threshold : float, optional
        Value that the summed replicates of every group must exceed.

    Returns
    -------
    (passed, declined) : tuple of pandas.DataFrame
        Two disjoint subsets of ``expr`` that together contain every row.
        Both keep the original row labels and column order.
    """
    sample_cols = expr.columns[1:]  # column 0 is the gene identifier
    values = expr[sample_cols]

    # Group replicate columns by their name without the trailing replicate
    # number, so the grouping does not depend on the column order.
    replicate_groups = {}
    for col in sample_cols:
        replicate_groups.setdefault(col.rstrip("0123456789"), []).append(col)

    # Requirement 1: no zero in any sample of the gene.
    keep = values.ne(0).all(axis=1)

    # Requirement 2: every replicate group sums above the threshold.
    # skipna=False makes a missing value fail the comparison (NaN > x is False)
    # instead of being silently counted as 0.
    for cols in replicate_groups.values():
        keep &= values[cols].sum(axis=1, skipna=False).gt(threshold)

    return expr[keep], expr[~keep]


df_passed, df_declined = split_by_expression(df)

df_passed.shape
df_declined.shape
df.shape

(9693, 33)

(17199, 33)

(26892, 33)

In [15]:
# Preview the genes that passed the filter.
df_passed.head()
# True if the table contains at least one value equal to 0, False otherwise.
df_passed.eq(0).any().any()
df_declined.eq(0).any().any()

Unnamed: 0,Gene ID,BM2_LSI1,BM2_LSI2,BM2_LSI3,BM2_HSI1,...,PR_LSI3,M_LSI1,M_LSI2,M_HSI2,M_HSI3
3,Vitvi01g01502,201.708069,261.015404,517.485125,487.501367,...,215.307453,326.338397,150.501312,137.211431,345.236985
6,Vitvi13g01113,720.076117,630.376824,572.069906,631.608643,...,918.645132,938.222892,722.406297,866.315309,937.071817
12,Vitvi14g01910,863.223779,666.697363,632.006136,954.556744,...,219.613602,387.526847,361.203148,279.355299,230.815584
15,Vitvi04g00515,1437.98333,1529.771882,1667.511531,1808.361564,...,556.928611,428.319146,401.336832,529.115976,737.820757
17,Vitvi08g01599,620.306535,363.205396,486.446721,393.646884,...,655.970039,571.092195,551.838143,947.924723,824.623199


False

True

In [17]:
# Save both partitions as CSV files, without the row-index column.
for frame, outputfile in (
    (df_passed, 'NC_Seed_Passed.csv'),
    (df_declined, 'NC_Seed_Declined.csv'),
):
    frame.to_csv(outputfile, index=False)

In [18]:
# Convert each CSV written above into an Excel workbook (header row kept, no row-index column).
for csv_name, xlsx_name in (
    (r'NC_Seed_Passed.csv', r'NC_Seed_Passed.xlsx'),
    (r'NC_Seed_Declined.csv', r'NC_Seed_Declined.xlsx'),
):
    read_file = pd.read_csv(csv_name)
    read_file.to_excel(xlsx_name, index=None, header=True)

*Notebook Created By: Christian Mandelli, Oregon State University*