In [12]:
#Precision Recall f score 

###############################################
#Import Packages
###############################################
import pandas as pd
import numpy as np
from matplotlib import pyplot
import xlsxwriter
import os 
# NOTE(review): this changes into the directory the process is already in, so
# the pair is effectively a no-op. Relative paths in this notebook are
# resolved against this working directory.
cwd = os.getcwd()
os.chdir(cwd)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. This hides the warning only; it does not change what a chained
# assignment actually writes to.
pd.options.mode.chained_assignment = None

# Data sets to score. Every name yields one results workbook in the working
# directory and one meta file under ./data, so `results` and `meta` are
# guaranteed to line up index by index.
DATASET_NAMES = [
    "9_500_500",
    "9_750_250",
    "9_1000_0",
    "6_500_500",
    "6_750_250",
    "6_1000_0",
    "3_500_500",
    "3_750_250",
    "3_1000_0",
]

# os.path.join uses the platform's separator: on Windows this produces the
# same ".\name.xlsx" strings as before, and on POSIX it produces usable paths
# instead of file names that contain a literal backslash.
results = [os.path.join(".", name + ".xlsx") for name in DATASET_NAMES]

meta = [os.path.join(".", "data", name + "_meta.tsv") for name in DATASET_NAMES]

###############################################
###F1 score function (no precision-recall curve or AUC is computed here)
###############################################
#input: a = path of a results .xlsx file, b = path of the matching meta .tsv file
#output: F1 score
# Row bands in which a result may be called significant, keyed by the last
# three characters of the results file name (extension removed).
# Each value is ((first_up, last_up), (first_down, last_down) or None); the
# bounds are inclusive row labels of the results table.
_SIG_ROW_BANDS = {
    "0_0": ((0, 999), None),          # *_1000_0 files: one band, log2FoldChange >= 0
    "500": ((0, 499), (500, 999)),    # *_500_500 files
    "250": ((0, 749), (750, 999)),    # *_750_250 files
}


def func(a, b):
    """Return the F1 score between significance calls and the meta labels.

    The results table is read from ``a`` with ``pd.read_excel`` and the meta
    table from ``b`` as a tab-separated file. A results row is labelled 1 when
    ``padj <= 0.05`` and its ``log2FoldChange`` has the sign required by the
    row band it falls in (``>= 0`` in the first band, ``<= 0`` in the second,
    see ``_SIG_ROW_BANDS``); every other row is labelled 0. These labels are
    scored against the ``differential.expression`` column of the meta table.

    Parameters
    ----------
    a : str or os.PathLike
        Path of the results workbook. The last three characters of the file
        name without its extension select the row layout ("0_0", "500" or
        "250"). The table must provide ``padj``/``log2FoldChange`` columns or
        their aliases ``adj.P.Val``/``FDR`` and ``logFC``.
    b : str or os.PathLike
        Path of the tab-separated meta file; it must contain a
        ``differential.expression`` column.

    Returns
    -------
    The value returned by ``f1_score`` for the two label vectors.

    Raises
    ------
    ValueError
        If the file name of ``a`` does not end in a known layout suffix.
    """
    # Resolve the layout before touching the disk so a bad file name fails fast.
    # splitext (instead of a fixed [-8:-5] slice) keeps this correct for
    # extensions that are not exactly five characters long.
    suffix = os.path.splitext(os.fspath(a))[0][-3:]
    if suffix not in _SIG_ROW_BANDS:
        raise ValueError(
            f"cannot infer the up/down row layout from {a!r}: the file name "
            f"must end in one of {sorted(_SIG_ROW_BANDS)} before its extension"
        )
    up_band, down_band = _SIG_ROW_BANDS[suffix]

    # Step 1: read the input files and normalise the column names
    ###############################################
    results = pd.read_excel(a)
    meta = pd.read_csv(b, sep='\t')

    # One rename covers the aliases handled here: 'adj.P.Val' (VoomLimma files)
    # and 'FDR' (EdgeR files) become 'padj', 'logFC' becomes 'log2FoldChange'.
    results = results.rename(columns={
        'adj.P.Val': 'padj',
        'FDR': 'padj',
        'logFC': 'log2FoldChange',
    })

    # Step 2: assign the up/down regulation indicator
    ###############################################
    row_labels = results.index
    significant = results['padj'] <= 0.05
    fold_change = results['log2FoldChange']

    def in_band(band):
        first, last = band
        return (row_labels >= first) & (row_labels <= last)

    # Build the labels from one combined boolean mask. A chained
    # `frame.loc[rows].loc[mask, col] = 1` assigns into a temporary slice and
    # silently leaves the frame untouched when pandas uses copy-on-write.
    called = significant & (fold_change >= 0) & in_band(up_band)
    if down_band is not None:
        called = called | (significant & (fold_change <= 0) & in_band(down_band))

    # Plain integer 0/1 labels rather than an object-typed column.
    sig_labels = called.astype(int).to_numpy()

    # Step 3: score the calls against the meta labels
    ###############################################
    meta_labels = np.ravel(meta[['differential.expression']])

    # NOTE(review): f1_score's first argument is y_true and its second y_pred.
    # The meta column looks like the reference labelling, in which case the
    # two are passed in swapped roles. The argument order is kept as it was;
    # confirm the intended roles before reusing this call for precision or
    # recall.
    f1 = f1_score(sig_labels, meta_labels)

    return f1

###############################################
#generate list of results
###############################################
# Score every results file against the meta file stored at the same position
# and collect [results path, F1 score] pairs in input order.
resultsList = []
for i, path in enumerate(results):
    result = func(path, meta[i])
    resultsList.append([path, result])

###############################################
###Output results
###############################################
#Input: resultslist
#output: Results.xlsx

# Write one row per results file: the first column holds the file path, the
# second its F1 score. The context manager closes (and thereby saves) the
# workbook even if one of the writes raises.
with xlsxwriter.Workbook('Results.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    worksheet.set_column(0, 8, 16)  # width 16 for the first nine columns

    for i, (path, score) in enumerate(resultsList):
        worksheet.write(i, 0, path)
        worksheet.write(i, 1, score)

In [None]:
#accuracy
#error rate
