# In [None]:
#guide https://www.statology.org/auc-in-python/
#Step 1: Import Packages
###############################################
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import xlsxwriter
import os 
# Record the current working directory; changing into it leaves the
# working directory unchanged, so the relative paths below resolve against it.
cwd = os.getcwd()
os.chdir(cwd)

# Dataset stems in processing order. Both path lists are derived from this
# single list so that results[i] and meta[i] always refer to the same dataset.
_stems = [
    "{}_{}".format(group, split)
    for group in (9, 6, 3)
    for split in ("500_500", "750_250", "1000_0")
]

# Result workbooks, located in the working directory.
results = [".\\" + stem + ".xlsx" for stem in _stems]

# Matching metadata tables, located in the .\data sub-directory.
meta = [".\\data\\" + stem + "_meta.tsv" for stem in _stems]

#input a=results b=meta
#output AUC score
def func(a, b):
    """Return the hold-out AUC of a logistic regression fitted on p-value calls.

    Each row of the results table is flagged as significant (1) when its
    p-value is <= 0.05 and as not significant (0) otherwise. That single
    binary flag is the predictor; the ``differential.expression`` column of
    the metadata table is the response.

    Parameters
    ----------
    a : path to the results workbook, read with ``pd.read_excel``. It must
        contain a p-value column named ``pvalue``, ``p.value`` or ``PValue``.
    b : path to the tab-separated metadata file, read with ``pd.read_csv``.

    Returns
    -------
    The value of ``metrics.roc_auc_score`` on the 30 % test split.

    Raises
    ------
    KeyError
        If the results table has no recognised p-value column.

    NOTE(review): rows of the two tables are paired purely by position;
    this assumes both files list the same items in the same order - confirm.
    """
    #Step 1: Input files and preprocess
    ###############################################
    results = pd.read_excel(a)
    meta = pd.read_csv(b, sep='\t')

    #unify the p-value column name:
    #'p.value' for VoomLimma files, 'PValue' for EdgeR files
    results = results.rename(columns={'p.value': 'pvalue', 'PValue': 'pvalue'})
    if 'pvalue' not in results.columns:
        raise KeyError(
            "no p-value column ('pvalue', 'p.value' or 'PValue') in " + repr(a)
        )

    #creates new column "sig": 1 where pvalue <= 0.05, otherwise 0.
    #Missing p-values compare as False and therefore become 0.
    #The column is built as integers directly; seeding it with '' and then
    #assigning numbers produced a mixed object column and is rejected by
    #pandas versions that infer a dedicated string dtype for ''.
    results['sig'] = (results['pvalue'] <= 0.05).astype(int)

    #Step 2: Fit the Logistic Regression Model
    ###############################################
    #define the predictor variables and the response variable
    X = results[['sig']]
    y = meta['differential.expression'].to_numpy()

    #split the dataset into training (70%) and testing (30%) sets;
    #the fixed random_state keeps the split reproducible between runs
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    #instantiate the model and fit it using the training data
    log_regression = LogisticRegression(solver='lbfgs')
    log_regression.fit(X_train, y_train)

    #Step 3: Calculate the AUC
    ###############################################
    #use model to predict probability that given y value is 1
    y_pred_proba = log_regression.predict_proba(X_test)[:, 1]

    #calculate AUC of model on the test split
    return metrics.roc_auc_score(y_test, y_pred_proba)

#generate list of AUCs
#each entry is [results path, AUC]; results[idx] is paired with meta[idx]
resultsList = []
for idx, results_path in enumerate(results):
    auc_score = func(results_path, meta[idx])
    resultsList.append([results_path, auc_score])

#show one [path, AUC] pair per line
for entry in resultsList:
    print(entry)
    
#write to xlsx file
#output: Results.xlsx
#layout: one column per dataset; row 0 holds the results path, row 1 its AUC

#the with-block closes the workbook even if one of the writes raises
with xlsxwriter.Workbook('Results.xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    #size the columns from the data instead of assuming exactly nine results
    if resultsList:
        worksheet.set_column(0, len(resultsList) - 1, 16)

    for col, (path, score) in enumerate(resultsList):
        worksheet.write(0, col, path)
        worksheet.write(1, col, score)