In [54]:
#guide https://www.statology.org/auc-in-python/
#Step 1: Import Packages
###############################################
import pandas as pd
import numpy as np
from matplotlib import pyplot
import xlsxwriter
import os 
cwd = os.getcwd()
os.chdir(cwd)
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc


results = [
    r".\9_500_500.xlsx",
    r".\9_750_250.xlsx",
    r".\9_1000_0.xlsx",
    r".\6_500_500.xlsx",
    r".\6_750_250.xlsx",
    r".\6_1000_0.xlsx",
    r".\3_500_500.xlsx",
    r".\3_750_250.xlsx",
    r".\3_1000_0.xlsx"   
]

meta = [
    r".\data\9_500_500_meta.tsv",
    r".\data\9_750_250_meta.tsv",
    r".\data\9_1000_0_meta.tsv",
    r".\data\6_500_500_meta.tsv",
    r".\data\6_750_250_meta.tsv",
    r".\data\6_1000_0_meta.tsv",
    r".\data\3_500_500_meta.tsv",
    r".\data\3_750_250_meta.tsv",
    r".\data\3_1000_0_meta.tsv"   
]

#input a=results b=meta
#output AUC score
def func(a, b):
    #Step 1: Input files and preprocess
    ###############################################    
    #Add new column and assign 1 or 0.
    results = pd.read_excel(a)
    meta = pd.read_csv(b, sep='\t')
    
    #for VoomLimma files
    results.rename(columns={'p.value': 'pvalue'}, inplace=True)
    results.rename(columns={'logFC': 'log2FoldChange'}, inplace=True)
    #for EdgeR files
    results.rename(columns={'PValue': 'pvalue'}, inplace=True)
    results.rename(columns={'logFC': 'log2FoldChange'}, inplace=True)
    
    ##############################################################################################
    #creates new column "sig"
    #uses pvalue and logfold change to assign 1 or 0
    #1000_1000 samples
    if a[-8:-5] == "0_0":
        results['sig']=''
        results.loc[0:999,].loc[(results.pvalue <= 0.05) & (results.log2FoldChange >= 0),'sig']=1
        results.loc[results.sig != 1,'sig']=0
    
    #500_500 samples
    if a[-8:-5] == "500":
        results['sig']=''
        results.loc[0:499,].loc[(results.pvalue <= 0.05) & (results.log2FoldChange >= 0),'sig']=1
        results.loc[500:999,].loc[(results.pvalue <= 0.05) & (results.log2FoldChange <= 0),'sig']=1
        results.loc[results.sig != 1,'sig']=0
        
    #750_250 samples
    if a[-8:-5] == "250":
        results['sig']=''
        results.loc[0:749,].loc[(results.pvalue <= 0.05) & (results.log2FoldChange >= 0),'sig']=1
        results.loc[750:999,].loc[(results.pvalue <= 0.05) & (results.log2FoldChange <= 0),'sig']=1
        results.loc[results.sig != 1,'sig']=0
        
#     print(results)
    ##############################################################################################
    
    #Step 2: Fit the Logistic Regression Model
    ###############################################
    #define the predictor variables and the response variable
    X = results[['sig']]
    y = np.ravel(meta[['differential.expression']])

    #split the dataset into training (70%) and testing (30%) sets
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

    #instantiate the model
    log_regression = LogisticRegression(solver='lbfgs')

    #fit the model using the training data
    log_regression.fit(X_train,y_train)
    
    # predict probabilities
    lr_probs = log_regression.predict_proba(X_test)
    
    # keep probabilities for the positive outcome only
    lr_probs = lr_probs[:, 1]

    #predict class values
    yhat = log_regression.predict(X_test)
    lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
    lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)

    #use model to predict probability that given y value is 1
    y_pred_proba = log_regression.predict_proba(X_test)[::,1]

    # summarize scores
    print(f'{a} Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
    
#     ### plot the precision-recall curves
#     no_skill = len(y_test[y_test==1]) / len(y_test)
#     pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
#     pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
#     ### axis labels
#     pyplot.xlabel('Recall')
#     pyplot.ylabel('Precision')
#     ### show the legend
#     pyplot.legend()
#     ### show the plot
#     pyplot.show()
    
    return lr_f1


#generate list of AUCs
resultsList = []
for i in range(9):
    
    result = func(results[i],meta[i])
    resultsList.append([results[i],result])
    
for i in resultsList:    
    print(i)
    
#write to xlsx file
#output: Results.xlsx

workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()
worksheet.set_column(0, 8, 16)

for i in range(len(resultsList)):
    worksheet.write(0,i, resultsList[i][0])
    worksheet.write(1,i, resultsList[i][1])
workbook.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\9_500_500.xlsx Logistic: f1=0.802 auc=0.851


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\9_750_250.xlsx Logistic: f1=0.822 auc=0.863


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\9_1000_0.xlsx Logistic: f1=0.817 auc=0.860


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\6_500_500.xlsx Logistic: f1=0.787 auc=0.841


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\6_750_250.xlsx Logistic: f1=0.759 auc=0.824


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\6_1000_0.xlsx Logistic: f1=0.792 auc=0.845


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\3_500_500.xlsx Logistic: f1=0.653 auc=0.767


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


.\3_750_250.xlsx Logistic: f1=0.593 auc=0.738
.\3_1000_0.xlsx Logistic: f1=0.643 auc=0.762
['.\\9_500_500.xlsx', 0.8024948024948024]
['.\\9_750_250.xlsx', 0.8220858895705522]
['.\\9_1000_0.xlsx', 0.8172484599589321]
['.\\6_500_500.xlsx', 0.7873684210526316]
['.\\6_750_250.xlsx', 0.7586206896551725]
['.\\6_1000_0.xlsx', 0.7924528301886793]
['.\\3_500_500.xlsx', 0.6526806526806527]
['.\\3_750_250.xlsx', 0.5931372549019608]
['.\\3_1000_0.xlsx', 0.6431924882629109]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
