In [10]:
#Precision Recall f score 

###############################################
#Import Packages
###############################################
import pandas as pd
import numpy as np
from matplotlib import pyplot
import xlsxwriter
import os 
cwd = os.getcwd()
os.chdir(cwd)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
pd.options.mode.chained_assignment = None

# Dataset tags in the order they are scored; each tag names one results
# workbook and one matching metadata table.
# NOTE(review): tags look like <replicates>_<up>_<down> — confirm.
_DATASET_TAGS = (
    "9_500_500",
    "9_750_250",
    "9_1000_0",
    "6_500_500",
    "6_750_250",
    "6_1000_0",
    "3_500_500",
    "3_750_250",
    "3_1000_0",
)

# Results workbooks live next to the notebook (Windows-style relative paths).
results = [".\\" + tag + ".xlsx" for tag in _DATASET_TAGS]

# Metadata tables live under .\data and share the tag of their workbook, so
# results[i] and meta[i] always describe the same dataset.
meta = [".\\data\\" + tag + "_meta.tsv" for tag in _DATASET_TAGS]

###############################################
###P-R f measure and AUC function
###############################################
#input a=results b=meta
#output F1 score
def func(a, b):
    """Score one differential-expression results file against its metadata.

    The results workbook is labelled with a binary ``sig`` column, a logistic
    regression is fitted on that single column to predict the
    ``differential.expression`` column of the metadata table, and the F1 score
    on the held-out 30% split is returned. F1 and the precision-recall AUC
    are also printed.

    Parameters
    ----------
    a : str
        Path to the results ``.xlsx`` file. The three characters before the
        ``.xlsx`` extension select the row layout: ``"0_0"``, ``"500"`` or
        ``"250"``.
    b : str
        Path to the tab-separated metadata file; it must contain a
        ``differential.expression`` column with one row per results row.

    Returns
    -------
    float
        F1 score of the fitted classifier on the test split.

    Raises
    ------
    ValueError
        If the filename does not end in one of the recognised layout tags.
        (Previously this surfaced later as a ``KeyError`` on ``'sig'``.)
    """
    # Step 1: Input files and preprocess
    ###############################################
    de_table = pd.read_excel(a)
    truth = pd.read_csv(b, sep='\t')

    # Harmonise column names across tools in a single pass:
    # voom/limma uses 'adj.P.Val', edgeR uses 'FDR'; both use 'logFC'.
    de_table = de_table.rename(columns={
        'adj.P.Val': 'padj',
        'FDR': 'padj',
        'logFC': 'log2FoldChange',
    })

    # Assigning up/down regulation indicator
    ###############################################
    # Number of leading rows checked for a non-negative fold change; the
    # remaining rows up to row 999 are checked for a non-positive one.
    # NOTE(review): presumably these are the simulated up-/down-regulated
    # genes, in that order — confirm against the simulation.
    up_block_sizes = {"0_0": 1000, "500": 500, "250": 750}
    labelled_rows = 1000

    layout_tag = a[-8:-5]
    if layout_tag not in up_block_sizes:
        raise ValueError(
            f"Unrecognised layout tag {layout_tag!r} in {a!r}; "
            f"expected one of {sorted(up_block_sizes)}"
        )
    n_up = up_block_sizes[layout_tag]

    # Build the label from boolean masks and assign it in ONE step. The
    # previous `df.loc[rows].loc[mask, 'sig'] = 1` form was chained indexing:
    # whether the write reached `df` depended on pandas view/copy internals
    # (and never does under copy-on-write), which could leave 'sig' all zero.
    # read_excel yields a default 0..n-1 index, so positions equal the labels
    # that the old `.loc[0:999]` slices used.
    row_pos = pd.Series(np.arange(len(de_table)), index=de_table.index)
    is_significant = de_table['padj'] <= 0.05
    fold_change = de_table['log2FoldChange']

    up_hit = (row_pos < n_up) & is_significant & (fold_change >= 0)
    down_hit = (
        (row_pos >= n_up) & (row_pos < labelled_rows)
        & is_significant & (fold_change <= 0)
    )
    de_table['sig'] = (up_hit | down_hit).astype(int)

    # Step 2: Fit the Logistic Regression Model
    ###############################################
    # define the predictor variable and the response variable
    X = de_table[['sig']]
    y = np.ravel(truth[['differential.expression']])

    # split the dataset into training (70%) and testing (30%) sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    # instantiate and fit the model using the training data
    log_regression = LogisticRegression(solver='lbfgs')
    log_regression.fit(X_train, y_train)

    # predicted probabilities for the positive outcome only
    lr_probs = log_regression.predict_proba(X_test)[:, 1]

    # predicted class values
    yhat = log_regression.predict(X_test)
    lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
    lr_f1, lr_auc = f1_score(y_test, yhat), auc(lr_recall, lr_precision)

    # summarize scores (pure f-string: a '%' in the path can no longer break
    # the formatting; the printed text is unchanged)
    print(f'{a} Logistic: f1={lr_f1:.3f} auc={lr_auc:.3f}')

#     ### plot the precision-recall curves
#     no_skill = len(y_test[y_test==1]) / len(y_test)
#     pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
#     pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
#     ### axis labels
#     pyplot.xlabel('Recall')
#     pyplot.ylabel('Precision')
#     ### show the legend
#     pyplot.legend()
#     ### show the plot
#     pyplot.show()

    return lr_f1

###############################################
#generate list of results
###############################################
# Score every dataset: pair each results workbook with the metadata file at
# the same position and record [path, F1] in input order.
resultsList = []
for idx, results_path in enumerate(results):
    resultsList.append([results_path, func(results_path, meta[idx])])

###############################################
###Output results
###############################################
#Input: resultslist
#output: Results.xlsx

# Write one column per dataset: row 0 holds the results path, row 1 its F1.
# The context manager closes (and therefore saves) the workbook even if a
# write raises, instead of leaving it unclosed.
with xlsxwriter.Workbook('Results.xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    # Widen exactly the columns that will be filled (0..8 for nine datasets).
    last_col = max(len(resultsList) - 1, 0)
    worksheet.set_column(0, last_col, 16)

    for col, (source_path, f1_value) in enumerate(resultsList):
        worksheet.write(0, col, source_path)
        worksheet.write(1, col, f1_value)

      log2FoldChange    AveExpr          t       P.Value          padj  \
0           0.670719   1.181780   1.498266  1.500063e-01  6.891603e-01   
1           1.142493  -0.893876   2.196354  4.028511e-02  3.767355e-01   
2           1.265688   3.068731   3.901231  9.152189e-04  1.952244e-02   
3           0.364449   5.236109   1.056346  3.036644e-01  8.369824e-01   
4           1.936861   2.729389   2.986534  7.408810e-03  1.127334e-01   
5           1.608199   4.464693   7.380307  4.499775e-07  3.283522e-05   
6           1.362687  -1.319282   2.457177  2.347032e-02  2.657223e-01   
7           2.718865   8.004977  26.019630  1.161016e-16  1.160667e-12   
8           1.827377   3.152238   4.856285  1.011123e-04  3.044639e-03   
9           0.496771   6.403711   1.609930  1.234101e-01  6.412792e-01   
10          1.050572   2.578457   4.226339  4.310512e-04  1.051029e-02   
11          1.132665   3.675958   3.284981  3.778093e-03  6.614639e-02   
12          1.665434   4.304507   9.37

      log2FoldChange    AveExpr         t       P.Value          padj  \
0           1.018479   7.939842  5.763813  1.314728e-05  6.259982e-04   
1           1.591304   4.105289  6.295278  4.163327e-06  2.328634e-04   
2           1.463670   6.698596  8.419742  6.084526e-08  7.419412e-06   
3           1.281846   8.235036  7.454546  3.849967e-07  3.204863e-05   
4           0.730471  10.622801  4.963428  7.888684e-05  2.758005e-03   
5           1.838021   0.182883  4.446549  2.582964e-04  7.486103e-03   
6           1.486195   7.315292  7.977752  1.394652e-07  1.394512e-05   
7           1.342985   6.662824  2.884329  9.291061e-03  1.365255e-01   
8           1.280947   6.454872  4.309058  3.549468e-04  9.804180e-03   
9           1.194983  10.461753  4.210031  4.464186e-04  1.203164e-02   
10          0.566411   2.757046  1.969270  6.320988e-02  4.389136e-01   
11          1.614354   1.123943  3.676514  1.535803e-03  3.463017e-02   
12          1.289723   4.078476  3.660970  1.591921

      log2FoldChange   AveExpr          t       P.Value      padj          B  \
0           1.517838  2.120700   2.973003  1.032015e-02  0.177574  -2.764469   
1           1.185300  5.709540   4.586954  4.517999e-04  0.016790  -0.047749   
2           0.587082  8.628306   4.001640  1.377962e-03  0.038479  -1.328317   
3           2.965791  4.725489  12.575527  6.929729e-09  0.000006  10.485986   
4           1.889599  4.380727   3.148792  7.308185e-03  0.141589  -2.659050   
5          -0.251026  0.590755  -0.282357  7.819104e-01  0.977621  -6.005940   
6           1.715688  5.268545   3.671022  2.621246e-03  0.062690  -1.747889   
7           1.761863 -2.177350   1.390292  1.867184e-01  0.739286  -4.838753   
8           0.545789  6.052102   2.765882  1.546788e-02  0.233450  -3.553631   
9           1.300023 -0.750849   0.804707  4.347961e-01  0.894250  -5.549745   
10          0.404261  7.061473   1.980118  6.823777e-02  0.523952  -5.017308   
11          0.460576  4.901935   2.34969

      log2FoldChange   AveExpr         t   P.Value      padj         B  sig
0          -0.616538 -1.081496 -0.881294  0.405128  0.897571 -5.112675    0
1           0.666937  1.336178  0.915973  0.387781  0.891380 -5.380355    0
2           0.055262  3.574162  0.088376  0.931859  0.991115 -6.063419    0
3           0.211616  8.500606  0.882969  0.404278  0.897571 -6.116653    0
4          -0.346316  2.783499 -0.378245  0.715579  0.961026 -5.891445    0
5           1.394870  9.818554  4.978114  0.001254  0.102919 -0.600177    0
6           1.065491 -1.196809  1.244296  0.250341  0.833577 -4.801509    0
7           0.688643  3.538590  1.350096  0.215756  0.805496 -5.192913    0
8           1.292027  1.322981  2.571592  0.034419  0.463885 -3.424129    0
9           1.455510  5.279940  4.778745  0.001599  0.115749 -0.732183    0
10          1.957539  7.901525  4.964789  0.001274  0.102919 -0.576974    0
11          0.586677  3.883987  1.388802  0.204178  0.794854 -5.187544    0
12          

  'precision', 'predicted', average, warn_for)


      log2FoldChange    AveExpr         t   P.Value      padj         B  sig
0           1.074963   3.500369  2.344389  0.048509  0.517316 -3.877275    0
1           1.015139   0.918078  1.640093  0.141354  0.697722 -4.573130    0
2           1.419495   7.843406  5.688645  0.000542  0.069052  0.272526    0
3           0.282098   5.482911  0.890655  0.400265  0.880742 -5.870153    0
4           0.338821  -1.133937  0.409917  0.693108  0.949243 -5.355807    0
5           1.783356   3.854033  4.599948  0.001974  0.115798 -0.919054    0
6           2.105331   3.848334  3.816201  0.005567  0.198537 -1.874032    0
7           1.173032   4.925889  2.110606  0.069391  0.591329 -4.330223    0
8           3.581033   0.243704  3.155267  0.014292  0.314350 -2.702027    0
9           1.789170   8.709427  4.054483  0.004023  0.173112 -1.753071    0
10          2.142750   9.170063  6.020430  0.000377  0.064779  0.615617    0
11          1.385211   1.348838  3.037557  0.017004  0.341653 -2.819754    0

In [None]:
#accuracy
#error rate
