# Investigate autosomal influence in sex with RaFFE

In [20]:
import errno, os
import functools
import numpy as np
import pandas as pd
from gtfparse import read_gtf
import feature_elimination as fe
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import normalized_mutual_info_score as nmi_score

In [2]:
# Set the NUMEXPR_MAX_THREADS environment variable for this process.
# NOTE(review): numexpr presumably reads this variable when it is first
# imported (possibly indirectly through pandas) -- confirm that this
# assignment runs before those imports, otherwise it may have no effect.
os.environ['NUMEXPR_MAX_THREADS'] = '32'

In [3]:
def residualize(X, train_index, test_index, null_model, weights):
    """Residualize and z-score one train/test split using training-only estimates.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``train_index``
        and ``test_index``.
    train_index, test_index : array-like of int
        Positional row indices of the training and test samples.
    null_model : pandas.DataFrame
        Covariate table, row-aligned with ``X``.
    weights : pandas.DataFrame
        Weight table stored transposed relative to ``X``: its columns are
        selected with ``train_index``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` residuals, both centred and scaled with the mean
        and standard deviation of the *training* residuals, so no statistic
        is estimated from the test samples.
    """
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    null_model_train = null_model.iloc[train_index, :]
    null_model_test = null_model.iloc[test_index, :]
    # Only the training weights are needed; the test-set slice that used to be
    # taken here was never read.
    weights_train = weights.iloc[:, train_index]
    # NOTE(review): the weighted features are passed as predictors and the
    # covariates as the target, so ``coef_`` has shape
    # (n_covariates, n_features) and the fitted intercept is not used below.
    # Confirm this is the intended direction of the regression.
    fit_train = LinearRegression(n_jobs=-1).fit(X_train * weights_train.T,
                                                null_model_train)
    # Calculate residuals from training data
    residuals_train = X_train - np.matmul(np.array(null_model_train),
                                          fit_train.coef_)
    # Normalize residuals
    residuals_train_sd = residuals_train.std(axis=0)
    residuals_train_mean = residuals_train.mean(axis=0)
    residuals_train_norm = (residuals_train - residuals_train_mean) / residuals_train_sd
    # Calculate test residuals and normalize them with the training statistics
    residuals_test = X_test - np.matmul(np.array(null_model_test),
                                        fit_train.coef_)
    residuals_test_norm = (residuals_test - residuals_train_mean) / residuals_train_sd
    return residuals_train_norm, residuals_test_norm


def mkdir_p(directory):
    """Create ``directory`` and any missing parents; do nothing if it exists.

    Raises
    ------
    OSError
        If the directory cannot be created. This includes
        ``FileExistsError`` when ``directory`` already exists but is not a
        directory, a case the previous errno check silently ignored.
    """
    os.makedirs(directory, exist_ok=True)
            

def extract_feature_annotation(pred_feat, feature, path, fold, new_annot):
    """Append the annotated important features of one fold to a TSV file.

    Parameters
    ----------
    pred_feat : iterable of (importance, feature_id)
        Records loaded into the columns ``feature_importance`` and ``Geneid``.
    feature : object
        Not used by this function; kept so existing callers keep working.
    path : str
        Prefix joined directly with ``'important_features.txt'`` (no
        separator is inserted).
    fold : object
        Value written to the ``Fold`` column of every row.
    new_annot : pandas.DataFrame
        Annotation with at least the columns ``JunctionID``, ``gene_id``,
        ``gene_name``, ``seqname``, ``start`` and ``end``. It is not modified.

    The rows are appended without a header line. Column order is:
    feature_importance, Geneid, Fold, JunctionID, gene_id, gene_name, chrom,
    start, end, ensemblID.
    """
    # Get important features
    dft = pd.DataFrame.from_records(pred_feat,
                                    columns=['feature_importance', 'Geneid'])
    dft['Fold'] = fold
    # Get gene annotation
    annot = new_annot[['JunctionID', 'gene_id', 'gene_name',
                       'seqname', 'start', 'end']].rename(columns={'seqname': 'chrom'})
    # Strip everything from the first dot onwards. ``regex=True`` is passed
    # explicitly because newer pandas treats the pattern as a literal string
    # by default, which would leave the identifier unchanged.
    annot = annot.assign(
        ensemblID=annot['gene_id'].str.replace(r"\..*", "", regex=True))
    pred_df = dft.merge(annot, how='left', left_on='Geneid', right_on='JunctionID')
    pred_df.to_csv(path+'important_features.txt', sep='\t', mode='a',
                   index=False, header=False)


def run_raffe(X_train, X_test, Y_train, Y_test, fold_num, outdir,
              annotation, feature=None):
    """Run feature elimination for one fold and score the selected model.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables with matching columns.
    Y_train, Y_test : pandas.Series
        Training and test class labels.
    fold_num : int
        Fold identifier; forwarded to the ``fe`` helpers and written with the
        annotated features.
    outdir : str
        Output location forwarded to the ``fe`` helpers and used as the
        prefix of the important-features file.
    annotation : pandas.DataFrame
        Feature annotation forwarded to ``extract_feature_annotation``.
    feature : object, optional
        Feature-type label forwarded to ``extract_feature_annotation``.

    Returns
    -------
    dict
        Selected feature count plus out-of-bag training scores and test-set
        scores, keyed by metric name.

    The function previously read the module-level names ``fold`` and
    ``feature``; it now relies only on its own arguments.
    """
    # Apply random forest classifier
    cla = RandomForestClassifier(n_estimators=200,
                                 oob_score=True, n_jobs=-1)
    features = X_train.columns
    d, pfirst = fe.feature_elimination_lr(cla, X_train.values,
                                          Y_train.values,
                                          features, fold_num, outdir,
                                          elimination_rate=0.1)
    fe.plot_nmi(d, fold_num, outdir)
    fe.plot_acc(d, fold_num, outdir)
    fe.plot_roc(d, fold_num, outdir)
    # Pick the key of ``d`` whose entry has the largest value at position 1.
    # NOTE(review): presumably position 1 is a score and position 4 the
    # selected column indices -- confirm against feature_elimination.
    n_features = max(d, key=lambda x: d[x][1])
    selected = d[n_features][4]
    # Fit model
    cla.fit(X_train.values[:, selected], Y_train)
    labels_pred = cla.predict(X_test.values[:, selected])
    # Annotate features, most important first
    pred_features = sorted(zip(cla.feature_importances_,
                               X_train.columns[selected]),
                           reverse=True)
    extract_feature_annotation(pred_features, feature, outdir, fold_num,
                               annotation)
    # Save output data
    output = dict()
    output['n_features'] = n_features
    output['n_features_all_features'] = pfirst[0]
    output['train_oob_score_nmi_all_features'] = pfirst[1]
    output['train_oob_score_accuracy_all_features'] = pfirst[2]
    output['train_oob_score_roc_all_features'] = pfirst[3]
    output['train_oob_score_accuracy'] = fe.oob_score_accuracy(cla, Y_train)
    output['train_oob_score_nmi'] = fe.oob_score_nmi(cla, Y_train)
    output['train_oob_score_roc'] = fe.oob_score_roc(cla, Y_train)
    output['test_score_nmi'] = nmi_score(Y_test, labels_pred)
    output['test_score_accuracy'] = accuracy_score(Y_test, labels_pred)
    output['test_score_balanced_accuracy'] = balanced_accuracy_score(Y_test, labels_pred)
    # NOTE(review): the ROC AUC is computed from hard class predictions, not
    # from predicted probabilities -- confirm this is intended.
    output['test_score_roc'] = roc_auc_score(Y_test, labels_pred)
    return output
    

## Use junction annotation to drop junctions on sex chromosomes.

In [4]:
%load_ext rpy2.ipython

In [40]:
%%R -o model,expr,weights
# Load the saved workspace; the lines below read an object `v` that is
# presumably provided by this file.
load("../../_m/junctions/voomSVA.RData")
# Pull the design matrix, expression values and weights out of `v` as data frames.
model = data.frame(v$design)
expr = as.data.frame(v$E)
weights = as.data.frame(v$weights)
# Give the weights table the same row and column names as the expression table.
dimnames(weights) = dimnames(expr)

In [6]:
%%R -o new_annot
library(data.table)
# Keep only the gene identifier columns of the annotation stored in `v$genes`.
annot = subset(as.data.frame(v$genes), 
               select=c(gencodeGeneID, ensemblID, Symbol, newGeneID, newGeneSymbol))
# Expose the row names as a regular column so they can be used as a merge key.
annot["JunctionID"] = row.names(annot)
new_annot = as.data.table(annot)
jxn_file = '/ceph/projects/v3_phase3_paper/analysis/twas/_m/junctions/expr_pos.txt'
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
jxn_pos = fread(jxn_file, data.table=TRUE)
# Attach the positions from `jxn_file` to the annotation; only rows whose
# `name` matches a JunctionID are kept.
new_annot = merge(jxn_pos, new_annot, by.x="name", by.y='JunctionID')

In [7]:
# Rename the annotation columns to the names used by the Python helpers.
new_annot.rename(columns={'name': 'JunctionID', 
                          'Chr': 'seqname', 
                          'newGeneID': 'gene_id', 
                          'newGeneSymbol': 'gene_name'}, 
                 inplace=True)

# Keep rows whose chromosome name contains "chr" followed by a digit, which
# drops chrX/chrY. The pattern is a raw string so that "\d" reaches the regex
# engine without triggering Python's invalid-escape-sequence warning.
gtf_annot = new_annot[(new_annot['seqname'].str.contains(r'chr\d+'))]
gtf_annot.seqname.unique()

array(['chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16',
       'chr17', 'chr18', 'chr19', 'chr1', 'chr20', 'chr21', 'chr22',
       'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9'],
      dtype=object)

## Random forest analysis

In [8]:
# Feature type analysed in this notebook; it also names the top-level
# output directory (note the trailing slash).
feature = 'junctions'
directory = '{}/'.format(feature)
mkdir_p(directory)

### All chromosomes

In [None]:
# Output directory for the run that keeps every chromosome.
outdir = '{}all_chrom/'.format(directory)
mkdir_p(outdir)

In [None]:
# Rows of X are the columns of `expr`; y encodes the `Male` column as integer
# category codes, and the remaining design columns form the null model.
X = expr.T
y = model.Male.astype('category').cat.codes
null_model = model.drop(['Male'], axis=1)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

# Column order of the per-fold summary table.
fields = ['n_features_all_features', 'train_oob_score_nmi_all_features',
          'train_oob_score_accuracy_all_features', 
          'train_oob_score_roc_all_features', 'n_features',
          'train_oob_score_accuracy', 'train_oob_score_nmi', 
          'train_oob_score_roc', 'test_score_accuracy', 'test_score_nmi', 
          'test_score_roc', 'test_score_balanced_accuracy']

with open(outdir+'raffe_genes_10folds.txt', 'w') as f:
    print("\t".join(['fold'] + fields), file=f, flush=True)
    # The fold counter is kept in the module-level name `fold`.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = residualize(X, train_index, test_index, null_model, weights)
        # The split yields positional indices, so select with .iloc rather
        # than relying on how `[]` interprets integers for this index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        o = run_raffe(X_train, X_test, y_train, y_test, fold, outdir, new_annot)
        # Flush after every fold so partial results survive an interrupted run.
        print("\t".join([str(fold)] + [str(o[x]) for x in fields]), 
              flush=True, file=f)


### Autosomal only

In [9]:
# Output directory for the autosome-only run.
outdir = '{}autosomal_chrom/'.format(directory)
mkdir_p(outdir)

In [10]:
# Restrict both tables to the junctions listed in `gtf_annot`, re-indexing
# the result by JunctionID.
autosomal_ids = gtf_annot[['JunctionID']]
expr, weights = [
    table.merge(autosomal_ids, left_index=True,
                right_on='JunctionID').set_index('JunctionID')
    for table in (expr, weights)
]

In [23]:
# Rows of X are the columns of `expr`; y encodes the `Male` column as integer
# category codes, and the remaining design columns form the null model.
X = expr.T
y = model.Male.astype('category').cat.codes
null_model = model.drop(['Male'], axis=1)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

# Column order of the per-fold summary table.
fields = ['n_features_all_features', 'train_oob_score_nmi_all_features',
          'train_oob_score_accuracy_all_features', 
          'train_oob_score_roc_all_features', 'n_features',
          'train_oob_score_accuracy', 'train_oob_score_nmi', 
          'train_oob_score_roc', 'test_score_accuracy', 'test_score_nmi', 
          'test_score_roc', 'test_score_balanced_accuracy']

with open(outdir+'raffe_genes_10folds.txt', 'w') as f:
    print("\t".join(['fold'] + fields), file=f, flush=True)
    # The fold counter is kept in the module-level name `fold`.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = residualize(X, train_index, test_index, null_model, weights)
        # The split yields positional indices, so select with .iloc rather
        # than relying on how `[]` interprets integers for this index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        o = run_raffe(X_train, X_test, y_train, y_test, fold, outdir, gtf_annot)
        # Flush after every fold so partial results survive an interrupted run.
        print("\t".join([str(fold)] + [str(o[x]) for x in fields]), 
              flush=True, file=f)