In [7]:
import pandas as pd
import csv
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
from copy import deepcopy
import numpy as np
from sklearn.cluster import KMeans
import networkx as nx
import operator
import os
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn import preprocessing
from sklearn import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

##################################
# Data preparation
##################################

######### Read input files
def readInput(path):
    absolutePath = path
    data = pd.read_csv(os.path.join(absolutePath,"cleanFeatures.csv"))
    data.drop(['Unnamed: 0'], axis=1,inplace = True)
    return data


######### Split train test set
def split(data):
    X = data.drop(['Target'],axis=1)
    y = data['Target']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1, random_state=2019)
    return (X_train, X_test, y_train, y_test)

def RandomForest(X_train_RF,X_test_RF):
#     ######## Prepare output
#     text_file = open("RandomForestOutput.txt", "w")
    #
    ######### Create model
    rf = RandomForestClassifier()
    #
    ######### Search for the best parameter
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(5, 105, num = 10)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    rf_random_f1 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                            scoring = "f1", n_iter = 100, cv = 3, verbose=2,
                            random_state=42, n_jobs = -1)
    rf_random_acc = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                            scoring = "accuracy", n_iter = 100, cv = 3, verbose=2,
                            random_state=42, n_jobs = -1)
    rf_random_f1.fit(X_train_RF, y_train)
    rf_random_acc.fit(X_train_RF, y_train)
    #
    ######### Evaluate Result
    print("The best f1 score for randomizedsearchcv is " + str(rf_random_f1.best_score_))
    print("The best acc score for randomizedsearchcv is " + str(rf_random_acc.best_score_))
    #
    f1_params = rf_random_f1.best_params_
    acc_params = rf_random_acc.best_params_
    #
    return f1_params, acc_params

In [8]:
path = "/Users/limengyang/Workspaces/Module-Detection/"
data = readInput(path)
X_train, X_test, y_train, y_test = split(data)
X_train_RF = X_train.drop(['ProteinID'],axis = 1)
X_test_RF = X_test.drop(['ProteinID'],axis = 1)
f1_params, acc_params = RandomForest(X_train_RF,X_test_RF)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=True 
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=True 
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=True 
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=True, total=  14.9s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=True, total=  15.0s
[CV]  n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, bootstrap=Tr

[CV]  n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=82, bootstrap=False, total=  39.4s
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=93, bootstrap=False 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=93, bootstrap=False, total=  39.4s
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=93, bootstrap=False 


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min


[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=93, bootstrap=False, total=  41.2s
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=93, bootstrap=False 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=93, bootstrap=False, total=  40.4s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=16, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=93, bootstrap=False, total=  42.3s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=16, bootstrap=True 
[CV]  n_estimators=500, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=16, bootstrap=True, total=  12.3s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=16, bootstrap=True 
[CV]  n_estimators=100

[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=71, bootstrap=True, total=  28.3s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   4.5s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   4.7s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   4.5s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=16, bootstrap=True 
[CV]  n_estimator

[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=71, bootstrap=False, total=  39.6s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=49, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=49, bootstrap=False, total=   4.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=49, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=49, bootstrap=False, total=   4.4s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=49, bootstrap=False, total=   4.2s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=300,

[CV]  n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=49, bootstrap=True, total=  13.9s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=49, bootstrap=True 
[CV]  n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=49, bootstrap=True, total=  14.2s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=800, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=49, bootstrap=True, total=  20.7s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=5, bootstrap=True, total=   1.4s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=100, min_samples_

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 11.7min


[CV]  n_estimators=900, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=38, bootstrap=True, total=  24.9s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=False 
[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=True, total=   8.2s
[CV] n_estimators=900, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=900, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=True, total=  12.0s
[CV] n_estimators=900, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=900, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=True, total=  12.0s
[CV] n_estimators=900, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=True 
[CV]  n_estimators=1000, min_samples

[CV]  n_estimators=600, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=105, bootstrap=True, total=  16.2s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=27, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=27, bootstrap=False, total=   7.9s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=27, bootstrap=False 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=True, total=  29.1s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=27, bootstrap=False 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=True, total=  28.5s
[CV] n_estimators=600, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=49, bootstrap=True 
[CV]  n_estimators=600, min

[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True, total=   2.8s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True 
[CV]  n_estimators=700, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=105, bootstrap=False, total=  28.7s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True, total=   2.9s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True, total=   2.8s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=5, bootstrap=False 
[CV]  n_estimators=700

KeyboardInterrupt: 