In [10]:
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

data = pandas.read_csv("phase2_827.csv")

X = data.drop(columns=["biome"]).fillna(0)
y = data['biome'].map({"fermitation":0,"gut":1,"lake":2,"soil":3}).fillna(0).astype(int)

# 各个类别的数量
print(y.value_counts())

# 数据分割，80%训练集，20%测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 随机森林分类
clf = RandomForestClassifier(n_estimators=10, max_depth=1, random_state=0,  class_weight={1:1, 0:2})
clf.fit(X_train, y_train)
acc = sum(y_test == clf.predict(X_test)) / len(y_test)
print("Acc:", acc)



# 使用grid search找出最优参数组合

param_grid = {
    'n_estimators': [i*10 for i in range(1,20)],
    'criterion':['entropy', 'entropy'],
    'max_depth': [i for i in range(1, 10)],
    'max_features': ["auto", "log2"],
    'class_weight':[{1:1, 0:i} for i in range(1,5)]
}
clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=clf, 
                           param_grid=param_grid,
                           cv = 3,
                           n_jobs = -1,
                           verbose = 2)

grid_search.fit(X, y)
print(grid_search.best_params_)

# 最优性能
print(grid_search.best_score_)

3    279
1     44
0     36
2     33
Name: biome, dtype: int64
Acc: 0.6075949367088608
Fitting 3 folds for each of 2736 candidates, totalling 8208 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 706 tasks      | elapsed:   36.5s
[Parallel(n_jobs=-1)]: Done 1272 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2002 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2892 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3946 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 5160 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 6538 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 8076 tasks      | elapsed:  7.0min


{'class_weight': {1: 1, 0: 4}, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 10}
0.770346447445684


[Parallel(n_jobs=-1)]: Done 8208 out of 8208 | elapsed:  7.1min finished


In [7]:
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

data = pandas.read_csv("output1213.csv")

X = data.drop(columns=["biome"]).fillna(0)
y = data['biome'].map({"fermitation":0,"gut":1,"lake":2,"soil":3}).fillna(0).astype(int)

# 各个类别的数量
print(y.value_counts())

# 数据分割，80%训练集，20%测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 随机森林分类
clf = RandomForestClassifier(n_estimators=10, max_depth=1, random_state=0,  class_weight={1:1, 0:2})
clf.fit(X_train, y_train)
acc = sum(y_test == clf.predict(X_test)) / len(y_test)
print("Acc:", acc)



# 使用grid search找出最优参数组合

param_grid = {
    'n_estimators': [i*10 for i in range(1,20)],
    'criterion':['entropy', 'entropy'],
    'max_depth': [i for i in range(1, 10)],
    'max_features': ["auto", "log2"],
    'class_weight':[{1:1, 0:i} for i in range(1,5)]
}
clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=clf, 
                           param_grid=param_grid,
                           cv = 3,
                           n_jobs = -1,
                           verbose = 2)

grid_search.fit(X, y)
print(grid_search.best_params_)

# 最优性能
print(grid_search.best_score_)
clf = RandomForestClassifier(n_estimators=10, max_depth=9,criterion="entropy",
    max_features="auto", random_state=0)
clf.fit(X, y)

3    505
0     69
2     60
1     59
Name: biome, dtype: int64
Acc: 0.6906474820143885
Fitting 3 folds for each of 2736 candidates, totalling 8208 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 706 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 1272 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 2002 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2892 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3946 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 5160 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 6538 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 8076 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 8208 out of 8208 | elapsed:  6.7min finished


{'class_weight': {1: 1, 0: 2}, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 80}
0.7619047619047619


RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=10,
                       random_state=0)

In [15]:
clf = RandomForestClassifier(n_estimators=10, max_depth=9,criterion="entropy",
    max_features="auto", random_state=0)
clf.fit(X, y)

RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=10,
                       random_state=0)

In [24]:
from sklearn import datasets
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing,metrics
from sklearn.preprocessing import label_binarize
import numpy as np
from scipy import interp

data2=pandas.read_csv("output_knowlevel4.csv")
y_train = data2["pfam"]
X_test = data2.drop(columns=["pfam"]).fillna(0)
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_train = label_binarize(y_train, classes=[0, 1, 2, 3])
X_test = X_test / X_test.sum()
y_score = clf.predict(X_test,y_train)

TypeError: predict() takes 2 positional arguments but 3 were given

In [18]:
print(y_score)

[0 3 0 0 0 1 0 0 0 0 0 0 3 1 3 3 1 3 3 3 3 3 3 0 1 3 1 1 1 1 1 0 1 3 3 1 3
 1 3 3 1 3 3 3 3 3 2 2 3 2 3 3 2 2 3 2 2 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [1]:
from sklearn import datasets
import pandas
import sklearn
import numpy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing,metrics
from sklearn.preprocessing import label_binarize
import numpy as np
from scipy import interp

data = pandas.read_csv("output1213.csv")

X = data.drop(columns=["biome"]).fillna(0)
y = data['biome'].map({"fermitation":0,"gut":1,"lake":2,"soil":3}).fillna(0).astype(int)

# 各个类别的数量
print(y.value_counts())

# 数据分割，80%训练集，20%测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 随机森林分类
clf = RandomForestClassifier(n_estimators=10, max_depth=9,criterion="entropy",
    max_features="auto", random_state=0)
clf.fit(X_train, y_train)
acc = sum(y_test == clf.predict(X_test)) / len(y_test)
print("Acc:", acc)

data2=pandas.read_csv("know_model.csv")
y_vali = data2["pfam"]
x_vali = data2.drop(columns=["pfam"]).fillna(0)
le = preprocessing.LabelEncoder()
y_vali = le.fit_transform(y_vali)
y_vali = label_binarize(y_vali, classes=[0, 1, 2, 3])
x_vali = x_vali / x_vali.sum()
a=clf.predict_proba(x_vali)
numpy.savetxt('test.txt',a)

3    505
0     69
2     60
1     59
Name: biome, dtype: int64
Acc: 0.7553956834532374


ValueError: Number of features of the model must match the input. Model n_features is 145 and input n_features is 199 