In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Classification and identification model

### Data reading and pre-processing

In [None]:
# Load the dataset; the first spreadsheet column becomes the row index.
data = pd.read_excel(r"EDCs Classification.xlsx", index_col=0)
# Discard every column containing at least one missing value. The non-mutating form
# keeps the cell safe to re-run.
data = data.dropna(axis=1)
print(data.shape)

# The first remaining column is the target; every other column is a candidate feature.
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# --- Filter 1: drop zero-variance (constant) features --------------------------------
selector = VarianceThreshold(threshold=0)
X1 = selector.fit_transform(X)
all_name = X.columns.values.tolist()
select_name_index0 = selector.get_support(indices=True)
select_name0 = [all_name[i] for i in select_name_index0]
# fit_transform returned a bare array, so the surviving column names are re-attached.
X2 = pd.DataFrame(X1, columns=select_name0)
X = X2


# --- Filter 2: keep only features whose mutual information with Y is positive --------
def seeded_mutual_info(features, target):
    """Score features with mutual_info_regression using the fixed seed 0.

    The same seeded scorer is used both to count the informative features and inside
    SelectKBest, so the features that are kept are exactly the ones that were counted
    and the selection is reproducible from run to run.
    """
    return mutual_info_regression(features, target, random_state=0)


# NOTE(review): Y is later passed to RandomForestClassifier, so it appears to be a
# class label; mutual_info_classif may be the more appropriate scorer. Confirm before
# switching, because it would change which features are selected downstream.
mic = seeded_mutual_info(X2, Y)
k = int((mic > 0).sum())  # number of features with strictly positive score
skb = SelectKBest(seeded_mutual_info, k=k)
X3 = skb.fit_transform(X2, Y)
select_name_index = skb.get_support(indices=True)
select_name = [select_name0[i] for i in select_name_index]
X4 = pd.DataFrame(X3, columns=select_name)
X = X4
print(X.shape)

### Feature selection

In [None]:
# Estimator shared by RFE and by cross-validation. It is created in this cell so the
# sweep runs on a fresh kernel; the settings match the RFC_ built in the model-building
# cell further down.
RFC_ = RFC(n_estimators=1000, random_state=420)

# Candidate numbers of features to keep: 1, 11, 21, ..., 411.
feature_counts = range(1, 420, 10)

# NOTE(review): RFE is fitted on all rows before cross-validation, so each validation
# fold has already influenced the feature choice; the scores are likely optimistic.
score = []
for i in feature_counts:
    # Keep the i best features by recursive elimination, one feature per step.
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=1).fit_transform(X, Y)
    # Mean 10-fold cross-validated accuracy on the reduced feature set.
    once = cross_val_score(RFC_, X_wrapper, Y, cv=10).mean()
    score.append(once)

# Report the best score together with the feature count that produced it
# (first occurrence when several counts tie).
best_score = max(score)
print(best_score, feature_counts[score.index(best_score)])
print(score)
plt.figure(figsize=[30, 15])
plt.plot(feature_counts, score)
plt.xticks(feature_counts)
plt.show()

###### Figure S1

In [None]:
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline
from matplotlib import rcParams

# Global text styling; set before the figure is created.
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Times New Roman']
rcParams['font.size'] = 11

# Feature counts (1, 11, ..., 411) and one accuracy value per count.
# NOTE(review): the values are hard-coded; presumably pasted from the output of the
# RFE sweep cell. Confirm they match the current run.
x = range(1, 420, 10)
y = [
    0.69868421, 0.78787594, 0.78966165, 0.79323308, 0.79862155,
    0.8039787, 0.8039787, 0.80219298, 0.79329574, 0.7914787,
    0.7914787, 0.79859023, 0.7950188, 0.7914787, 0.79326441,
    0.79505013, 0.7914787, 0.79859023, 0.79683584, 0.79326441,
    0.79505013, 0.79862155, 0.7985589, 0.79505013, 0.79680451,
    0.79680451, 0.79326441, 0.80034461, 0.79859023, 0.79859023,
    0.79680451, 0.7985589, 0.80034461, 0.79859023, 0.79680451,
    0.79680451, 0.79677318, 0.79859023, 0.7985589, 0.80213033,
    0.7950188, 0.7985589,
]

# Spline-interpolate the scores onto a grid ten times denser for a smooth curve.
x_smooth = np.linspace(min(x), max(x), len(x) * 10)
y_smooth = make_interp_spline(x, y)(x_smooth)

fig, ax = plt.subplots()
ax.plot(x_smooth, y_smooth, color='#14a4c8')

# Highlight the best score (first occurrence when several counts tie).
max_y = max(y)
max_x = x[y.index(max_y)]
ax.scatter(max_x, max_y, s=50, c='orange')

# Dashed guide from the x-axis up to the highlighted point. axvline takes ymin/ymax as
# fractions of the axes height, so the peak's data value is converted using the current
# y-limits. The previous expression always evaluated to 1.0 and divided by zero when
# all scores were equal.
y_low, y_high = ax.get_ylim()
peak_fraction = (max_y - y_low) / (y_high - y_low)
ax.axvline(x=max_x, ymin=0, ymax=peak_fraction, color='#00FF7F', linestyle='dashed', alpha=0.3)

# Label the best feature count just below the x-axis: x in data units, y as an axes
# fraction. The previous call used a data-space y of about -0.08, far outside the
# plotted range. NOTE(review): placement below the axis is presumed to be the intent.
ax.text(max_x, -0.08, str(max_x), transform=ax.get_xaxis_transform(),
        horizontalalignment='center', verticalalignment='top')

ax.set_facecolor('#FFFFFF')
fig.savefig('data_visualization.jpeg', dpi=600)
plt.show()

### Model building

In [None]:
# Base estimator that drives recursive feature elimination (RFE).
RFC_ = RFC(n_estimators=1000, random_state=420)

# Select 51 features, eliminating one per step. The selector is always fitted on X4
# (the filtered feature table) rather than on the mutable name X, so re-running this
# cell, or running it after another cell has rebound X, starts from the same inputs
# and keeps the selected column names aligned with the selected columns.
# NOTE(review): the selector sees all rows before the train/test split below, so the
# held-out rows influence which features are kept; consider fitting it on the training
# split only.
selector = RFE(RFC_, n_features_to_select=51, step=1).fit(X4, Y)
print(selector.support_.sum())

# Recover the names of the retained columns and rebuild a labeled DataFrame.
all_name_1 = X4.columns.values.tolist()
select_name_index_1 = selector.get_support(indices=True)
select_name_1 = [all_name_1[i] for i in select_name_index_1]
X = pd.DataFrame(selector.transform(X4), columns=select_name_1)
print(X)

# Hold out 30% of the rows for testing.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=3837)

# Oversample with SMOTE on the training split only; the test split is left untouched.
smo = SMOTE(random_state=420)
X_smo, Y_smo = smo.fit_resample(Xtrain, Ytrain)

# Final classifier, trained on the resampled training data.
RF = RFC(n_estimators=1000, random_state=420, n_jobs=-1)
RF.fit(X_smo, Y_smo)

# Accuracy on the original (non-resampled) training rows.
score_train = RF.score(Xtrain, Ytrain)

### Model results

In [None]:
# Accuracy of the fitted forest on the held-out split.
score_test = RF.score(Xtest, Ytest)

# List every feature with its importance, most important first.
feat_labels = X.columns.values.tolist()
importances = RF.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    feature_idx = indices[f]
    # Columns: rank, feature name left-aligned in a 30-character field, importance.
    row = (f + 1, 30, feat_labels[feature_idx], importances[feature_idx])
    print("%2d) %-*s %f" % row)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Score each test sample with the probability found in column 1 of predict_proba.
probs = RF.predict_proba(Xtest)
preds = probs[:, 1]

# False/true positive rates over all decision thresholds, and the area under that curve.
fpr, tpr, threshold = roc_curve(Ytest, preds)
roc_auc = auc(fpr, tpr)

# ROC curve in blue, labeled with its AUC.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')

# Dashed red diagonal as the chance-level reference.
plt.plot([0, 1], [0, 1], 'r--')

# Both axes cover the unit interval.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.savefig('ROC.jpeg', dpi=600)
plt.show()

# Classification model of endocrine-disrupting effects of EDCs

In [None]:
# Read the second sheet (sheet index 1) of the workbook; its first column is the row index.
Hormone_types = pd.read_excel(r"Hormone_types.xlsx", index_col=0, sheet_name=1)

# The first remaining column is the label, the rest are the features.
# NOTE(review): this rebinds X, which the feature-importance cell above reads via
# X.columns; do not re-run that cell after this one without rebuilding X.
y = Hormone_types.iloc[:, 0]
X = Hormone_types.iloc[:, 1:]

# Stratified 70/30 split so both parts keep the label proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1587,
    stratify=y,
)

# Fit the forest and predict the held-out rows.
classifier = RFC(n_estimators=1000, random_state=6336)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out rows.
result2 = accuracy_score(y_test, y_pred)
print(result2)

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# One-vs-rest indicator matrices: one column per class code 0-4.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])

# Per-class AUC. The list is deliberately not named `auc`: that name is the function
# imported from sklearn.metrics by the ROC cells, and shadowing it with a list makes
# `auc(fpr, tpr)` fail when those cells are re-run.
# NOTE(review): y_pred_bin holds hard 0/1 predictions rather than class probabilities,
# so each AUC is computed from a single operating point; consider using
# classifier.predict_proba(X_test) instead. Confirm, since it changes reported values.
auc_per_class = [
    roc_auc_score(y_test_bin[:, i], y_pred_bin[:, i])
    for i in range(y_test_bin.shape[1])
]

# Unweighted mean of the per-class AUCs.
avg_auc = sum(auc_per_class) / len(auc_per_class)
print('Average AUC:', avg_auc)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Choose the font before any text is created so it applies to this figure's labels
# and legend, not only to figures drawn afterwards.
plt.rcParams['font.family'] = 'Times New Roman'

# Display names; entry i labels column i of the binarized label matrices.
classes = ['Other', 'Estrogen', 'Androgen', 'Progesterone', 'Thyroid']

# Compute and plot the ROC curve for each class.
# NOTE(review): y_pred_bin holds hard 0/1 predictions, so each curve has a single
# interior point; class probabilities would give a full curve.
plt.figure()
for i in range(y_test_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC curve of {0} (area = {1:0.2f})'.format(classes[i], roc_auc))

# Plot the ROC curve of a random classifier (the chance diagonal).
plt.plot([0, 1], [0, 1], 'k--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('FIG_41A.jpeg', dpi=800)
plt.show()