In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the preprocessed train/test splits.
data_train = pd.read_csv('Dataset/diabetic_preprocessed_train.csv')
data_test = pd.read_csv('Dataset/diabetic_preprocessed_test.csv')

# Model inputs and the label column.
features = ['gender', 'age', 'admission_source_id', 'time_in_hospital', 'medical_specialty',
            'num_lab_procedures', 'num_procedures', 'num_medications', 'primary_diagnosis',
            'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'insulin', 'change',
            'diabetesMed', 'medicare', 'medicaid', 'had_emergency', 'had_inpatient_days',
            'had_outpatient_days']
target = 'readmit_30_days'

# Subset of `features` that is one-hot encoded; the remaining columns in
# `features` are passed through to the model unchanged.
categorical_features = ['gender', 'age', 'admission_source_id', 'medical_specialty',
                        'primary_diagnosis', 'max_glu_serum', 'A1Cresult', 'insulin', 'change',
                        'diabetesMed', 'medicare', 'medicaid', 'had_emergency', 'had_inpatient_days',
                        'had_outpatient_days']

# Freeze the category levels seen in the training data. Encoding train and
# test independently can yield different dummy columns (and, with
# drop_first=True, a different dropped baseline level) whenever the two files
# do not contain exactly the same category values, which breaks or silently
# corrupts prediction on the test set.
train_categories = {
    column: pd.Categorical(data_train[column]).categories
    for column in categorical_features
}


def _one_hot_encode(frame):
    """Return ``frame[features]`` one-hot encoded with the training vocabulary.

    Every categorical column is cast to a categorical dtype whose levels come
    from the training data, so the resulting columns (names and order) are the
    same for any input frame. Values not seen in training become missing and
    are therefore encoded as all-zero dummies for that feature.
    """
    subset = frame[features].copy()
    for column, categories in train_categories.items():
        subset[column] = pd.Categorical(subset[column], categories=categories)
    return pd.get_dummies(subset, columns=categorical_features, drop_first=True)


encoded_data_train = _one_hot_encode(data_train)
encoded_data_test = _one_hot_encode(data_test)

# Design matrices and labels. The numeric columns (time_in_hospital,
# num_lab_procedures, num_procedures, num_medications, number_diagnoses) are
# already part of the encoded frames because get_dummies only replaces the
# columns listed in `categorical_features`, so no extra merge step is needed.
X_train, Y_train = encoded_data_train, data_train[target]
X_test, Y_test = encoded_data_test, data_test[target]

# Initialize and train the Random Forest classifier (fixed seed for
# reproducible results).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=7)
rf_classifier.fit(X_train, Y_train)

# Predict on both splits so train and test accuracy can be compared.
train_pred = rf_classifier.predict(X_train)
test_pred = rf_classifier.predict(X_test)

# Attach the predictions to the test data set.
data_test['readmit_30_days_pred'] = test_pred

# Save the test data set together with its predictions.
# NOTE: this is the same path the test data was loaded from, so the input
# file is overwritten and gains the `readmit_30_days_pred` column.
data_test.to_csv('Dataset/diabetic_preprocessed_test.csv', index=False)

# Evaluate model performance.
a = accuracy_score(Y_train, train_pred)
accuracy = accuracy_score(Y_test, test_pred)
print(f'Accuracy in train data: {a}')
print(f'Accuracy in test data: {accuracy}')

Accuracy in train data: 0.9999370990061643
Accuracy in test data: 0.6464888661779447
