# Model ensembling
- 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss

In [3]:
data_raw = pd.read_csv('../data/curated/combined_data_div_binned.csv')
data_reduced = pd.read_csv('../data/curated/combined_data_div_binned_reduced.csv')
data_average = pd.read_csv('../data/curated/combined_data_average.csv')
label = pd.read_csv('../data/curated/combined_label.csv')
significant_features = pd.read_csv('../data/curated/significant_features.csv')

In [4]:
significant_features = significant_features['0'].tolist()

In [5]:
# combine data_raw with data_average
data_raw = pd.concat([data_raw, data_average], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data_raw[significant_features], label.iloc[:, 0:2], test_size=0.2, random_state=42)


### 1.投票集成

In [7]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier


In [8]:
model1 = MultiOutputClassifier(LogisticRegression())
model2 = MultiOutputClassifier(DecisionTreeClassifier())
model3 = MultiOutputClassifier(SVC(probability=True))

# 创建基础模型
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = SVC(probability=True)

# 创建投票集成模型
voting_ensemble = VotingClassifier(estimators=[
    ('lr', model1), ('dt', model2), ('svc', model3)],
    voting='soft')  # 使用'soft'投票

# 使用 MultiOutputClassifier 包装投票集成模型
multioutput_ensemble = MultiOutputClassifier(voting_ensemble)

from sklearn.utils.class_weight import compute_sample_weight

# 计算样本权重，为异常值赋予更高的权重
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# 在训练模型时使用样本权重
multioutput_ensemble.fit(X_train, y_train, sample_weight=sample_weights)

# 训练模型
multioutput_ensemble.fit(X_train, y_train)
predictions = multioutput_ensemble.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
hamming = hamming_loss(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Hamming Loss: {hamming}')


Accuracy: 0.5504702194357367
Precision: 0.6144578313253012
Recall: 0.5198776758409785
F1 Score: 0.5632247377139702
Hamming Loss: 0.24796238244514107


### 2.Random Forest

In [10]:
# 创建一个random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_sample_weight

# 计算样本权重，为异常值赋予更高的权重
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# 创建一个random forest model
rf_model = RandomForestClassifier()

# 使用 MultiOutputClassifier 包装 random forest model
multioutput_rf_model = MultiOutputClassifier(rf_model)

# 在训练模型时使用样本权重
multioutput_rf_model.fit(X_train, y_train, sample_weight=sample_weights)

# 训练模型
multioutput_rf_model.fit(X_train, y_train)
predictions = multioutput_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')
hamming = hamming_loss(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Hamming Loss: {hamming}')

Accuracy: 0.6025078369905956
Precision: 0.7155555555555555
Recall: 0.4923547400611621
F1 Score: 0.5833333333333334
Hamming Loss: 0.21630094043887146
