In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the labelled training table; 'Species' is the prediction target.
train_data = pd.read_csv('train_data.csv')
y = train_data['Species']

# Every remaining column becomes a predictor. Categorical columns are expanded
# into indicator columns, dropping the first level of each.
X = pd.get_dummies(train_data.drop(columns=['Species']), drop_first=True)

# Fit a 100-tree random forest with a fixed seed (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Pair each encoded column with its importance score, rank the pairs from
# highest to lowest score, and show the ten best.
importance = model.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importance}
).sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))

                 Feature  Importance
3    Flipper Length (mm)    0.185727
1     Culmen Length (mm)    0.131595
6      Delta 13 C (o/oo)    0.125321
2      Culmen Depth (mm)    0.112716
4          Body Mass (g)    0.101891
5      Delta 15 N (o/oo)    0.079489
9           Island_Dream    0.060366
0          Sample Number    0.035883
10      Island_Torgersen    0.027444
184    Date Egg_11/27/07    0.009734


In [2]:
# Show the eight highest-ranked rows of the importance table.
top_ranked = feature_importance.head(8)
print(top_ranked)

               Feature  Importance
3  Flipper Length (mm)    0.185727
1   Culmen Length (mm)    0.131595
6    Delta 13 C (o/oo)    0.125321
2    Culmen Depth (mm)    0.112716
4        Body Mass (g)    0.101891
5    Delta 15 N (o/oo)    0.079489
9         Island_Dream    0.060366
0        Sample Number    0.035883


只取重要性最大的前5维特征：
1. 它们的重要度都大于0.10（前7维的重要度都大于0.06）
2. 排在后面的很多是 one-hot 编码产生的特征，对分类准确度影响不大，但会使数据维度上升，占用内存并影响训练速度

In [3]:
# List the column labels of the one-hot encoded feature matrix.
encoded_columns = X.columns
print(encoded_columns)

Index(['Sample Number', 'Culmen Length (mm)', 'Culmen Depth (mm)',
       'Flipper Length (mm)', 'Body Mass (g)', 'Delta 15 N (o/oo)',
       'Delta 13 C (o/oo)', 'studyName_PAL0809', 'studyName_PAL0910',
       'Island_Dream',
       ...
       'Date Egg_2011/5/8', 'Date Egg_2011/6/8', 'Date Egg_2011/8/8',
       'Date Egg_2011/9/7', 'Date Egg_2011/9/8', 'Date Egg_2011/9/9',
       'Date Egg_2012/1/9', 'Date Egg_2012/3/7', 'Sex_FEMALE', 'Sex_MALE'],
      dtype='object', length=208)


In [4]:
# Keep only the five highest-ranked features from the importance table.
# An alternative seven-feature set would also add 'Delta 15 N (o/oo)' and
# 'Island_Dream'; the latter requires one-hot encoding the 'Island' column
# first, e.g. pd.get_dummies(train_data, columns=['Island'], drop_first=True).
features = [
    'Flipper Length (mm)',
    'Culmen Length (mm)',
    'Delta 13 C (o/oo)',
    'Culmen Depth (mm)',
    'Body Mass (g)',
]

# Predictors are the selected columns; the target is the species label.
X_train = train_data.loc[:, features]
y_train = train_data['Species']

In [5]:
# Tally the missing values in each selected feature and show the counts.
null_sum = X_train.isna().sum()
print(null_sum)

# Impute only when at least one column has a gap: each missing entry is
# replaced with the mean of its own column.
if null_sum.any():
    mean_values = X_train.mean()
    X_train = X_train.fillna(value=mean_values)

Flipper Length (mm)    0
Culmen Length (mm)     0
Delta 13 C (o/oo)      0
Culmen Depth (mm)      0
Body Mass (g)          0
dtype: int64


In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable.
# NOTE: X_train / y_train are rebound to the training portion here, so
# re-running this cell on its own splits the already-reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion only.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out validation rows and print per-class metrics.
y_pred = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

                                           precision    recall  f1-score   support

      Adelie Penguin (Pygoscelis adeliae)       1.00      1.00      1.00        21
Chinstrap penguin (Pygoscelis antarctica)       1.00      1.00      1.00        12
        Gentoo penguin (Pygoscelis papua)       1.00      1.00      1.00        13

                                 accuracy                           1.00        46
                                macro avg       1.00      1.00      1.00        46
                             weighted avg       1.00      1.00      1.00        46



In [7]:
# Load the unlabelled test table and keep the same feature columns the model
# was trained on.
test_data = pd.read_csv('test_data.csv')

# Fill missing values with the TRAINING-set column means rather than the test
# set's own means: imputation statistics are part of the fitted pipeline, and
# deriving them from the test rows would make each prediction depend on which
# other rows happen to be in the test file. fillna() returns a new frame, so
# test_data itself is left untouched.
X_test = test_data[features].fillna(X_train.mean())

# Predict a species label for every test row.
test_predictions = model.predict(X_test)
print(test_predictions)

['Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Chinstrap penguin (Pygoscelis antarctica)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (Pygoscelis adeliae)'
 'Adelie Penguin (P

In [8]:
# Persist the predictions as plain text, one prediction per line.
output_file_name = "predictions.txt"

with open(output_file_name, 'w') as file:
    # A 2-D prediction array has several values per sample; those are joined
    # with single spaces. A 1-D array holds one label per sample.
    has_multiple_outputs = test_predictions.ndim > 1
    file.writelines(
        (" ".join(map(str, entry)) if has_multiple_outputs else str(entry)) + "\n"
        for entry in test_predictions
    )

print(f"预测结果已成功写入到 {output_file_name} 中。")

预测结果已成功写入到 predictions.txt 中。
