In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn import metrics
import warnings
from scipy import stats
# Silence every Python warning for the rest of the session. This also hides
# deprecation warnings raised by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test tables; both paths are relative to the notebook.
train_csv, test_csv = './train/train.csv', './test/test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Feature matrix: every training column except the identifier and the target.
X_train = np.asarray(df_train.drop(columns=['ID', 'label']))
# Integer-encode the three label strings; a label missing from this mapping
# would become NaN in y_train.
label_to_id = {'围网':0, '拖网':1,'刺网':2}
y_train = np.asarray(df_train['label'].map(label_to_id))

In [4]:
# Standardiser for the feature matrices; it is fitted and applied in the cells below.
scaler = StandardScaler()

In [5]:
# Fit the scaler on the full training matrix.
# NOTE(review): this happens before the cross-validation split further down, so
# the rows later used as validation folds also contribute to the scaling statistics.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
# Standardise the training features. The input is rebuilt from df_train instead
# of reusing X_train, so re-running this cell cannot scale data that has already
# been scaled (X_train = transform(X_train) is not idempotent).
X_train = scaler.transform(np.asarray(df_train.drop(['ID', 'label'], axis = 1)))

In [7]:
# Standardise the test features with the scaler fitted on the training data.
# The columns are selected by name, in the training feature order, which
#   * guarantees the test matrix lines up column-for-column with X_train, and
#   * keeps this cell re-runnable after a `label` column has been added to
#     df_test further down (dropping only 'ID' would then pass an extra column).
# A feature missing from the test file raises a KeyError instead of being
# silently misaligned.
feature_columns = df_train.columns.drop(['ID', 'label'])
X_test = scaler.transform(np.asarray(df_test[feature_columns]))

In [8]:
# Five stratified folds, shuffled with a fixed seed so the split is reproducible.
fold = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

In [9]:
# Hyperparameters handed to LGBMClassifier as keyword arguments.
params = dict(
    n_estimators=5000,            # upper bound on the number of boosting rounds
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,                  # three label values are encoded in y_train
    early_stopping_rounds=100,
)

In [10]:
# Build the LightGBM classifier from the hyperparameters collected in `params`.
clf = LGBMClassifier(**params)

In [11]:
# Cross-validated training. For every fold: fit a classifier on the training
# part, report the macro-F1 on the held-out part, and store that model's hard
# class predictions for the test set in column `index` of `pred` (one column
# per fold, combined by voting in a later cell).
models = []
# One prediction column per fold; the width follows the splitter's configuration
# instead of repeating the fold count as a literal.
pred = np.zeros((len(X_test), fold.get_n_splits()))
for index, (train_idx, val_idx) in enumerate(fold.split(X_train, y_train)):
    # Use a fresh estimator with clf's configuration for each fold. fit() returns
    # the estimator it was called on, so refitting the shared `clf` would leave
    # `models` holding several references to one object (the last fit only).
    model = LGBMClassifier(**clf.get_params())
    # The early-stopping patience is taken from the constructor parameters
    # (`early_stopping_rounds` in `params`). The former fit() argument of 500
    # contradicted that value.
    # NOTE(review): LightGBM gives the params entry precedence over the fit()
    # argument, so 500 should never have been in effect - confirm against the
    # installed version.
    # NOTE(review): newer LightGBM releases no longer accept `verbose` as a
    # fit() keyword - confirm against the installed version.
    model.fit(X_train[train_idx], y_train[train_idx],
              eval_set = [(X_train[val_idx], y_train[val_idx])], verbose = False)
    models.append(model)
    val_pred = model.predict(X_train[val_idx])
    val_y = y_train[val_idx]
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    pred[:, index] = model.predict(X_test)

0 val f1 0.8547892826764252
1 val f1 0.8705988989683403
2 val f1 0.8383105097684931
3 val f1 0.8284900059340705
4 val f1 0.8623271936879967


In [13]:
# Majority vote across folds: each row of `pred` holds one predicted class id
# per fold. The mode is taken along axis 1 in a single call and then flattened,
# which works whether SciPy returns the per-row modes with shape (n, 1) or (n,).
# Indexing a per-row result with [0][0] fails on SciPy versions that return a
# scalar mode for 1-D input.
fold_votes = np.asarray(pred)
pred = [int(label) for label in np.ravel(stats.mode(fold_votes, axis=1)[0])]

In [16]:
# Store the per-row predictions in the `label` column; this modifies df_test in place.
df_test['label'] = pred

In [17]:
# Translate the predicted class ids back to the original label strings.
# The mapping is applied to `pred` rather than to the column being overwritten,
# so the cell can be re-run: mapping the already translated column a second time
# would find none of the keys 0/1/2 and turn every label into NaN.
df_test['label'] = pd.Series(pred, index=df_test.index).map({0:'围网',1:'拖网',2:'刺网'})

In [18]:
# Keep only the identifier and the predicted label for the submission file.
res = df_test.loc[:, ['ID', 'label']]

In [19]:
# Write the submission as UTF-8 CSV with neither an index column nor a header row.
res.to_csv('./result/result.csv', encoding='utf-8', index=False, header=False)