In [14]:
import pandas as pd
import numpy as np


train = pd.read_csv('data/train.csv', encoding='utf8')
test = pd.read_csv('data/test.csv')

In [None]:
HEADER = train.columns
for header in HEADER:

    print(train[header].isnull().sum())

text_list = train['text'].to_list()
train['keyword'].value_counts()
print(train.columns)
print(test.columns)

0
61
2533
0
0
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer



X = train['text']     # 텍스트 컬럼
y = train['target']   # 정답 (0 또는 1)

vectorizer = TfidfVectorizer(stop_words='english', max_features=500)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_train)

# model = LogisticRegression()

cat_model = CatBoostClassifier(iterations=500, learning_rate=0.04, depth=5, verbose=0, early_stopping_rounds=50)
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.04, max_depth=3, eval_metric='auc')
svm_model = SVC(probability=True, kernel='rbf', C=1.0)
knn_model = KNeighborsClassifier(n_neighbors=5)
lgbm_model = LGBMClassifier(objective='binary', metric='auc')

estimators = [
    ('cat', cat_model),
    ('lgbm', lgbm_model),
    ('xgb', xgb_model)
]

# model = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier(n_estimators=100))
model = VotingClassifier(estimators=estimators, voting='soft')
# model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_val_tfidf)

f1 = f1_score(y_val, y_pred)

print("📊 F1 Score:", round(f1, 4))
# ====== ROUND1 ===========
# Stacking 앙상블 기법사용시.
# 📊 F1 Score: 0.7044
# ====== ROUND2 ===========
# VotingClassifier 앙상블 기법사용시.
# 📊 F1 Score: 0.6951


[LightGBM] [Info] Number of positive: 1972, number of negative: 2595
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4465
[LightGBM] [Info] Number of data points in the train set: 4567, number of used features: 363
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.431793 -> initscore=-0.274538
[LightGBM] [Info] Start training from score -0.274538
📊 F1 Score: 0.6971




In [79]:
test_df = pd.read_csv("data/test.csv")
X_test = test_df['text']
X_test_tfidf = vectorizer.transform(X_test)   # fit 말고 transform!

preds = model.predict(X_test_tfidf)

submission = pd.DataFrame({
    'id': test_df['id'],
    'target': preds
})

submission.to_csv("submission.csv", index=False)


