In [17]:
import pandas as pd
import numpy as np

# Load the labelled training set and the unlabelled test set from the local data/ directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv', encoding='utf8')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [18]:
# 피처 엔지니어링
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoders for the categorical columns. Both one-hot encoders return dense arrays
# (sparse_output=False) and are configured with handle_unknown='ignore' so that
# transform() tolerates categories that were not present when they were fitted.
le = LabelEncoder()
keyword_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
location_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# --- Train features ---
# Missing keyword/location cells are replaced by the literal "unknown" before the
# encoders are fitted on the training rows.
train_keyword_ohe = keyword_ohe.fit_transform(train[['keyword']].fillna(value="unknown"))
train_location_ohe = location_ohe.fit_transform(train[['location']].fillna(value="unknown"))
# Simple text statistics: URL flag (substring 'http'), length in characters,
# number of whitespace-separated tokens.
train['has_url'] = train['text'].map(lambda s: 1 if 'http' in s else 0)
train['char_count'] = train['text'].map(len)
train['word_count'] = train['text'].map(lambda s: len(s.split()))
train_feats = train[['char_count', 'word_count', 'has_url']].to_numpy()

# --- Test features ---
# Only transform() here: the encoders keep the categories learned from the training rows.
test_keyword_ohe = keyword_ohe.transform(test[['keyword']].fillna(value="unknown"))
test_ohe_location = location_ohe.transform(test[['location']].fillna(value="unknown"))
test['char_count'] = test['text'].map(len)
test['word_count'] = test['text'].map(lambda s: len(s.split()))
test['has_url'] = test['text'].map(lambda s: 1 if 'http' in s else 0)
test_feats = test[['char_count', 'word_count', 'has_url']].to_numpy()


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
import numpy as np



# Model inputs: X is the text column, y is the label column (0 or 1).
X, y = train['text'], train['target']

# Load the pretrained Word2Vec vectors through gensim's downloader API;
# they are used below to turn each text into a fixed-length vector.
word2vec_model = api.load("word2vec-google-news-300")


# Average-vector embedding helper
def get_average_word2vec(text, model, vector_size=300):
    """Embed ``text`` as the mean of the vectors of its in-vocabulary tokens.

    Args:
        text: Raw text, tokenised by splitting on whitespace. Non-string input
            (for example ``None`` or the float NaN pandas yields for a missing
            cell) is treated as empty text instead of raising.
        model: Object supporting ``token in model`` and ``model[token]``; each
            lookup must return a 1-D vector.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is found in ``model``. It should equal the length of the
            model's vectors so all rows can be stacked together.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of the tokens
        found in ``model``, or ``np.zeros(vector_size)`` when there are none.
    """
    # Guard against missing values: calling .split() on a float NaN or None
    # would raise AttributeError and abort vectorising the whole column.
    if not isinstance(text, str):
        return np.zeros(vector_size)

    vectors = [model[token] for token in text.split() if token in model]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Vectorise the text: one averaged Word2Vec row per training document.
X_vec = np.vstack([get_average_word2vec(doc, word2vec_model) for doc in X])

# Final feature matrix, column-wise: [text embedding | location one-hot | keyword one-hot].
X_combined = np.concatenate((X_vec, train_location_ohe, train_keyword_ohe), axis=1)

# Hold out 20% of the rows for validation; the split is seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Build a soft-voting ensemble from three scikit-learn classifiers.
# The stochastic estimators are seeded (same seed as the train/validation split)
# so the validation F1 printed below is reproducible from one run to the next:
#   - RandomForestClassifier draws bootstrap samples and feature subsets at random;
#   - SVC(probability=True) shuffles the data internally when it calibrates
#     probabilities, which soft voting depends on.
lr_model = LogisticRegression(max_iter=1000)
svc_model = SVC(probability=True, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

estimators = [
    ('lr', lr_model),
    ('svc', svc_model),
    ('rf', rf_model)
]
voting_model = VotingClassifier(estimators=estimators, voting='soft')

# Fit on the training split and score on the held-out validation split.
voting_model.fit(X_train, y_train)
y_pred = voting_model.predict(X_val)
f1 = f1_score(y_val, y_pred)

# Refit on every labelled row; this refitted model is the one used for prediction
# afterwards. `f1` above still refers to the model trained on the training split only.
voting_model.fit(X_combined, y)

# Experiment log. These scores were recorded from earlier runs, made before the
# estimators above were seeded.
# ====== ROUND1 ===========
# Using a Stacking ensemble.
# 📊 F1 Score: 0.7044
# ====== ROUND2 ===========
# Using a VotingClassifier ensemble.
# 📊 F1 Score: 0.6951
# ====== ROUND3 ===========
# VotingClassifier + word2vec_model Encoding
# 📊 F1 Score: 0.7556675062972292

# ====== ROUND4 ===========
# VotingClassifier + word2vec_model Encoding + OneHotEncoding Labeling
# 📊 F1 Score: 0.7740345110928513

# ====== ROUND5 ===========
# VotingClassifier + word2vec_model Encoding + OneHotEncoding Labeling(keyword, location)
# 📊 F1 Score: 0.7732463295269169

# ====== ROUND6 ===========
# VotingClassifier + word2vec_model Encoding + OneHotEncoding Labeling(keyword, location, has_url, char_count, word_count)
# 📊 F1 Score: 0.7529411764705882

print(f1)

0.7679738562091504


In [20]:
import pandas as pd
import numpy as np

# Load the test data. NOTE: despite the `y_` prefix, y_test / y_test_vec / y_combined
# hold model inputs (text and features), not labels.
test_df = pd.read_csv("data/test.csv")
y_test = test_df['text']

# Convert the test texts to vectors: one averaged Word2Vec row per document.
y_test_vec = np.vstack([get_average_word2vec(doc, word2vec_model) for doc in y_test])

# Column-wise: [text embedding | location one-hot | keyword one-hot].
y_combined = np.concatenate((y_test_vec, test_ohe_location, test_keyword_ohe), axis=1)

# Predict
test_preds = voting_model.predict(y_combined)

# Build the submission file: one row per test id with its predicted target.
submission = pd.DataFrame({'id': test_df['id'], 'target': test_preds})
submission.to_csv("submission.csv", index=False)
