In [10]:
import pandas as pd
import numpy as np

# Read the train and test CSVs from the local data/ directory.
# utf-8 is already pandas' default encoding; it is spelled out on both reads for symmetry.
train = pd.read_csv('data/train.csv', encoding='utf8')
test = pd.read_csv('data/test.csv', encoding='utf8')

In [11]:
# Feature engineering: encode the two categorical columns of `train`.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encoder for `keyword` (dense output; categories unseen at fit time
# are encoded as all-zero rows instead of raising).
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Integer label encoder for `location`.
le = LabelEncoder()

# NOTE: despite its name, `location_ohe` holds the one-hot matrix of the
# *keyword* column. The name is kept because later cells refer to it.
keyword_filled = train[['keyword']].fillna("unknown")
ohe.fit(keyword_filled)
location_ohe = ohe.transform(keyword_filled)
print(location_ohe, '2323')

# Overwrite `location` in place with integer codes; missing values are first
# mapped to the literal category "unknown".
location_filled = train['location'].fillna("unknown")
le.fit(location_filled)
train['location'] = le.transform(location_filled)
print(train.head(50))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 2323
    id keyword  location                                               text  \
0    1     NaN      3268  Our Deeds are the Reason of this #earthquake M...   
1    4     NaN      3268             Forest fire near La Ronge Sask. Canada   
2    5     NaN      3268  All residents asked to 'shelter in place' are ...   
3    6     NaN      3268  13,000 people receive #wildfires evacuation or...   
4    7     NaN      3268  Just got sent this photo from Ruby #Alaska as ...   
5    8     NaN      3268  #RockyFire Update => California Hwy. 20 closed...   
6   10     NaN      3268  #flood #disaster Heavy rain causes flash flood...   
7   13     NaN      3268  I'm on top of the hill and I can see a fire in...   
8   14     NaN      3268  There's an emergency evacuation happening now ...   
9   15     NaN      3268  I'm afraid that the tor

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
import numpy as np



# Model input is the text column; the label is the `target` column (0 or 1).
X, y = train['text'], train['target']

# (Re)load the pretrained word2vec model through gensim's downloader; the text
# is vectorized with it further down in this cell.
word2vec_model = api.load("word2vec-google-news-300")



# Average-vector embedding function
def get_average_word2vec(text, model, vector_size=300):
    """Embed a text as the mean of the vectors of its in-vocabulary words.

    Args:
        text: Raw text, tokenized by whitespace. Non-string values (for
            example None, or the float NaN pandas produces for an empty CSV
            cell) are treated as empty text instead of raising.
        model: Any object that supports ``word in model`` and ``model[word]``
            and returns a numeric vector per word.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is found in ``model``.

    Returns:
        A 1-D numpy array: the element-wise mean of the matched word vectors,
        or ``np.zeros(vector_size)`` when nothing matched.
    """
    # A missing text has no tokens; without this guard `.split()` would raise
    # AttributeError on NaN/None.
    if not isinstance(text, str):
        return np.zeros(vector_size)

    vectors = [model[w] for w in text.split() if w in model]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Vectorize the text: one averaged word2vec row per entry of X.
text_vectors = [get_average_word2vec(doc, word2vec_model) for doc in X]
X_vec = np.vstack(text_vectors)

# Append the one-hot block from the feature-engineering cell to the embeddings,
# column-wise. (An earlier experiment also stacked the label-encoded
# `location` values here; that variant is not used.)
feature_blocks = [X_vec, location_ohe]
X_combined = np.hstack(feature_blocks)

# Train/validation split: hold out 20% of the rows, with a fixed seed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    random_state=42,
    test_size=0.2,
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Soft-voting ensemble built from estimators that run in this environment.
# Seeds are pinned (same value as the train/validation split) so the
# validation F1 is repeatable: the random forest draws bootstrap samples and
# feature subsets at random, and SVC(probability=True) shuffles the data for
# its internal cross-validated probability estimates.
lr_model = LogisticRegression(max_iter=1000)
svc_model = SVC(probability=True, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

estimators = [
    ('lr', lr_model),
    ('svc', svc_model),
    ('rf', rf_model),
]
voting_model = VotingClassifier(estimators=estimators, voting='soft')
voting_model.fit(X_train, y_train)

# Score the ensemble on the held-out validation rows.
y_pred = voting_model.predict(X_val)
f1 = f1_score(y_val, y_pred)

# Experiment log. These scores were recorded from earlier runs made without
# estimator seeds, so a re-run may differ slightly.
# ====== ROUND1 ===========
# Stacking ensemble.
# 📊 F1 Score: 0.7044
# ====== ROUND2 ===========
# VotingClassifier ensemble.
# 📊 F1 Score: 0.6951
# ====== ROUND3 ===========
# VotingClassifier + word2vec_model Encoding
# 📊 F1 Score: 0.7556675062972292

# ====== ROUND4 ===========
# VotingClassifier + word2vec_model Encoding + OneHotEncoding Labeling
# 📊 F1 Score: 0.7740345110928513

print(f1)

0.7740345110928513


In [15]:
import pandas as pd
import numpy as np

# Load the test data and take its text column, replacing missing texts with
# empty strings so every entry can be tokenized.
test_df = pd.read_csv("data/test.csv")
X_test = test_df["text"].fillna(value="")

# Word2Vec average-vector function (uses the already-loaded model)
def get_average_word2vec(text, model, vector_size=300):
    """Embed a text as the mean of the vectors of its in-vocabulary words.

    This re-definition shadows the function of the same name defined in the
    training cell.

    Args:
        text: Raw text, tokenized by whitespace. Non-string values (for
            example None, or the float NaN pandas produces for an empty CSV
            cell) are treated as empty text instead of raising.
        model: Any object that supports ``word in model`` and ``model[word]``
            and returns a numeric vector per word.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is found in ``model``.

    Returns:
        A 1-D numpy array: the element-wise mean of the matched word vectors,
        or ``np.zeros(vector_size)`` when nothing matched.
    """
    # A missing text has no tokens; without this guard `.split()` would raise
    # AttributeError on NaN/None.
    if not isinstance(text, str):
        return np.zeros(vector_size)

    vectors = [model[w] for w in text.split() if w in model]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Convert the test texts to vectors with the same averaged-word2vec scheme
# used for the training features.
X_test_vec = np.vstack([get_average_word2vec(text, word2vec_model) for text in X_test])

# One-hot encode the test keywords with the encoder that was fitted on the
# training data. Calling fit_transform here would re-learn the categories from
# the test set, so the columns could shift or change in number relative to the
# layout voting_model was trained on. `transform` keeps the train-time layout;
# keywords not seen during fitting become all-zero rows because the encoder
# was created with handle_unknown='ignore'.
# `test_df` (loaded in this cell) is used for every test column so text,
# keyword and id all come from the same frame.
test_keyword_ohe = ohe.transform(test_df[['keyword']].fillna("unknown"))

# Test-specific names are used so the training-time `location_ohe` and
# `X_combined` are not overwritten (the training cell stays re-runnable).
X_test_combined = np.hstack([X_test_vec, test_keyword_ohe])

# Predict
test_preds = voting_model.predict(X_test_combined)

# Build the submission file: one row per test id with its predicted target.
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': test_preds
})

submission.to_csv("submission.csv", index=False)
