In [1]:
!pip install tokenization
!pip install wordcloud
!pip install category_encoders
!pip install vaderSentiment

Collecting tokenization
  Downloading tokenization-1.0.7-py3-none-any.whl (10 kB)
Installing collected packages: tokenization
Successfully installed tokenization-1.0.7
Collecting category_encoders
  Downloading category_encoders-2.4.0-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.1 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.0
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 4.2 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [1]:
# 데이터 NLP용 패키지
import gc
import re
import string
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# 데이터 전처리용 패키지
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm.notebook import tqdm
tqdm.pandas()

# 데이터 분석용 패키지
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 데이터 시각화
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  import pandas.util.testing as tm


In [2]:
# 데이터 호출
df_train = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/train.csv", dtype={'id': np.int16, 'target': np.int8})
df_test = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/test.csv", dtype={'id': np.int16})

print('Training Set Shape = {}'.format(df_train.shape))
print('Test Set Shape = {}'.format(df_test.shape))

Training Set Shape = (7613, 5)
Test Set Shape = (3263, 4)


In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# 데이터 결측치 확인
df_train.isnull().sum()/len(df_train)

id          0.000000
keyword     0.008013
location    0.332720
text        0.000000
target      0.000000
dtype: float64

In [6]:
df_test.isnull().sum()/len(df_test)

id          0.000000
keyword     0.007968
location    0.338645
text        0.000000
dtype: float64

In [7]:
# 데이터 타겟 분포확인
df_train["target"].value_counts(normalize = True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [8]:
# 지역명 전처리
# geolocator = Nominatim(user_agent = "my-application")
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=3, max_retries=5)
# df_train["location ecode"] = df_train["location"].progress_apply(geocode, language = "en")
# df_train["coordinates"] = df_train["location ecode"].apply(lambda loc: tuple(loc.point) if loc else None)
# df_train["state"] = df_train["location ecode"].apply(lambda loc: loc[0].split(",")[0] if loc else None)
# df_train["country"] = df_train["location ecode"].apply(lambda loc: loc[0].split(",")[-1] if loc else None)

# df_test["location ecode"] = df_test["location"].progress_apply(geocode, language = "en")
# df_test["coordinates"] = df_test["location ecode"].apply(lambda loc: tuple(loc.point) if loc else None)
# df_test["state"] = df_test["location ecode"].apply(lambda loc: loc[0].split(",")[0] if loc else None)
# df_test["country"] = df_test["location ecode"].apply(lambda loc: loc[0].split(",")[-1] if loc else None)

# df_train.to_csv("/content/drive/MyDrive/python file/boostcamp/project2/train_encoding.csv")
# df_test.to_csv("/content/drive/MyDrive/python file/boostcamp/project2/test_encoding.csv")

In [9]:
# 데이터 호출
# df_train = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/train_encoding.csv", dtype={'id': np.int16, 'target': np.int8})
# df_test = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/test_encoding.csv", dtype={'id': np.int16})

# print('Training Set Shape = {}'.format(df_train.shape))
# print('Test Set Shape = {}'.format(df_test.shape))

In [10]:
# 문자열의 특수 기호등을 제거
def RemovePunct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    p = re.compile("[0-9]+")
    m = p.match(text)
    text = re.sub('[0-9]+', '', text)
    return text

# 숫자의 유무를 확인
def FindNumber(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    p = re.compile("[0-9]+")
    m = p.match(text)
    return int(bool(m))

# 위 함수를 적용하여 text데이터 전처리
df_train['text punct'] = df_train['text'].apply(lambda x: RemovePunct(x))
df_train['using number'] = df_train['text'].apply(lambda x: FindNumber(x))

df_test['text punct'] = df_test['text'].apply(lambda x: RemovePunct(x))
df_test['using number'] = df_test['text'].apply(lambda x: FindNumber(x))

df_train.head(10)

Unnamed: 0,id,keyword,location,text,target,text punct,using number
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,0
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,0
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,RockyFire Update California Hwy closed in bo...,0
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster Heavy rain causes flash floodin...,0
7,13,,,I'm on top of the hill and I can see a fire in...,1,Im on top of the hill and I can see a fire in ...,0
8,14,,,There's an emergency evacuation happening now ...,1,Theres an emergency evacuation happening now i...,0
9,15,,,I'm afraid that the tornado is coming to our a...,1,Im afraid that the tornado is coming to our area,0


In [11]:
# 감정분석
analyser = SentimentIntensityAnalyzer()
df_train["sentiment"] = df_train["text punct"].apply(lambda x : analyser.polarity_scores(x)["compound"])
df_test["sentiment"] = df_test["text punct"].apply(lambda x : analyser.polarity_scores(x)["compound"])

df_train.head(10)

Unnamed: 0,id,keyword,location,text,target,text punct,using number,sentiment
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,0,0.2732
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,0,-0.34
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,0,-0.296
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,1,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,0,0.0
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,RockyFire Update California Hwy closed in bo...,0,-0.34
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster Heavy rain causes flash floodin...,0,-0.6249
7,13,,,I'm on top of the hill and I can see a fire in...,1,Im on top of the hill and I can see a fire in ...,0,-0.1531
8,14,,,There's an emergency evacuation happening now ...,1,Theres an emergency evacuation happening now i...,0,-0.3818
9,15,,,I'm afraid that the tornado is coming to our a...,1,Im afraid that the tornado is coming to our area,0,0.0


# Count vector(bag of word)방법 사용

In [12]:
# 정제된 text 데이터 벡터화
stopword = nltk.corpus.stopwords.words('english')

count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_train["text punct"])
test_vectors = count_vectorizer.transform(df_test["text punct"])

In [13]:
# 벡터화를 시킴으로써 특성이 너무 많이 생김으로 200개의 특성만 선별
target = "target"

skb = SelectKBest(chi2, k = 200)
train_vectors = skb.fit_transform(train_vectors, df_train[target])
test_vectors = skb.transform(test_vectors)

In [14]:
# 독립변수와 종속변수의 분류
X_train = df_train.drop(columns = [target, "id", "text", "text punct"]).merge(pd.DataFrame(train_vectors.toarray(), columns = skb.get_feature_names_out(count_vectorizer.get_feature_names_out())), left_index = True, right_index = True)
y_train = df_train[target]
X_test = df_test.drop(columns = ["id", "text", "text punct"]).merge(pd.DataFrame(test_vectors.toarray(), columns = skb.get_feature_names_out(count_vectorizer.get_feature_names_out())), left_index = True, right_index = True)

In [15]:
# 특성 전처리
pipe_pre =  make_pipeline(TargetEncoder(), SimpleImputer())

X_train_e = pipe_pre.fit_transform(X_train, y_train)
X_test_e = pipe_pre.transform(X_test)

In [18]:
# XGB 분류기를 사용하여 학습
clf = XGBClassifier()

search_space = [
    {"n_estimators": range(100, 500, 100),
     "max_depth": range(1, 20, 5),
     "learning_rate": np.arange(0.01, 0.05, 0.01),
     "random_state": [29],
     "tree_method": ["gpu_hist"]}  # For using the GPU.
]

# CV = 5
kfold = StratifiedKFold(n_splits=5, random_state=29, shuffle=True)

grid = GridSearchCV(estimator = clf, 
                    param_grid = search_space,
                    cv = kfold,
                    scoring = "f1",  # Accuracy
                    return_train_score = True,
                    n_jobs = -1,
                    verbose = 2,
                    refit = True)

grid.fit(X_train_e, y_train)

print('최적 하이퍼파라미터: ', grid.best_params_)
print('f1: ', grid.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
최적 하이퍼파라미터:  {'learning_rate': 0.04, 'max_depth': 6, 'n_estimators': 400, 'random_state': 29, 'tree_method': 'gpu_hist'}
f1:  0.7593605145279272


In [17]:
submission = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/sample_submission.csv", index_col = ["id"])
y_pred = grid.predict(X_test_e)
submission["target"] = y_pred
submission.to_csv("/content/drive/MyDrive/python file/boostcamp/project2/submission_result.csv")

# TF-IDF 기법

In [34]:
# 정제된 text 데이터 벡터화
stopword = nltk.corpus.stopwords.words('english')

tfidv = TfidfVectorizer(stop_words = stopword)
train_vectors = tfidv.fit_transform(df_train["text punct"])
test_vectors = tfidv.transform(df_test["text punct"])

In [36]:
# 독립변수와 종속변수의 분류
target = "target"
X_train = df_train.drop(columns = [target, "id", "text", "text punct"]).merge(pd.DataFrame(train_vectors.toarray(), columns = tfidv.get_feature_names_out()), left_index = True, right_index = True)
y_train = df_train[target]
X_test = df_test.drop(columns = ["id", "text", "text punct"]).merge(pd.DataFrame(test_vectors.toarray(), columns = tfidv.get_feature_names_out()), left_index = True, right_index = True)

In [37]:
# 특성이 너무 많으므로, 전처리를 통해 중요한 특성 200개만 선별 및 전처리
pipe_pre =  make_pipeline(TargetEncoder(), SimpleImputer(), SelectKBest(chi2, k = 200))

X_train_e = pipe_pre.fit_transform(X_train, y_train)
X_test_e = pipe_pre.transform(X_test)

In [38]:
# XGB 분류기를 사용하여 학습
clf = XGBClassifier()

search_space = [
    {"n_estimators": range(100, 500, 100),
     "max_depth": range(1, 20, 5),
     "learning_rate": np.arange(0.01, 0.05, 0.01),
     "random_state": [29],
     "tree_method": ["gpu_hist"]}  # For using the GPU.
]

# CV = 5
kfold = StratifiedKFold(n_splits=5, random_state=29, shuffle=True)

grid = GridSearchCV(estimator = clf, 
                    param_grid = search_space,
                    cv = kfold,
                    scoring = "f1",  # Accuracy
                    return_train_score = True,
                    n_jobs = -1,
                    verbose = 2,
                    refit = True)

grid.fit(X_train_e, y_train)

print('최적 하이퍼파라미터: ', grid.best_params_)
print('f1: ', grid.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
최적 하이퍼파라미터:  {'learning_rate': 0.03, 'max_depth': 11, 'n_estimators': 300, 'random_state': 29, 'tree_method': 'gpu_hist'}
f1:  0.7429950638423961


In [39]:
submission = pd.read_csv("/content/drive/MyDrive/python file/boostcamp/project2/sample_submission.csv", index_col = ["id"])
y_pred = grid.predict(X_test_e)
submission["target"] = y_pred
submission.to_csv("/content/drive/MyDrive/python file/boostcamp/project2/submission_result.csv")