### 전처리 과정
0. url과 해시태그 처리(Tweet의 특성상 존재)
1. 토큰화
2. 정제 및 정규화
3. 어간 또는 표제어 추출
4. 불필요한 토큰 제거

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train.csv", encoding="ANSI")

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Keyword feature

In [4]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
train[train.keyword.isna()].index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
         12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,   23,
         24,   25,   26,   27,   28,   29,   30, 7583, 7584, 7585, 7586, 7587,
       7588, 7589, 7590, 7591, 7592, 7593, 7594, 7595, 7596, 7597, 7598, 7599,
       7600, 7601, 7602, 7603, 7604, 7605, 7606, 7607, 7608, 7609, 7610, 7611,
       7612],
      dtype='int64')

NA값은 처음과 마지막 30개 값들

In [6]:
# 알파벳이 아닌 값을 추출
non_alphabet_values = train['keyword'].str.extractall(r'([^a-zA-Z]+)')[0].unique()

# 결과 출력
non_alphabet_values

array(['%20'], dtype=object)

알파벳이 아닌 값은 %20만 존재한다. 시각적으로 이를 공백으로 대체해야함을 알 수 있다.

In [7]:
train.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [8]:
# %20부분을 공백으로 대체
train.keyword = train.keyword.str.replace('%20', ' ')
train.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'desol

### Location feature

In [9]:
train.location.isna().sum()/len(train)

0.33272034677525286

In [10]:
train.location[~train.location.isna()]

31                         Birmingham
32      Est. September 2012 - Bristol
33                             AFRICA
34                   Philadelphia, PA
35                         London, UK
                    ...              
7575                               TN
7577           #NewcastleuponTyne #UK
7579                Vancouver, Canada
7580                          London 
7581                          Lincoln
Name: location, Length: 5080, dtype: object

In [11]:
train.location.to_csv("location_feature.csv")

#### 문제점
* 전체의 1/3이 결측치
* 지역명의 표기가 획일화되지 않았다. ex) "London, UK"와 "London"

### Text feature

In [12]:
train.text.head().all

<bound method NDFrame._add_numeric_operations.<locals>.all of 0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object>

In [13]:
train[train.text.str.contains("http")]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1
...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1


#### 이메일과 url 제거

In [14]:
import re

# 정규 표현식을 이용한 텍스트 정제 함수 정의
def clean_text(text):
    # 이메일 제거
    text = re.sub(r'\S+@\S+', '', text)
    # URL 제거
    text = re.sub(r'http\S+|www.\S+', '', text)
    return text

# 데이터프레임의 text 변수에 적용
train['text'] = train['text'].apply(clean_text)

train.text.to_csv("text_feature.csv")

#### 태그로 붙어있는 단어들 중 재난과 무관한 단어 제거

In [15]:
# 특수문자와 붙어있는 단어 찾기 함수 정의
def find_special_words(text):
    # @ 또는 #로 시작하는 단어 패턴 매칭
    matches = re.findall(r'[@#]\w+', text)
    return matches

# 데이터프레임의 text 변수에 적용하여 특수문자와 붙어있는 단어 찾기
train['special_words'] = train['text'].apply(find_special_words)

# 결과 출력
print(train[['text', 'special_words']])

                                                   text  \
0     Our Deeds are the Reason of this #earthquake M...   
1                Forest fire near La Ronge Sask. Canada   
2     All residents asked to 'shelter in place' are ...   
3     13,000 people receive #wildfires evacuation or...   
4     Just got sent this photo from Ruby #Alaska as ...   
...                                                 ...   
7608  Two giant cranes holding a bridge collapse int...   
7609  @aria_ahrary @TheTawniest The out of control w...   
7610        M1.94 [01:04 UTC]?5km S of Volcano Hawaii.    
7611  Police investigating after an e-bike collided ...   
7612  The Latest: More Homes Razed by Northern Calif...   

                     special_words  
0                    [#earthquake]  
1                               []  
2                               []  
3                     [#wildfires]  
4            [#Alaska, #wildfires]  
...                            ...  
7608                            

In [16]:
train.keyword.unique()[1:10]

array(['ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon'], dtype=object)

In [17]:
train['keyword'] = train['keyword'].astype('str')

In [18]:
# 1. keyword 변수의 unique 값을 저장
unique_keywords = set(keyword.replace(" ", "").lower() for keyword in train['keyword'].unique())

# 2. @나 #가 앞에 붙어있는 단어를 찾고, unique 값에 포함되지 않는다면 삭제
def clean_text(text, keywords):
    # 텍스트를 소문자로 변환
    text_lower = text.lower()
    not_matched = []
    
    # @나 #로 시작하는 단어 찾기
    matches = re.findall(r'[@#](\w+)', text_lower)
    
    for match in matches:
        if match not in keywords:
            not_matched.append(match)    
    return not_matched

nm = train.text.apply(lambda x: clean_text(x, unique_keywords))
nm = nm.to_list()

In [19]:
# 리스트 평탄화 (중첩된 리스트의 모든 원소를 하나의 리스트로 합침)
flattened_list = [item for sublist in nm for item in sublist]
flattened_list[1:10]

['rockyfire',
 'cafire',
 'raining',
 'florida',
 'tampabay',
 'tampa',
 'we',
 'breaking',
 'bbcmtd']

In [20]:
from collections import Counter

Counter(flattened_list)

Counter({'youtube': 85,
         'news': 77,
         'hot': 31,
         'prebreak': 30,
         'best': 30,
         'nowplaying': 23,
         'islam': 23,
         'hiroshima': 22,
         'gbbo': 18,
         'emmerdale': 17,
         'jobs': 14,
         'job': 12,
         'isis': 12,
         'world': 11,
         'japan': 11,
         'hiring': 11,
         'arianagrande': 11,
         'bbc': 11,
         'cnn': 11,
         'potus': 10,
         'foxnews': 10,
         'business': 10,
         'india': 10,
         'sismo': 10,
         'yyc': 10,
         'breaking': 9,
         'rt': 9,
         'worldnews': 9,
         'change': 9,
         'directioners': 9,
         'usatoday': 9,
         'irandeal': 9,
         'fashion': 9,
         'abstorm': 9,
         'fukushima': 9,
         'nuclear': 9,
         'edm': 8,
         'dnb': 8,
         'beyhive': 8,
         'tcot': 8,
         'handbag': 8,
         'seattle': 8,
         'justinbieber': 8,
         'genocide':

In [21]:
# 1. keyword 변수의 unique 값을 소문자로 변환하고 공백 제거하여 저장
unique_keywords = set(keyword.replace(" ", "").lower() for keyword in train['keyword'].unique())

# 2. @나 #가 앞에 붙어있는 단어를 찾고, unique 값에 포함되지 않는다면 삭제
def clean_text(text, keywords):
    # 텍스트를 소문자로 변환
    text_lower = text.lower()
    
    # @나 #로 시작하는 단어 찾기
    matches = re.findall(r'[@#](\w+)', text_lower)
    
    for match in matches:
        # match에서 공백 제거 후 비교
        match_cleaned = match.replace(" ", "")
        if match_cleaned not in keywords:
            # 원본 텍스트에서 @나 #와 함께 단어를 삭제 (대소문자 구분 없이)
            text = re.sub(rf'[@#]{re.escape(match)}', '', text, flags=re.IGNORECASE)
    
    # 불필요한 공백 제거
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# 텍스트 변수에 함수 적용
train['cleaned_text'] = train['text'].apply(lambda x: clean_text(x, unique_keywords))

# 결과 출력
print(train[['text', 'cleaned_text']])

                                                   text  \
0     Our Deeds are the Reason of this #earthquake M...   
1                Forest fire near La Ronge Sask. Canada   
2     All residents asked to 'shelter in place' are ...   
3     13,000 people receive #wildfires evacuation or...   
4     Just got sent this photo from Ruby #Alaska as ...   
...                                                 ...   
7608  Two giant cranes holding a bridge collapse int...   
7609  @aria_ahrary @TheTawniest The out of control w...   
7610        M1.94 [01:04 UTC]?5km S of Volcano Hawaii.    
7611  Police investigating after an e-bike collided ...   
7612  The Latest: More Homes Razed by Northern Calif...   

                                           cleaned_text  
0     Our Deeds are the Reason of this #earthquake M...  
1                Forest fire near La Ronge Sask. Canada  
2     All residents asked to 'shelter in place' are ...  
3     13,000 people receive #wildfires evacuation or...  
4

#### 특정 단어를 포함하는지에 따라 재난문자 여부가 구분될 수 있을까?

1. 공식적인 출처의 재난 트윗이 많으므로 'lol', 'fuck' 등의 비공식적인 단어의 target 평균이 낮다.
2. 반면 일상 대화에서 잘 쓰지않는 'UTC'나 'News' 등의 단어의 target 평균이 매우 높다.
3. 또한 소문자와 대문자의 사용 여부에 대해서도 재난문자 비율이 갈린다.

고민: 이런 정성적 요소들을 반영하려면 어떻게 해야할까??