## Libraries

In [56]:
# Libraries for text preprocessing.
import pandas as pd
import re

import string

import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist

stop = set(stopwords.words('english'))

In [57]:
train = pd.read_csv("test.csv")

In [58]:
train.text[0]

'Just happened a terrible car crash'

## 텍스트 전처리

#### 불필요한 부분들 제거

In [59]:
# Some basic helper functions to clean text by removing urls, emojis, html tags and punctuations.

# 사이트 주소 제거
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

# 이모지 제거 (원래 없음)
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# html 양식 제거
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

# 구두점 제거
def remove_punct(text):
    punctuation_without_hashtags = string.punctuation.replace('@', '').replace('#', '')
        # 해시태그와 멘션은 별도로 제외함.
    table = str.maketrans('', '', punctuation_without_hashtags)
    return text.translate(table)

# Applying helper functions

train['text_clean'] = train['text'].apply(lambda x: remove_URL(x))
train['text_clean'] = train['text_clean'].apply(lambda x: remove_emoji(x))
train['text_clean'] = train['text_clean'].apply(lambda x: remove_html(x))
train['text_clean'] = train['text_clean'].apply(lambda x: remove_punct(x))

#### 토큰화

In [60]:
# Tokenizing the tweet base texts.
tknzr = TweetTokenizer(reduce_len=True)
    # reduce_len 옵션을 사용하여 cooool => cool 등으로 토큰을 단순화
train['tokenized'] = train['text_clean'].apply(lambda x: tknzr.tokenize(x))
train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,..."


#### 토큰화 결과를 소문자화

In [61]:
# Lower casing clean text.

train['lower'] = train['tokenized'].apply(
    lambda x: [word.lower() for word in x])

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,..."


#### 토큰에서 불용어 제거

In [62]:
# Removing stopwords.

train['stopwords_removed'] = train['lower'].apply(
    lambda x: [word for word in x if word not in stop])

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]"


#### nltk.tag.pos_tag()로 품사 태깅

In [63]:
# Applying part of speech tags.

train['pos_tags'] = train['stopwords_removed'].apply(nltk.tag.pos_tag)

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (..."
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ..."


#### 태깅된 것을 wordnet 형식으로 변환

**Adjectives (형용사):**  
NLTK 태그: JJ, JJR, JJS 등으로 시작하는 태그.  
WordNet 태그: wordnet.ADJ로 변환.

**Verbs (동사):**  
NLTK 태그: VB, VBD, VBG, VBN, VBP, VBZ 등으로 시작하는 태그.  
WordNet 태그: wordnet.VERB로 변환.  

**Nouns (명사):**  
NLTK 태그: NN, NNS, NNP, NNPS 등으로 시작하는 태그.  
WordNet 태그: wordnet.NOUN으로 변환.

**Adverbs (부사):**  
NLTK 태그: RB, RBR, RBS 등으로 시작하는 태그.  
WordNet 태그: wordnet.ADV로 변환.  

**Default Case:**  
위의 어떤 경우에도 해당하지 않는 태그는 기본적으로 명사(wordnet.NOUN)로 변환됩니다.

In [64]:
# Converting part of speeches to wordnet format.

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


train['wordnet_pos'] = train['pos_tags'].apply(
    lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras..."
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J...","[(heard, r), (#earthquake, v), (different, a),..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,...","[(apocalypse, n), (lighting, v), (#spokane, n)..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,..."


#### 어근 추출

In [65]:
# Applying word lemmatizer.

lemmatizer = WordNetLemmatizer()

train['lemmatized'] = train['wordnet_pos'].apply(
    lambda x: [lemmatizer.lemmatize(word, tag) for word, tag in x])

In [66]:
train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras...","[happen, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J...","[(heard, r), (#earthquake, v), (different, a),...","[heard, #earthquake, different, city, stay, sa..."
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),...","[forest, fire, spot, pond, geese, flee, across..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,...","[(apocalypse, n), (lighting, v), (#spokane, n)...","[apocalypse, light, #spokane, #wildfires]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,...","[typhoon, soudelor, kill, 28, china, taiwan]"


In [67]:
# 어근 중 불용어가 있는지 확인
train['lemmatized'].apply(lambda x: [print(word) for word in x if word in stop])

there
do
there
he
there
do
he
do
do
re
few
here
there
do
as
he
o
do
there
there
he
as
there
as
do
as
as
who
he
he
as
as
do
as
he
can
do
d
who
there
being
he
as
he
he
he
he
being
as
as
do
there
do
do
there
so
there
here


0       []
1       []
2       []
3       []
4       []
        ..
3258    []
3259    []
3260    []
3261    []
3262    []
Name: lemmatized, Length: 3263, dtype: object

#### 어근 중 불용어 제거 + 문자열로 반환

In [68]:
# 추출한 어근 중 불용어 제거
train['lemmatized'] = train['lemmatized'].apply(
    lambda x: [word for word in x if word not in stop])

train['lemma_str'] = [' '.join(map(str, l)) for l in train['lemmatized']]
    # [' '.join(map(str, l)) for l in train['lemmatized']]: 이 구문은 train['lemmatized']의 각 요소 l에 대해 ' '.join(map(str, l))를 실행하여 새로운 리스트를 생성
    # 리스트 컴프리헨션

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras...","[happen, terrible, car, crash]",happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J...","[(heard, r), (#earthquake, v), (different, a),...","[heard, #earthquake, different, city, stay, sa...",heard #earthquake different city stay safe eve...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),...","[forest, fire, spot, pond, geese, flee, across...",forest fire spot pond geese flee across street...
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,...","[(apocalypse, n), (lighting, v), (#spokane, n)...","[apocalypse, light, #spokane, #wildfires]",apocalypse light #spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,...","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan


In [69]:
# 남은 불용어가 있는지 다시 한번 확인
train['lemmatized'].apply(lambda x: [print(word) for word in x if word in stop])

0       []
1       []
2       []
3       []
4       []
        ..
3258    []
3259    []
3260    []
3261    []
3262    []
Name: lemmatized, Length: 3263, dtype: object

#### 길이가 2 이하인 단어들 제거

In [70]:
def remove_short(text):
    return ' '.join([word for word in text.split() if len(word) > 2])

train['filtered_lemma_str'] = train['lemma_str'].apply(remove_short)
train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras...","[happen, terrible, car, crash]",happen terrible car crash,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J...","[(heard, r), (#earthquake, v), (different, a),...","[heard, #earthquake, different, city, stay, sa...",heard #earthquake different city stay safe eve...,heard #earthquake different city stay safe eve...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),...","[forest, fire, spot, pond, geese, flee, across...",forest fire spot pond geese flee across street...,forest fire spot pond geese flee across street...
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,...","[(apocalypse, n), (lighting, v), (#spokane, n)...","[apocalypse, light, #spokane, #wildfires]",apocalypse light #spokane #wildfires,apocalypse light #spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,...","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan,typhoon soudelor kill china taiwan


#### 키워드를 텍스트 뒤에 이어붙이기

In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  3263 non-null   int64 
 1   keyword             3237 non-null   object
 2   location            2158 non-null   object
 3   text                3263 non-null   object
 4   text_clean          3263 non-null   object
 5   tokenized           3263 non-null   object
 6   lower               3263 non-null   object
 7   stopwords_removed   3263 non-null   object
 8   pos_tags            3263 non-null   object
 9   wordnet_pos         3263 non-null   object
 10  lemmatized          3263 non-null   object
 11  lemma_str           3263 non-null   object
 12  filtered_lemma_str  3263 non-null   object
dtypes: int64(1), object(12)
memory usage: 331.5+ KB


In [72]:
import pandas as pd

# keyword 변수가 빈 값이 아닌 경우, filtered_lemma_str의 뒤에 이어붙이기
train['combined_str'] = train.apply(
    lambda row: str(row['filtered_lemma_str']).strip() + ' ' + str(row['keyword']).replace('%20', '').strip()
    if pd.notna(row['keyword']) and row['keyword'] else str(row['filtered_lemma_str']).strip(), axis=1)

train.head()

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]","[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras...","[happen, terrible, car, crash]",happen terrible car crash,happen terrible car crash,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",Heard about #earthquake is different cities st...,"[Heard, about, #earthquake, is, different, cit...","[heard, about, #earthquake, is, different, cit...","[heard, #earthquake, different, cities, stay, ...","[(heard, RB), (#earthquake, VB), (different, J...","[(heard, r), (#earthquake, v), (different, a),...","[heard, #earthquake, different, city, stay, sa...",heard #earthquake different city stay safe eve...,heard #earthquake different city stay safe eve...,heard #earthquake different city stay safe eve...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),...","[forest, fire, spot, pond, geese, flee, across...",forest fire spot pond geese flee across street...,forest fire spot pond geese flee across street...,forest fire spot pond geese flee across street...
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting #Spokane #wildfires,"[Apocalypse, lighting, #Spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[apocalypse, lighting, #spokane, #wildfires]","[(apocalypse, NN), (lighting, VBG), (#spokane,...","[(apocalypse, n), (lighting, v), (#spokane, n)...","[apocalypse, light, #spokane, #wildfires]",apocalypse light #spokane #wildfires,apocalypse light #spokane #wildfires,apocalypse light #spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...","[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,...","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan,typhoon soudelor kill china taiwan,typhoon soudelor kill china taiwan


In [73]:
train[40:43]

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
40,125,accident,"Frankfurt, Germany",@DaveOshry @Soembie So if I say that I met her...,@DaveOshry @Soembie So if I say that I met her...,"[@DaveOshry, @Soembie, So, if, I, say, that, I...","[@daveoshry, @soembie, so, if, i, say, that, i...","[@daveoshry, @soembie, say, met, accident, wee...","[(@daveoshry, JJ), (@soembie, NNS), (say, VBP)...","[(@daveoshry, a), (@soembie, n), (say, v), (me...","[@daveoshry, @soembie, say, meet, accident, we...",@daveoshry @soembie say meet accident week wou...,@daveoshry @soembie say meet accident week wou...,@daveoshry @soembie say meet accident week wou...
41,127,accident,"Gresham, OR",ACCIDENT - HIT AND RUN - COLD at 500 BLOCK OF ...,ACCIDENT HIT AND RUN COLD at 500 BLOCK OF SE...,"[ACCIDENT, HIT, AND, RUN, COLD, at, 500, BLOCK...","[accident, hit, and, run, cold, at, 500, block...","[accident, hit, run, cold, 500, block, se, vis...","[(accident, NN), (hit, VBD), (run, VBN), (cold...","[(accident, n), (hit, v), (run, v), (cold, r),...","[accident, hit, run, cold, 500, block, se, vis...",accident hit run cold 500 block se vista ter g...,accident hit run cold 500 block vista ter gres...,accident hit run cold 500 block vista ter gres...
42,140,accident,,@Calum5SOS this happened on accident but I lik...,@Calum5SOS this happened on accident but I lik...,"[@Calum5SOS, this, happened, on, accident, but...","[@calum5sos, this, happened, on, accident, but...","[@calum5sos, happened, accident, like]","[(@calum5sos, NN), (happened, VBD), (accident,...","[(@calum5sos, n), (happened, v), (accident, n)...","[@calum5sos, happen, accident, like]",@calum5sos happen accident like,@calum5sos happen accident like,@calum5sos happen accident like accident


In [74]:
train[train['filtered_lemma_str'].apply(lambda x: len(x))==3]

Unnamed: 0,id,keyword,location,text,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
7,22,,,Hey! How are you?,Hey How are you,"[Hey, How, are, you]","[hey, how, are, you]",[hey],"[(hey, NN)]","[(hey, n)]",[hey],hey,hey,hey


In [75]:
train = train[["id", "combined_str"]]
train.head()

Unnamed: 0,id,combined_str
0,0,happen terrible car crash
1,2,heard #earthquake different city stay safe eve...
2,3,forest fire spot pond geese flee across street...
3,9,apocalypse light #spokane #wildfires
4,11,typhoon soudelor kill china taiwan


In [76]:
train.loc[train.combined_str.apply(len)==0, "combined_str"] = "good"

In [77]:
train.to_csv("test_cleaned.csv", index=False)