## Libraries

In [42]:
# Libraries for text preprocessing.
import pandas as pd
import re

import string

import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist

# English stopwords as a set, used for the membership tests in the stopword-removal
# cells below. Needs the NLTK 'stopwords' corpus (see the commented-out download lines above).
stop = set(stopwords.words('english'))

In [43]:
# Load the train and test CSVs from the current working directory.
# NOTE(review): "ANSI" looks like the Windows-only codec alias for the system code page,
# so this line presumably fails on Linux/macOS and decodes differently per machine, while
# test.csv is read with the default encoding. Confirm the file's real encoding
# (probably UTF-8) and use the same explicit value for both reads.
train = pd.read_csv("train.csv", encoding="ANSI")
test = pd.read_csv("test.csv")

In [44]:
# Show the first entry of the raw `text` column.
train.text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

## 텍스트 전처리

#### 불필요한 부분들 제거

In [45]:
# Some basic helper functions to clean text by removing urls, emojis, html tags and punctuations.

# Remove web addresses
def remove_URL(text):
    """Return `text` with every link deleted.

    A link starts with ``http://``, ``https://`` or ``www.`` and runs up to
    the next whitespace character; the surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Remove emoji (the original author notes the dataset contains none).
# The pattern is compiled once here rather than on every call.
# The previous character class contained the range U+24C2..U+1F251, which spans most
# of the Basic Multilingual Plane and therefore also deleted Hangul, CJK and many
# other non-emoji characters. It is replaced by the explicit symbol ranges below.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (regional indicator symbols)
    '\U00002702-\U000027B0'  # dingbats (subset)
    '\U00002600-\U000026FF'  # miscellaneous symbols
    '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
    '\U000024C2'             # circled latin capital letter M
    '\U0000FE0F'             # variation selector-16 (emoji presentation)
    '\U0001F000-\U0001F251'  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    ']+')


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only the code-point ranges listed in `_EMOJI_PATTERN` are removed; letters
    of any script (Latin, Hangul, CJK, ...) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The string with each run of matching characters replaced by ''.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip HTML markup
def remove_html(text):
    """Delete HTML tags and character entities from `text`.

    Matches ``<...>`` tags (non-greedy, within one line) and entities such as
    ``&amp;``, ``&#39;`` or ``&#x27;``. Entity names and hex digits are only
    matched in lowercase.
    """
    pattern = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(pattern).sub('', text)

# Strip punctuation
def remove_punct(text):
    """Delete ASCII punctuation from `text`, keeping '@' and '#'.

    Mentions and hashtags are deliberately left intact, so their marker
    characters are excluded from the set of characters to delete.
    """
    doomed = ''.join(ch for ch in string.punctuation if ch not in '@#')
    return text.translate(str.maketrans('', '', doomed))

# Run the cleaning helpers in order: URLs -> emoji -> HTML -> punctuation.
# URLs go first so their punctuation is still intact when the URL pattern runs.
train['text_clean'] = (
    train['text']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
)

#### 토큰화

In [46]:
# Tokenize the cleaned tweets with NLTK's tweet-aware tokenizer.
# reduce_len=True shortens runs of three or more repeated characters to exactly
# three (e.g. "cooool" -> "coool"), which collapses elongated spellings.
tknzr = TweetTokenizer(reduce_len=True)
train['tokenized'] = train['text_clean'].apply(tknzr.tokenize)
train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al..."


#### 토큰화 결과를 소문자화

In [47]:
# Lowercase every token so later stopword lookups are case-insensitive.
train['lower'] = train['tokenized'].apply(
    lambda tokens: list(map(str.lower, tokens)))

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al..."


#### 토큰에서 불용어 제거

In [48]:
# Keep only the tokens that are not in the English stopword set.
train['stopwords_removed'] = train['lower'].apply(
    lambda tokens: [token for token in tokens if token not in stop])

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild..."


#### nltk.tag.pos_tag()로 품사 태깅

In [49]:
# Attach a part-of-speech tag to every remaining token.
# NOTE(review): tagging runs on lowercased text with stopwords already removed, which
# presumably hurts tagger accuracy (the output below tags "#earthquake" as VBP and
# "forest" as JJS) - consider tagging the original tokens instead.
train['pos_tags'] = train['stopwords_removed'].map(nltk.tag.pos_tag)

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N..."


#### 태깅된 것을 wordnet 형식으로 변환

**Adjectives (형용사):**  
NLTK 태그: JJ, JJR, JJS 등으로 시작하는 태그.  
WordNet 태그: wordnet.ADJ로 변환.

**Verbs (동사):**  
NLTK 태그: VB, VBD, VBG, VBN, VBP, VBZ 등으로 시작하는 태그.  
WordNet 태그: wordnet.VERB로 변환.  

**Nouns (명사):**  
NLTK 태그: NN, NNS, NNP, NNPS 등으로 시작하는 태그.  
WordNet 태그: wordnet.NOUN으로 변환.

**Adverbs (부사):**  
NLTK 태그: RB, RBR, RBS 등으로 시작하는 태그.  
WordNet 태그: wordnet.ADV로 변환.  

**Default Case:**  
위의 어떤 경우에도 해당하지 않는 태그는 기본적으로 명사(wordnet.NOUN)로 변환됩니다.

In [50]:
# Converting part of speeches to wordnet format.

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first letter of `tag` is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag (including an empty one) falls
    back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)


# Replace each (token, NLTK tag) pair with a (token, WordNet POS) pair.
train['wordnet_pos'] = train['pos_tags'].apply(
    lambda tagged: [(token, get_wordnet_pos(nltk_tag)) for token, nltk_tag in tagged])

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP...","[(deeds, n), (reason, n), (#earthquake, v), (m..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[(forest, a), (fire, n), (near, n), (la, a), (..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[(residents, n), (asked, v), (shelter, a), (pl..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#...","[(13000, n), (people, n), (receive, a), (#wild..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[(got, v), (sent, a), (photo, n), (ruby, n), (..."


#### 표제어 추출 (Lemmatization)

In [51]:
# Lemmatize every token, passing its WordNet POS so the right base form is chosen.
lemmatizer = WordNetLemmatizer()

train['lemmatized'] = train['wordnet_pos'].apply(
    lambda pairs: [lemmatizer.lemmatize(*pair) for pair in pairs])

In [52]:
# Preview the frame including the new `lemmatized` column.
train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP...","[(deeds, n), (reason, n), (#earthquake, v), (m...","[deed, reason, #earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[(forest, a), (fire, n), (near, n), (la, a), (...","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[(residents, n), (asked, v), (shelter, a), (pl...","[resident, ask, shelter, place, notify, office..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#...","[(13000, n), (people, n), (receive, a), (#wild...","[13000, people, receive, #wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[(got, v), (sent, a), (photo, n), (ruby, n), (...","[get, sent, photo, ruby, #alaska, smoke, #wild..."


In [53]:
# Check whether any lemmas are stopwords: every offending lemma is printed.
# The comprehension is used only for its print side effect, so the displayed
# Series just holds lists of None (one per printed word).
train['lemmatized'].apply(
    lambda lemmas: [print(lemma) for lemma in lemmas if lemma in stop])

there
do
there
he
who
he
here
there
there
do
who
here
he
there
m
he
he
do
there
there
there
do
he
he
as
as
do
he
as
as
he
as
there
as
down
he
do
he
here
d
out
do
he
he
do
as
as
as
here
as
there
there
he
do
there
there
d
he
as
there
do
there
so
do
can
as
as
as
there
there
do
he
as
there
there
he
there
he
there
do
he
he
there
as
as
do
he
there
he
m
ma
o
o
o
who
do
he
do
as
who
do
he
do
he
do
he
here
do
do
do
as
as
here
he
he
here
do
as
do
m
as
he
he
as
as
off
there
there
there
he
do
he
he
there
there
there
he
do
do
he
he
as
there
as
he
he
as
there
do
do
as
there
he
he
m
there
he


0       []
1       []
2       []
3       []
4       []
        ..
7608    []
7609    []
7610    []
7611    []
7612    []
Name: lemmatized, Length: 7613, dtype: object

#### 어근 중 불용어 제거 + 문자열로 반환

In [54]:
# Drop the lemmas that are stopwords (the check above printed several).
train['lemmatized'] = train['lemmatized'].apply(
    lambda lemmas: [lemma for lemma in lemmas if lemma not in stop])

# Build one space-separated string per row from its list of lemmas.
train['lemma_str'] = [
    ' '.join(str(lemma) for lemma in lemmas)
    for lemmas in train['lemmatized']
]

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP...","[(deeds, n), (reason, n), (#earthquake, v), (m...","[deed, reason, #earthquake, may, allah, forgiv...",deed reason #earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[(forest, a), (fire, n), (near, n), (la, a), (...","[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[(residents, n), (asked, v), (shelter, a), (pl...","[resident, ask, shelter, place, notify, office...",resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#...","[(13000, n), (people, n), (receive, a), (#wild...","[13000, people, receive, #wildfires, evacuatio...",13000 people receive #wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[(got, v), (sent, a), (photo, n), (ruby, n), (...","[get, sent, photo, ruby, #alaska, smoke, #wild...",get sent photo ruby #alaska smoke #wildfires p...


In [55]:
# Check once more that no stopwords are left: nothing should be printed and
# every list in the displayed Series should be empty.
train['lemmatized'].apply(
    lambda lemmas: [print(lemma) for lemma in lemmas if lemma in stop])

0       []
1       []
2       []
3       []
4       []
        ..
7608    []
7609    []
7610    []
7611    []
7612    []
Name: lemmatized, Length: 7613, dtype: object

#### 길이가 2 이하인 단어들 제거

In [56]:
def remove_short(text, min_length=3):
    """Drop whitespace-separated words shorter than `min_length` characters.

    Args:
        text: String to filter; it is split on any run of whitespace.
        min_length: Minimum number of characters a word needs in order to be
            kept. The default of 3 keeps the original behaviour (words of
            length 2 or less are removed).

    Returns:
        The surviving words joined by single spaces, or '' if none survive.
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

# Filter the very short words out of every joined lemma string.
train['filtered_lemma_str'] = train['lemma_str'].map(remove_short)
train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP...","[(deeds, n), (reason, n), (#earthquake, v), (m...","[deed, reason, #earthquake, may, allah, forgiv...",deed reason #earthquake may allah forgive u,deed reason #earthquake may allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[(forest, a), (fire, n), (near, n), (la, a), (...","[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada,forest fire near ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[(residents, n), (asked, v), (shelter, a), (pl...","[resident, ask, shelter, place, notify, office...",resident ask shelter place notify officer evac...,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#...","[(13000, n), (people, n), (receive, a), (#wild...","[13000, people, receive, #wildfires, evacuatio...",13000 people receive #wildfires evacuation ord...,13000 people receive #wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[(got, v), (sent, a), (photo, n), (ruby, n), (...","[get, sent, photo, ruby, #alaska, smoke, #wild...",get sent photo ruby #alaska smoke #wildfires p...,get sent photo ruby #alaska smoke #wildfires p...


#### 키워드를 텍스트 뒤에 이어붙이기

In [57]:
# Inspect dtypes and non-null counts; `keyword` has missing values that the next cell handles.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  7613 non-null   int64 
 1   keyword             7552 non-null   object
 2   location            5080 non-null   object
 3   text                7613 non-null   object
 4   target              7613 non-null   int64 
 5   text_clean          7613 non-null   object
 6   tokenized           7613 non-null   object
 7   lower               7613 non-null   object
 8   stopwords_removed   7613 non-null   object
 9   pos_tags            7613 non-null   object
 10  wordnet_pos         7613 non-null   object
 11  lemmatized          7613 non-null   object
 12  lemma_str           7613 non-null   object
 13  filtered_lemma_str  7613 non-null   object
dtypes: int64(2), object(12)
memory usage: 832.8+ KB


In [58]:
def _append_keyword(text, keyword):
    """Return the cleaned text with the row's keyword appended, if there is one.

    Args:
        text: The filtered lemma string of the row.
        keyword: The row's keyword; may be NaN or empty, in which case only
            the text is returned.

    Returns:
        The non-empty parts joined by a single space. Joining only non-empty
        parts avoids a leading space when the text is empty and a trailing
        space when the keyword cleans to an empty string.
    """
    parts = [str(text).strip()]
    if pd.notna(keyword) and keyword:
        # NOTE(review): '%20' is a URL-encoded space, so removing it glues multi-word
        # keywords together ("forest%20fire" -> "forestfire"). Kept as in the original;
        # confirm whether replacing it with ' ' was intended.
        parts.append(str(keyword).replace('%20', '').strip())
    return ' '.join(part for part in parts if part)


# Append the keyword to the filtered lemma string wherever a keyword is present.
# zip over the two columns avoids a row-wise DataFrame.apply(axis=1).
train['combined_str'] = [
    _append_keyword(text, keyword)
    for text, keyword in zip(train['filtered_lemma_str'], train['keyword'])
]

train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deeds, are, the, reason, of, this, #eart...","[deeds, reason, #earthquake, may, allah, forgi...","[(deeds, NNS), (reason, NN), (#earthquake, VBP...","[(deeds, n), (reason, n), (#earthquake, v), (m...","[deed, reason, #earthquake, may, allah, forgiv...",deed reason #earthquake may allah forgive u,deed reason #earthquake may allah forgive,deed reason #earthquake may allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J...","[(forest, a), (fire, n), (near, n), (la, a), (...","[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada,forest fire near ronge sask canada,forest fire near ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[(residents, NNS), (asked, VBD), (shelter, JJ)...","[(residents, n), (asked, v), (shelter, a), (pl...","[resident, ask, shelter, place, notify, office...",resident ask shelter place notify officer evac...,resident ask shelter place notify officer evac...,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...,"[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[13000, people, receive, #wildfires, evacuatio...","[(13000, CD), (people, NNS), (receive, JJ), (#...","[(13000, n), (people, n), (receive, a), (#wild...","[13000, people, receive, #wildfires, evacuatio...",13000 people receive #wildfires evacuation ord...,13000 people receive #wildfires evacuation ord...,13000 people receive #wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, this, photo, from, ruby, #al...","[got, sent, photo, ruby, #alaska, smoke, #wild...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...","[(got, v), (sent, a), (photo, n), (ruby, n), (...","[get, sent, photo, ruby, #alaska, smoke, #wild...",get sent photo ruby #alaska smoke #wildfires p...,get sent photo ruby #alaska smoke #wildfires p...,get sent photo ruby #alaska smoke #wildfires p...


In [59]:
# Spot-check a few rows that have a keyword to see it appended in `combined_str`.
train[40:43]

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,Check these out #nsfw,"[Check, these, out, #nsfw]","[check, these, out, #nsfw]","[check, #nsfw]","[(check, NN), (#nsfw, NN)]","[(check, n), (#nsfw, n)]","[check, #nsfw]",check #nsfw,check #nsfw,check #nsfw ablaze
41,61,ablaze,,on the outside you're ablaze and alive\nbut yo...,0,on the outside youre ablaze and alive\nbut you...,"[on, the, outside, youre, ablaze, and, alive, ...","[on, the, outside, youre, ablaze, and, alive, ...","[outside, youre, ablaze, alive, youre, dead, i...","[(outside, JJ), (youre, NN), (ablaze, JJ), (al...","[(outside, a), (youre, n), (ablaze, a), (alive...","[outside, youre, ablaze, alive, youre, dead, i...",outside youre ablaze alive youre dead inside,outside youre ablaze alive youre dead inside,outside youre ablaze alive youre dead inside a...
42,62,ablaze,milky way,Had an awesome time visiting the CFC head offi...,0,Had an awesome time visiting the CFC head offi...,"[Had, an, awesome, time, visiting, the, CFC, h...","[had, an, awesome, time, visiting, the, cfc, h...","[awesome, time, visiting, cfc, head, office, a...","[(awesome, JJ), (time, NN), (visiting, VBG), (...","[(awesome, a), (time, n), (visiting, v), (cfc,...","[awesome, time, visit, cfc, head, office, anco...",awesome time visit cfc head office ancop site ...,awesome time visit cfc head office ancop site ...,awesome time visit cfc head office ancop site ...


In [60]:
# Show the rows whose filtered lemma string is exactly three characters long.
train[train['filtered_lemma_str'].apply(len) == 3]

Unnamed: 0,id,keyword,location,text,target,text_clean,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str,filtered_lemma_str,combined_str
30,44,,,The end!,0,The end,"[The, end]","[the, end]",[end],"[(end, NN)]","[(end, n)]",[end],end,end,end


In [63]:
# Select the id, combined text and target columns for export.
train_for_NLP = train.loc[:, ["id", "combined_str", "target"]]
train_for_NLP.head()

Unnamed: 0,id,combined_str,target
0,1,deed reason #earthquake may allah forgive,1
1,4,forest fire near ronge sask canada,1
2,5,resident ask shelter place notify officer evac...,1
3,6,13000 people receive #wildfires evacuation ord...,1
4,7,get sent photo ruby #alaska smoke #wildfires p...,1


In [61]:
#train.to_csv("train_cleaned.csv", index=False)

In [65]:
# Write the selected columns to CSV in the working directory, without the index column.
train_for_NLP.to_csv("train_for_NLP.csv", index=False)