In [111]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
import re

# Download and Load label data from https://crisisnlp.qcri.org/
dataset_name = '2014_california_eq_raw.csv'
data= pd.read_csv(r'Raw_data/%s'%dataset_name, encoding='latin-1')

# remove additoonal columns
data.drop('_golden', axis =1 , inplace = True)
data.drop('_unit_state', axis =1 , inplace = True)
data.drop('_trusted_judgments', axis =1 , inplace = True)
data.drop('_last_judgment_at', axis =1 , inplace = True)
data.drop('choose_one_category_gold', axis =1 , inplace = True)
data.drop('choose_one_category:confidence', axis =1 , inplace = True)
data.drop('_unit_id', axis =1 , inplace = True)
data.drop('tweet_id', axis =1 , inplace = True)
# rename comumns
data = data.rename (columns= {'choose_one_category':'label','tweet_text':'tweet_text'})
data_top = data.head(5)
data_top

Unnamed: 0,label,tweet_text
0,other_useful_information,RT @nicoleewayne: Tennessee USA Knoxville http...
1,infrastructure_and_utilities_damage,RT @SFGate: We're updating this interactive ma...
2,injured_or_dead_people,RT @YourAnonNews: Strong 6.1 Earthquake Rocks ...
3,infrastructure_and_utilities_damage,RT @heyyouapp: Wisconsin USA Madison http://t....
4,other_useful_information,"RT @scullather: ""@infodude: amazing use of #Bi..."


#### Cleaning

In [112]:
 
dataset = []
for i in range(0,len(data)):
        string = str (data['tweet_text'][i])
        x = re.sub("\[\'text:\"RT @.*?: ", "", string)
        x = re.sub("http.*", "",x)
        x = re.sub("\[\'text:", "", x)
        x = re.sub("\].*", "", x)
        x = re.sub("\\\\", "", x)
        x = re.sub("\\'", "", x)
        x = re.sub("RT", "", x)
        x= re.sub("http", "", x)
        x = re.sub(":", "", x)
        x = re.sub("@", "", x)
        x = re.sub("#", "", x)
        x = re.sub("//", "", x)
        data['tweet_text'][i] = ''
        data['tweet_text'][i] =  x  
Text_label_data = data
Text_label_data.to_csv('pre_process_data/Text_label_'+'%s'%dataset_name+'.csv' , encoding='utf-8')

In [113]:
data_top = data.head(5)
data_top

Unnamed: 0,label,tweet_text
0,other_useful_information,nicoleewayne Tennessee USA Knoxville
1,infrastructure_and_utilities_damage,SFGate Were updating this interactive map of ...
2,injured_or_dead_people,YourAnonNews Strong 6.1 Earthquake Rocks San ...
3,infrastructure_and_utilities_damage,heyyouapp Wisconsin USA Madison
4,other_useful_information,"scullather ""infodude amazing use of BigData f..."


#### Noramlization

In [114]:


# Step - 1a : Remove blank rows if any.
data['tweet_text'].dropna(inplace=True)

# Step - 1b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
data['tweet_text'] = [entry.lower() for entry in data['tweet_text']]

# Step - 1c : Tokenization : In this each entry in the corpus will be broken into set of words
data['tweet_text']= [word_tokenize(entry) for entry in data['tweet_text']]

# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index,entry in enumerate(data['tweet_text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            word_Final = re.sub("rt", "", word_Final)
            word_Final = re.sub("http", "", word_Final)
            word_Final = re.sub(":", "", word_Final)
            word_Final = re.sub("@", "", word_Final)
            word_Final = re.sub("#", "", word_Final)
            word_Final = re.sub("//", "", word_Final)
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    data.loc[index, 'text_final'] = str(Final_words)
    data['text_final'].dropna(inplace=True)

In [115]:
data_top = data.head(5)
data_top

Unnamed: 0,label,tweet_text,text_final
0,other_useful_information,"[nicoleewayne, tennessee, usa, knoxville]","['nicoleewayne', 'tennessee', 'usa', 'knoxville']"
1,infrastructure_and_utilities_damage,"[sfgate, were, updating, this, interactive, ma...","['sfgate', 'update', 'interactive', 'map', 're..."
2,injured_or_dead_people,"[youranonnews, strong, 6.1, earthquake, rocks,...","['youranonnews', 'strong', 'eahquake', 'rock',..."
3,infrastructure_and_utilities_damage,"[heyyouapp, wisconsin, usa, madison]","['heyyouapp', 'wisconsin', 'usa', 'madison']"
4,other_useful_information,"[scullather, ``, infodude, amazing, use, of, b...","['scullather', 'infodude', 'amazing', 'use', '..."
