# DataSet Preprocessing

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter, OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Load the raw CSV into `df`, decoding it with the `unicode_escape` codec.
# NOTE(review): the `\x89...` sequences targeted by the cleaning step are presumably a by-product of this decoding — confirm against the raw file.
df = pd.read_csv("DataSet/Airline-Sentiment-2-w-AA.csv", encoding= 'unicode_escape')

In [3]:
# Number of entries in the `text` column.
len(df["text"])

14640

## 1 - Data Cleaning:
- Replace the mentions with the tag `@mention`
- Replace URL with the tag `@url`
- Replace hashtag with the tag `@hashtag`
- Remove mis-encoded character sequences such as \x89ÛÏ, which stand for quotation marks and apostrophes
- Replace strange characters and emoji with the tag `@emoji`
- Punctuation removal
- Replace numbers with the tag `@number`
- Convert to lowercase
- Removal of extra whitespace

In [4]:
# Every ASCII punctuation character except "@", which is kept because the
# placeholder tags (@mention, @url, ...) start with it.
punctaction = string.punctuation.replace("@", "")
punctaction

'!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

In [5]:
def clean_text(corpus, punctuation=None):
    """Normalise raw phrases into lowercase, tag-annotated text.

    Each phrase goes through these steps, in order: mentions become
    ``@mention``, URLs become ``@url``, hashtags become ``@hashtag``, a fixed
    set of mis-encoded character sequences becomes a space, remaining
    mis-encoded runs and text emoticons become ``@emoji``, punctuation becomes
    a space, digit runs become ``@number``, the text is lowercased and runs of
    whitespace are collapsed to a single space.

    Parameters
    ----------
    corpus : iterable of str
        The phrases to clean.
    punctuation : str, optional
        Characters blanked out in the punctuation step. Defaults to
        ``string.punctuation`` without ``@`` so the inserted tags survive.
        An empty string disables the punctuation step.

    Returns
    -------
    list of list of str
        One single-element list ``[cleaned_phrase]`` per input phrase, in
        input order. Leading/trailing single spaces are not stripped.
    """
    if punctuation is None:
        punctuation = string.punctuation.replace("@", "")
    # Build the character class once instead of on every iteration; an empty
    # class ("[]") is not a valid pattern, so skip the step in that case.
    punctuation_re = (
        re.compile("[%s]" % re.escape(punctuation)) if punctuation else None
    )

    clear_text = []
    for phrase in corpus:
        # Replace the mentions with the tag @mention
        phrase = re.sub(r"@\S+", "@mention ", phrase)
        # Replace URL with the tag @url
        phrase = re.sub(r"https*\S+", " @url", phrase)
        # Replace hashtag with the tag @hashtag
        phrase = re.sub(r"#\S+", " @hashtag", phrase)
        # Remove known mis-encoded sequences. A space (not an empty string) is
        # substituted so that the words on either side are not glued together,
        # matching how ordinary punctuation is handled below.
        phrase = re.sub("\x89Ûª|\x89Û\x9d|\x89Û÷|\x89Û_", " ", phrase)

        # Replace remaining mis-encoded runs with the tag @emoji
        phrase = re.sub(r"\x89\S+", " @emoji", phrase)
        phrase = re.sub(r"\x95\S+", " @emoji", phrase)
        phrase = re.sub(r"_Ù\S+", " @emoji", phrase)
        phrase = re.sub(r"÷\S+", " @emoji", phrase)
        # Replace text emoticons like :-D ;D ;p
        # NOTE(review): inside the character class, ")-/" is a range that also
        # admits "*", "+", "," and "." — confirm this is intended.
        phrase = re.sub(r":+[';()-/|DPXp]\S*|;+['()-/|DPXp]\S*", " @emoji", phrase)

        # Punctuation removal ("@" is preserved by the default set)
        if punctuation_re is not None:
            phrase = punctuation_re.sub(" ", phrase)
        # Replace numbers with the tag @number
        phrase = re.sub(r"\d+", " @number ", phrase)
        # Convert to lowercase
        phrase = phrase.lower()
        # Collapse runs of whitespace into a single space
        phrase = re.sub(r"\s{2,}", " ", phrase)
        clear_text.append([phrase])
    return clear_text

In [6]:
# Clean every entry of the `text` column; the result holds one single-element list per row.
cleared_text = clean_text(df["text"])

In [7]:
# Display the cleaned phrases for a visual check.
cleared_text

[['@mention what @mention said '],
 ['@mention plus you ve added commercials to the experience tacky '],
 ['@mention i didn t today must mean i need to take another trip '],
 ['@mention it s really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse'],
 ['@mention and it s a really big bad thing about it'],
 ['@mention seriously would pay @number a flight for seats that didn t have this playing it s really the only bad thing about flying va'],
 ['@mention yes nearly every time i fly vx this @emoji worm wont go away @emoji'],
 ['@mention really missed a prime opportunity for men without hats parody there @url'],
 ['@mention well i didn tbut now i do @emoji'],
 ['@mention it was amazing and arrived an hour early you re too good to me '],
 ['@mention did you know that suicide is the second leading cause of death among teens @number @number '],
 ['@mention i lt @number pretty graphics so much better than minimal iconography @emoji'],
 ['@mention t

In [8]:
# `cleared_text` is a list of single-element lists, so `np.array(...)` yields a
# two-dimensional, one-column array; it is stored as the `cleared_text` column.
df['cleared_text'] = np.array(cleared_text)

In [9]:
# Display the new `cleared_text` column.
df['cleared_text']

0                             @mention what @mention said 
1        @mention plus you ve added commercials to the ...
2        @mention i didn t today must mean i need to ta...
3        @mention it s really aggressive to blast obnox...
4        @mention and it s a really big bad thing about it
                               ...                        
14635    @mention thank you we got on a different fligh...
14636    @mention leaving over @number minutes late fli...
14637    @mention please bring american airlines to @ha...
14638    @mention you have my money you change my fligh...
14639    @mention we have @number ppl so we need @numbe...
Name: cleared_text, Length: 14640, dtype: object

In [10]:
# Work on an independent copy restricted to the columns used from here on.
new_df = df.loc[
    :,
    ["text", "airline_sentiment", "negativereason", "airline", "cleared_text"],
].copy()

In [11]:
# Preview the first rows of the reduced frame.
new_df.head()

Unnamed: 0,text,airline_sentiment,negativereason,airline,cleared_text
0,@VirginAmerica What @dhepburn said.,neutral,,Virgin America,@mention what @mention said
1,@VirginAmerica plus you've added commercials t...,positive,,Virgin America,@mention plus you ve added commercials to the ...
2,@VirginAmerica I didn't today... Must mean I n...,neutral,,Virgin America,@mention i didn t today must mean i need to ta...
3,@VirginAmerica it's really aggressive to blast...,negative,Bad Flight,Virgin America,@mention it s really aggressive to blast obnox...
4,@VirginAmerica and it's a really big bad thing...,negative,Can't Tell,Virgin America,@mention and it s a really big bad thing about it


## 2 - Tokenization:

In [12]:
"""
word_tokens = []
for phrase in df["cleared_text"]:
    word_tokens.append(word_tokenize(phrase))
"""

'\nword_tokens = []\nfor phrase in df["cleared_text"]:\n    word_tokens.append(word_tokenize(phrase))\n'

In [13]:
# Tokenize by splitting each cleaned phrase on whitespace.
word_tokens = [cleaned.split() for cleaned in df["cleared_text"]]

In [14]:
# Display the token lists for a visual check.
word_tokens

[['@mention', 'what', '@mention', 'said'],
 ['@mention',
  'plus',
  'you',
  've',
  'added',
  'commercials',
  'to',
  'the',
  'experience',
  'tacky'],
 ['@mention',
  'i',
  'didn',
  't',
  'today',
  'must',
  'mean',
  'i',
  'need',
  'to',
  'take',
  'another',
  'trip'],
 ['@mention',
  'it',
  's',
  'really',
  'aggressive',
  'to',
  'blast',
  'obnoxious',
  'entertainment',
  'in',
  'your',
  'guests',
  'faces',
  'amp',
  'they',
  'have',
  'little',
  'recourse'],
 ['@mention',
  'and',
  'it',
  's',
  'a',
  'really',
  'big',
  'bad',
  'thing',
  'about',
  'it'],
 ['@mention',
  'seriously',
  'would',
  'pay',
  '@number',
  'a',
  'flight',
  'for',
  'seats',
  'that',
  'didn',
  't',
  'have',
  'this',
  'playing',
  'it',
  's',
  'really',
  'the',
  'only',
  'bad',
  'thing',
  'about',
  'flying',
  'va'],
 ['@mention',
  'yes',
  'nearly',
  'every',
  'time',
  'i',
  'fly',
  'vx',
  'this',
  '@emoji',
  'worm',
  'wont',
  'go',
  'away',
  '

## 3 - Remove Stopwords:

In [15]:
# English stop-word list from NLTK's `stopwords` corpus.
stop_words = stopwords.words('english')

In [16]:
# Strip apostrophes from every stop word (e.g. "you're" -> "youre").
stop_words_wq = [stop_word.replace("'", "") for stop_word in stop_words]

In [17]:
# Display the apostrophe-free stop words.
stop_words_wq

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'youre',
 'youve',
 'youll',
 'youd',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'shes',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'thatll',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'fe

In [18]:
# Drop stop words from every token list, keeping the order of the remaining tokens.
# The stop words are put in a set once so each membership test is a hash
# lookup instead of a linear scan of the list for every token.
stop_word_lookup = set(stop_words_wq)
no_sw = [
    [word for word in phrase if word not in stop_word_lookup]
    for phrase in word_tokens
]

In [19]:
# Display the token lists after stop-word removal.
no_sw

[['@mention', '@mention', 'said'],
 ['@mention', 'plus', 'added', 'commercials', 'experience', 'tacky'],
 ['@mention', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip'],
 ['@mention',
  'really',
  'aggressive',
  'blast',
  'obnoxious',
  'entertainment',
  'guests',
  'faces',
  'amp',
  'little',
  'recourse'],
 ['@mention', 'really', 'big', 'bad', 'thing'],
 ['@mention',
  'seriously',
  'would',
  'pay',
  '@number',
  'flight',
  'seats',
  'playing',
  'really',
  'bad',
  'thing',
  'flying',
  'va'],
 ['@mention',
  'yes',
  'nearly',
  'every',
  'time',
  'fly',
  'vx',
  '@emoji',
  'worm',
  'go',
  'away',
  '@emoji'],
 ['@mention',
  'really',
  'missed',
  'prime',
  'opportunity',
  'men',
  'without',
  'hats',
  'parody',
  '@url'],
 ['@mention', 'well', 'tbut', '@emoji'],
 ['@mention', 'amazing', 'arrived', 'hour', 'early', 'good'],
 ['@mention',
  'know',
  'suicide',
  'second',
  'leading',
  'cause',
  'death',
  'among',
  'teens',
  '@number',
  '@nu

In [20]:
# Assign the list of token lists directly. Wrapping it in np.array() fails on
# NumPy >= 1.24 because the rows have different lengths (ragged input raises
# ValueError unless dtype=object is given); pandas stores each inner list as
# one cell, exactly as is done for the `not_shortw` column.
new_df["no_sw"] = no_sw

In [21]:
# Preview the frame with the new `no_sw` column.
new_df.head()

Unnamed: 0,text,airline_sentiment,negativereason,airline,cleared_text,no_sw
0,@VirginAmerica What @dhepburn said.,neutral,,Virgin America,@mention what @mention said,"[@mention, @mention, said]"
1,@VirginAmerica plus you've added commercials t...,positive,,Virgin America,@mention plus you ve added commercials to the ...,"[@mention, plus, added, commercials, experienc..."
2,@VirginAmerica I didn't today... Must mean I n...,neutral,,Virgin America,@mention i didn t today must mean i need to ta...,"[@mention, today, must, mean, need, take, anot..."
3,@VirginAmerica it's really aggressive to blast...,negative,Bad Flight,Virgin America,@mention it s really aggressive to blast obnox...,"[@mention, really, aggressive, blast, obnoxiou..."
4,@VirginAmerica and it's a really big bad thing...,negative,Can't Tell,Virgin America,@mention and it s a really big bad thing about it,"[@mention, really, big, bad, thing]"


## 4 - Remove short words
Drop the tokens whose length is 1 (`len == 1`).

In [22]:
# Keep only the tokens that are longer than one character.
no_sw_ll = [
    [token for token in tokens if len(token) > 1]
    for tokens in new_df["no_sw"]
]

In [23]:
# Display the token lists after removing one-character tokens.
no_sw_ll

[['@mention', '@mention', 'said'],
 ['@mention', 'plus', 'added', 'commercials', 'experience', 'tacky'],
 ['@mention', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip'],
 ['@mention',
  'really',
  'aggressive',
  'blast',
  'obnoxious',
  'entertainment',
  'guests',
  'faces',
  'amp',
  'little',
  'recourse'],
 ['@mention', 'really', 'big', 'bad', 'thing'],
 ['@mention',
  'seriously',
  'would',
  'pay',
  '@number',
  'flight',
  'seats',
  'playing',
  'really',
  'bad',
  'thing',
  'flying',
  'va'],
 ['@mention',
  'yes',
  'nearly',
  'every',
  'time',
  'fly',
  'vx',
  '@emoji',
  'worm',
  'go',
  'away',
  '@emoji'],
 ['@mention',
  'really',
  'missed',
  'prime',
  'opportunity',
  'men',
  'without',
  'hats',
  'parody',
  '@url'],
 ['@mention', 'well', 'tbut', '@emoji'],
 ['@mention', 'amazing', 'arrived', 'hour', 'early', 'good'],
 ['@mention',
  'know',
  'suicide',
  'second',
  'leading',
  'cause',
  'death',
  'among',
  'teens',
  '@number',
  '@nu

In [24]:
# Store the filtered token lists (one list per row) as the `not_shortw` column.
new_df["not_shortw"] = no_sw_ll

In [25]:
# Preview the frame with the new `not_shortw` column.
new_df.head()

Unnamed: 0,text,airline_sentiment,negativereason,airline,cleared_text,no_sw,not_shortw
0,@VirginAmerica What @dhepburn said.,neutral,,Virgin America,@mention what @mention said,"[@mention, @mention, said]","[@mention, @mention, said]"
1,@VirginAmerica plus you've added commercials t...,positive,,Virgin America,@mention plus you ve added commercials to the ...,"[@mention, plus, added, commercials, experienc...","[@mention, plus, added, commercials, experienc..."
2,@VirginAmerica I didn't today... Must mean I n...,neutral,,Virgin America,@mention i didn t today must mean i need to ta...,"[@mention, today, must, mean, need, take, anot...","[@mention, today, must, mean, need, take, anot..."
3,@VirginAmerica it's really aggressive to blast...,negative,Bad Flight,Virgin America,@mention it s really aggressive to blast obnox...,"[@mention, really, aggressive, blast, obnoxiou...","[@mention, really, aggressive, blast, obnoxiou..."
4,@VirginAmerica and it's a really big bad thing...,negative,Can't Tell,Virgin America,@mention and it s a really big bad thing about it,"[@mention, really, big, bad, thing]","[@mention, really, big, bad, thing]"


## 5 - Lemmatization:
1) Lemmatize the words in `new_df["not_shortw"]`

In [26]:
# Single lemmatizer instance, passed to `lemmatization` below.
lemmatizer = WordNetLemmatizer()

In [27]:
def lemmatization(data, lemmatizer):
    """Lemmatize every token of every row in two passes.

    Each token is first passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the result is lemmatized again with ``pos='v'``.

    Parameters
    ----------
    data : iterable of iterable of str
        Rows of tokens.
    lemmatizer : object
        Anything exposing ``lemmatize(word)`` and ``lemmatize(word, pos=...)``.

    Returns
    -------
    list of list of str
        The lemmatized tokens, with the same row and token order as ``data``.
    """
    def _two_pass(token):
        first_pass = lemmatizer.lemmatize(token)
        return lemmatizer.lemmatize(first_pass, pos='v')

    return [[_two_pass(token) for token in row] for row in data]

In [28]:
# Lemmatize the `not_shortw` token lists and store the result as `lemmatized`.
new_df["lemmatized"] = lemmatization(new_df["not_shortw"], lemmatizer)

In [29]:
# Preview the frame with the new `lemmatized` column.
new_df.head()

Unnamed: 0,text,airline_sentiment,negativereason,airline,cleared_text,no_sw,not_shortw,lemmatized
0,@VirginAmerica What @dhepburn said.,neutral,,Virgin America,@mention what @mention said,"[@mention, @mention, said]","[@mention, @mention, said]","[@mention, @mention, say]"
1,@VirginAmerica plus you've added commercials t...,positive,,Virgin America,@mention plus you ve added commercials to the ...,"[@mention, plus, added, commercials, experienc...","[@mention, plus, added, commercials, experienc...","[@mention, plus, add, commercial, experience, ..."
2,@VirginAmerica I didn't today... Must mean I n...,neutral,,Virgin America,@mention i didn t today must mean i need to ta...,"[@mention, today, must, mean, need, take, anot...","[@mention, today, must, mean, need, take, anot...","[@mention, today, must, mean, need, take, anot..."
3,@VirginAmerica it's really aggressive to blast...,negative,Bad Flight,Virgin America,@mention it s really aggressive to blast obnox...,"[@mention, really, aggressive, blast, obnoxiou...","[@mention, really, aggressive, blast, obnoxiou...","[@mention, really, aggressive, blast, obnoxiou..."
4,@VirginAmerica and it's a really big bad thing...,negative,Can't Tell,Virgin America,@mention and it s a really big bad thing about it,"[@mention, really, big, bad, thing]","[@mention, really, big, bad, thing]","[@mention, really, big, bad, thing]"


In [30]:
# List the columns accumulated so far.
new_df.columns

Index(['text', 'airline_sentiment', 'negativereason', 'airline',
       'cleared_text', 'no_sw', 'not_shortw', 'lemmatized'],
      dtype='object')

In [31]:
# Re-assemble each lemmatized token list into a single string. Every token is
# followed by a space, so a non-empty result ends with a trailing space.
new_col = [
    "".join(token + " " for token in tokens)
    for tokens in new_df["lemmatized"]
]

In [32]:
# Store the re-joined strings as the `preprocessed_text` column.
new_df["preprocessed_text"] = new_col

In [33]:
# Character count of each preprocessed string (whitespace included).
new_df['length'] = list(map(len, new_df.preprocessed_text))
new_df.head()

Unnamed: 0,text,airline_sentiment,negativereason,airline,cleared_text,no_sw,not_shortw,lemmatized,preprocessed_text,length
0,@VirginAmerica What @dhepburn said.,neutral,,Virgin America,@mention what @mention said,"[@mention, @mention, said]","[@mention, @mention, said]","[@mention, @mention, say]",@mention @mention say,22
1,@VirginAmerica plus you've added commercials t...,positive,,Virgin America,@mention plus you ve added commercials to the ...,"[@mention, plus, added, commercials, experienc...","[@mention, plus, added, commercials, experienc...","[@mention, plus, add, commercial, experience, ...",@mention plus add commercial experience tacky,46
2,@VirginAmerica I didn't today... Must mean I n...,neutral,,Virgin America,@mention i didn t today must mean i need to ta...,"[@mention, today, must, mean, need, take, anot...","[@mention, today, must, mean, need, take, anot...","[@mention, today, must, mean, need, take, anot...",@mention today must mean need take another trip,48
3,@VirginAmerica it's really aggressive to blast...,negative,Bad Flight,Virgin America,@mention it s really aggressive to blast obnox...,"[@mention, really, aggressive, blast, obnoxiou...","[@mention, really, aggressive, blast, obnoxiou...","[@mention, really, aggressive, blast, obnoxiou...",@mention really aggressive blast obnoxious ent...,88
4,@VirginAmerica and it's a really big bad thing...,negative,Can't Tell,Virgin America,@mention and it s a really big bad thing about it,"[@mention, really, big, bad, thing]","[@mention, really, big, bad, thing]","[@mention, really, big, bad, thing]",@mention really big bad thing,30


## 6 - Store preprocessed data

In [34]:
def delet_tag(corpus):
    """Remove every ``@``-prefixed token from each phrase.

    A token is ``@`` followed by one or more non-whitespace characters, which
    covers the placeholder tags such as ``@mention``, ``@url`` or ``@number``.
    The whitespace around a removed token is left untouched.

    Parameters
    ----------
    corpus : iterable of str
        The phrases to process.

    Returns
    -------
    list of str
        The phrases without ``@`` tokens, in input order.
    """
    # Raw string: in a plain string literal "\S" is an invalid escape
    # sequence, which newer Python versions warn about.
    tag_pattern = re.compile(r"@\S+")
    return [tag_pattern.sub("", phrase) for phrase in corpus]

In [35]:
# Variant of the preprocessed text with all `@...` tokens removed.
new_df["not_tag"] = delet_tag(new_df["preprocessed_text"])

In [36]:
# Assemble the export table: (output column name, source column in new_df),
# listed in the order the columns appear in the final frame.
final_df = pd.DataFrame({
    output_name: new_df[source_name]
    for output_name, source_name in [
        ("original_text", "text"),
        ("preprocessed_text", "preprocessed_text"),
        ("length_text", "length"),
        ("not_tag_text", "not_tag"),
        ("airline", "airline"),
        ("airline_sentiment", "airline_sentiment"),
        ("negative_reason", "negativereason"),
    ]
})

In [37]:
# Display the table that is about to be written to disk.
final_df

Unnamed: 0,original_text,preprocessed_text,length_text,not_tag_text,airline,airline_sentiment,negative_reason
0,@VirginAmerica What @dhepburn said.,@mention @mention say,22,say,Virgin America,neutral,
1,@VirginAmerica plus you've added commercials t...,@mention plus add commercial experience tacky,46,plus add commercial experience tacky,Virgin America,positive,
2,@VirginAmerica I didn't today... Must mean I n...,@mention today must mean need take another trip,48,today must mean need take another trip,Virgin America,neutral,
3,@VirginAmerica it's really aggressive to blast...,@mention really aggressive blast obnoxious ent...,88,really aggressive blast obnoxious entertainme...,Virgin America,negative,Bad Flight
4,@VirginAmerica and it's a really big bad thing...,@mention really big bad thing,30,really big bad thing,Virgin America,negative,Can't Tell
...,...,...,...,...,...,...,...
14635,@AmericanAir thank you we got on a different f...,@mention thank get different flight chicago,44,thank get different flight chicago,American,positive,
14636,@AmericanAir leaving over 20 minutes Late Flig...,@mention leave @number minute late flight warn...,113,leave minute late flight warn communication ...,American,negative,Customer Service Issue
14637,@AmericanAir Please bring American Airlines to...,@mention please bring american airline @hashtag,48,please bring american airline,American,neutral,
14638,"@AmericanAir you have my money, you change my ...",@mention money change flight answer phone sugg...,69,money change flight answer phone suggestion m...,American,negative,Customer Service Issue


In [38]:
# Write the final table to CSV; `index=False` leaves the row index out of the file.
final_df.to_csv("DataSet/final_dataset.csv", index=False)