# Data encoding and decoding 
## Part 2 of the series of notebooks to solve the competition


## Content:
  - [Import required modules](#Import%20required%20modules)
  - [load data](#load%20data)
  - [Data cleaning](#Data%20cleaning)
  - [Encoding](#Encoding)
  - [Decoding](#Decoding)


### Import required modules

In [7]:
# Jupyter magic command to render plots inline
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing
from nltk.corpus import stopwords # load stoping words
from nltk.tokenize import word_tokenize # word tokenizer
import pickle # to save clean data

import re # Regular expression

### load data

In [2]:
# Directory (relative to the notebook) that holds the competition CSV files.
dataPath = 'dataset'

# Read the training and testing sets and cast every column to str, so the
# text helpers further down can always treat cell values as strings.
train = pd.read_csv(dataPath + '/train.csv').astype(str)
test = pd.read_csv(dataPath + '/test.csv').astype(str)

### Data cleaning

In [3]:
def get_char_only(text):
    """Replace every character that is not an ASCII letter (a-z, A-Z) with a space.

    The string length is preserved: each non-letter becomes exactly one space.
    """
    return re.sub(r"[^a-zA-Z]", r' ', text)

def remove_URL(text):
    """Delete links from `text`.

    A link is either `http://` / `https://` followed by non-whitespace
    characters, or `www.` followed by non-whitespace characters. Surrounding
    whitespace is left untouched.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, r'', text)

def remove_html(text):
    """Delete tag-like spans from `text`.

    A span is '<', the shortest possible run of characters (newlines
    excluded), and a closing '>'. A lone '<' or '>' is kept.
    """
    return re.sub(r'<.*?>', r'', text)
    
# Cache for the English stop-word set so it is built once instead of once per
# cleaned row (clean_data calls remove_stoping_words for every element).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stoping_words(text):
    """Tokenize `text` and drop every token that is an English stop word.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.

    Notes
    -----
    The membership test is case-sensitive: a token is removed only when it
    matches a stop-word entry exactly, so differently-cased variants are kept.
    """
    stop = _english_stop_words()
    return " ".join(token for token in word_tokenize(text) if token not in stop)

# def remove_stoping_words(data):
#     return [ remove_stopwords_statment(i) for i in data]

def remove_emoji(text):
    """Delete every run of characters that falls in the code-point ranges below.

    Note that the last range (U+24C2 to U+1F251) is very wide and covers far
    more than emoji; any character in that span is removed as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # Build one character class out of all ranges and match runs of it.
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

def remove_tag(text):
    """Remove @mentions from `text` and normalise its whitespace.

    A mention is '@' followed by one or more word characters, underscores or
    hyphens. Each mention is replaced by a space, after which the text is
    split on whitespace and re-joined with single spaces, so leading/trailing
    whitespace and repeated spaces are collapsed as a side effect.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The text without mentions, with words separated by single spaces.
    """
    # Raw string: in a normal literal "\w" is an invalid escape sequence,
    # which newer Python versions warn about. The regex itself is unchanged.
    without_tags = re.sub(r"[@][\w_-]+", " ", text)
    return ' '.join(without_tags.split())

def strp(text):
    """Return `text` without leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed

def lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def clean_data(data):
    """Run the full cleaning pipeline over every element of `data`.

    `data` must provide an element-wise `.apply` method (the callers in this
    notebook pass pandas Series such as `train.text`); a plain Python list
    will not work.

    The steps run in this fixed order: HTML tags, URLs, emoji, @mentions,
    non-letter characters, stop words. The result of the last step is returned.
    """
    pipeline = (
        remove_html,
        remove_URL,
        remove_emoji,
        remove_tag,
        get_char_only,
        remove_stoping_words,
    )
    for step in pipeline:
        data = data.apply(step)
    return data

def clean_train():
    """Return a copy of the global `train` frame with cleaned text columns.

    Both `text` and `selected_text` are passed through `clean_data`; the
    global `train` frame itself is not modified.
    """
    cleaned = train.copy()
    for column in ("text", "selected_text"):
        cleaned[column] = clean_data(train[column])
    return cleaned
    
def clean_test():
    """Return a copy of the global `test` frame whose `text` column is cleaned.

    The global `test` frame itself is not modified.
    """
    cleaned = test.copy()
    cleaned["text"] = clean_data(test["text"])
    return cleaned
    
# Build the cleaned copies of both data sets (train first, then test).
train_clean, test_clean = clean_train(), clean_test()

print("Finished cleaning")

Finished cleaning


### Encoding

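We currently use character-level one-hot encoding: each character is mapped to a vector that holds a single 1 at the index of its code point.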

In [10]:
# All-zero template that every one-hot vector is copied from.
# NOTE(review): the vector length is 265 while only 256 characters are mapped
# below -- possibly a typo for 256; left unchanged, confirm before altering
# because it defines the shape of the encoded data.
oneHotBase = np.zeros(265)


def oneHot(i):
    """Return a fresh copy of `oneHotBase` with position `i` set to 1."""
    vector = np.copy(oneHotBase)
    vector[i] = 1
    return vector


# Lookup table: the character with code point 0..255 -> its one-hot vector.
chr2vec = dict((chr(code), oneHot(code)) for code in range(256))

def encode(text):
    """Map each character of `text` to its one-hot vector from `chr2vec`.

    Returns a list with one vector per character, in order. The vectors are
    the very objects stored in `chr2vec` (no copies are made). A character
    that is not a key of `chr2vec` raises KeyError.
    """
    return list(map(chr2vec.__getitem__, text))

def _encode_columns(frame, columns):
    """Return a copy of `frame` whose listed columns are character-encoded.

    Each value of every column in `columns` is replaced by `encode(value)`;
    all other columns, and `frame` itself, are left untouched.
    """
    encoded = frame.copy()
    for column in columns:
        encoded[column] = frame[column].apply(encode)
    return encoded


# Raw and cleaned training data: both text columns are encoded.
# (Each column is encoded exactly once; the previous version encoded the
# cleaned `text` column twice.)
train_encoded = _encode_columns(train, ["text", "selected_text"])
train_clean_encoded = _encode_columns(train_clean, ["text", "selected_text"])

# Raw and cleaned testing data: only `text` is encoded.
test_encoded = _encode_columns(test, ["text"])
test_clean_encoded = _encode_columns(test_clean, ["text"])

print("Finished encoding")

Finished encoding


#### save encoded data

In [32]:
def save(name,obj):
    """Pickle `obj` into the file dataset/pickled/<name>.

    Parameters
    ----------
    name : str
        File name (no directory part) to write inside "dataset/pickled/".
    obj : object
        Any picklable object.

    Raises
    ------
    OSError
        If the target file cannot be opened (for example when the
        "dataset/pickled/" directory does not exist).
    """
    # The context manager closes the file even when pickle.dump raises.
    with open("dataset/pickled/"+name,"wb") as pickleOut:
        pickle.dump(obj,pickleOut)
    
# Persist the four encoded frames. The training frames are named
# `train_encoded` / `train_clean_encoded`; the earlier `trein_*` spelling
# referred to names that are never defined and fails on a fresh kernel.
save("train_chr_encoded",train_encoded)
save("train_clean_chr_encoded",train_clean_encoded)

save("test_chr_encoded",test_encoded)
save("test_clean_chr_encoded",test_clean_encoded)

print("Finished pickling")

Finished pickling


### Decoding

In [12]:
# to load other data
# pickleIn = open(path+name,"rb")
# obj = pickle.load(pickleIn)

In [31]:
def vec2Chr(lst):
    """Decode a one-hot vector back to its character.

    Returns `chr(i)` for the first index `i` whose entry equals 1, or None
    when no entry equals 1 (including an empty vector).
    """
    return next(
        (chr(position) for position, value in enumerate(lst) if value == 1),
        None,
    )

def vecs2String(lst):
    """Decode a sequence of one-hot vectors into a string.

    Each vector is decoded with `vec2Chr` and the characters are concatenated
    in order; an empty sequence gives the empty string.
    """
    return "".join(vec2Chr(vector) for vector in lst)

# Decode the first training text once; it is both displayed and checked below.
first_text_decoded = vecs2String(train_encoded.text[0])
print("Decode(train_encoded.text[0]) is: \n", first_text_decoded)
print("=================")


# Round-trip sanity checks; set do_assert to 0 to skip them.
do_assert = 1
if do_assert:
    # A single character must survive encode -> decode ...
    assert vec2Chr(chr2vec['a']) == 'a'
    # ... and so must a whole (raw, uncleaned) training text.
    assert first_text_decoded == train.text[0]

print("Finished decoding")

Decode(train_encoded.text[0]) is: 
 Spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them. Lots of fun.  I had other plans for my morning
Finished decoding
