# Data Cleaning and Preprocessing
Calvin Dailey, Keenen Cates, Zeyu Zhang

In [None]:
import numpy as np
import pandas as pd
from requests import get
from os import path
from io import BytesIO
from zipfile import ZipFile
from emoji_list import all_emoji
from time import strftime
from urllib.request import urlopen 
from collections import Counter
from string import punctuation
import pickle

# Digits, '#' and '*' are filtered out of the emoji list so that ordinary
# text containing them is not treated as containing an emoji.
excluded = list('1234567890#*')

emojis = [symbol for symbol in all_emoji if symbol not in excluded]

# Locale-formatted date ('%x') with the slashes replaced so it can be used
# as part of a directory name.
today = strftime('%x').replace('/', '_')

data_url = 'https://www.kaggle.com/datasnaek/youtube/downloads/youtube.zip'
root_dir = '.'
data_root = path.join(root_dir, 'data')
data_path = path.join(data_root, 'data_{}/'.format(today))
# Directory that get_data() reads UScomments.csv from.
test_data = data_root

In [None]:
def get_comments(path):
    """Load the ``comment_text`` column of ``UScomments.csv`` found in *path*.

    Malformed CSV rows are skipped (with a warning) instead of aborting the
    load.

    Args:
        path: Directory containing ``UScomments.csv``. Note that this
            parameter shadows the module-level ``os.path`` import inside the
            function, so the file name is appended by string concatenation.

    Returns:
        The ``comment_text`` column of the parsed file as a pandas Series.
    """
    csv_path = path + '/UScomments.csv'
    try:
        # ``on_bad_lines`` is the pandas >= 1.3 spelling; 'warn' matches what
        # ``error_bad_lines=False`` did (skip the row and report it).
        df = pd.read_csv(csv_path, on_bad_lines='warn')
    except TypeError:
        # Older pandas releases do not accept ``on_bad_lines``.
        df = pd.read_csv(csv_path, error_bad_lines=False)
    return df['comment_text']

In [None]:
def split_emoji_data(comments, threshold):
    """Partition the first *threshold* comments by whether they contain an emoji.

    A comment counts as containing an emoji when at least one of its
    characters is a member of the module-level ``emojis`` list. The comment
    is scanned character by character, so only single-character entries of
    ``emojis`` can ever match.

    Args:
        comments: Sliceable sequence of comments; entries are converted with
            ``str()`` before being scanned, so non-string entries are allowed.
        threshold: Number of leading comments to consider
            (``comments[:threshold]``).

    Returns:
        A tuple ``(emoji_comments, no_emoji_comments)`` of lists holding the
        original (unconverted) entries, in their original order.
    """
    # Build the lookup once: testing every character against the ``emojis``
    # list directly is a linear scan per character.
    emoji_set = set(emojis)

    emoji_comments = []
    no_emoji_comments = []
    for each in comments[:threshold]:
        if any(ch in emoji_set for ch in str(each)):
            emoji_comments.append(each)
        else:
            no_emoji_comments.append(each)

    return emoji_comments, no_emoji_comments

In [None]:
def extract_emoji_target(emoji_comments):
    """Separate each comment into its plain text and its distinct emoji.

    Args:
        emoji_comments: Iterable of comment strings.

    Returns:
        A tuple ``(inputs, targets)`` of equal-length lists. ``inputs[i]`` is
        comment ``i`` with every character found in the module-level
        ``emojis`` list removed; ``targets[i]`` is a string of the distinct
        emoji characters of that comment, in order of first appearance.
    """
    # One set lookup per character instead of a linear scan of the list.
    emoji_set = set(emojis)

    inputs = []
    targets = []
    for comment in emoji_comments:
        text_chars = []
        found = []
        seen = set()
        for ch in comment:
            if ch not in emoji_set:
                text_chars.append(ch)
            elif ch not in seen:
                # Keep only the first occurrence. Joining a set here would
                # order the emoji by set iteration, which changes between
                # interpreter runs and makes the saved targets
                # non-reproducible.
                seen.add(ch)
                found.append(ch)
        inputs.append(''.join(text_chars))
        targets.append(''.join(found))
    return inputs, targets

In [None]:
def token_lookup():
    """Return a new dict mapping punctuation strings to placeholder tokens.

    The last key is the two-character sequence backslash + ``n``, not a
    newline character. Token spellings are reproduced exactly as they are
    written into the preprocessed data.
    """
    return {
        '!': '<EXCLAMATION_MARK>',
        '"': '<QOUTATION_MARK>',
        '(': '<LEFT_PARANTHESES>',
        ')': '<RIGHT_PARANTHESES>',
        ',': '<COMMA_SIGN>',
        '.': '<PERIOD>',
        '--': '<DASH>',
        ';': '<SEMICOLON>',
        '?': '<QUESTION_MARK>',
        '\\n': '<RETURN>',
    }

In [None]:
def create_lookup_tables(text):
    """Build token <-> integer id tables ranked by descending frequency.

    Args:
        text: Iterable of token sequences; every element of every sequence
            is counted.

    Returns:
        A tuple ``(vocab_to_int, int_to_vocab)``. Id 0 goes to the most
        frequent token; tokens with equal counts keep first-seen order.
    """
    frequencies = Counter()
    for tokens in text:
        frequencies.update(tokens)

    # most_common() is a stable descending sort over insertion order.
    ranked = [token for token, _ in frequencies.most_common()]

    vocab_to_int = {token: index for index, token in enumerate(ranked)}
    int_to_vocab = dict(enumerate(ranked))
    return vocab_to_int, int_to_vocab

In [None]:
def preprocess_and_save_data(inputs, targets):
    """Tokenize comments and emoji targets, map them to ids, and pickle it all.

    Each comment has the punctuation keys from ``token_lookup()`` replaced by
    space-padded placeholder tokens, is lower-cased, and is split on single
    spaces. Each target string is split into its characters. Separate id
    tables are built for words and for emoji with ``create_lookup_tables``.

    The tuple ``(comments_i, targets_i, emoji_to_int, int_to_emoji,
    vocab_to_int, int_to_vocab, token_dict)`` is written to ``preprocess.p``
    in the current working directory.

    Args:
        inputs: Iterable of comment strings.
        targets: Iterable of emoji strings, one per comment.
    """
    token_dict = token_lookup()

    comments_s = []
    for comment in inputs:
        s = comment
        for key, token in token_dict.items():
            s = s.replace(key, ' {} '.format(token))
        # NOTE(review): [:-1] drops the final piece of every split. That is
        # an empty string when the comment ends in a replaced punctuation
        # token, but otherwise it is the comment's last word -- confirm this
        # is intended before changing it, since it affects the vocabulary.
        comments_s.append(s.lower().split(' ')[:-1])

    # Use the ``targets`` parameter; the previous code read an undefined
    # name ``labels`` here and raised NameError.
    targets_s = [list(each) for each in targets]

    vocab_to_int, int_to_vocab = create_lookup_tables(comments_s)
    emoji_to_int, int_to_emoji = create_lookup_tables(targets_s)

    comments_i = [[vocab_to_int[word] for word in comment]
                  for comment in comments_s]
    targets_i = [[emoji_to_int[e] for e in emoji] for emoji in targets_s]

    payload = (comments_i, targets_i, emoji_to_int, int_to_emoji,
               vocab_to_int, int_to_vocab, token_dict)
    # Context manager guarantees the file is flushed and closed.
    with open('preprocess.p', 'wb') as handle:
        pickle.dump(payload, handle)
    
def load_preprocess():
    """Load and return the object pickled in ``preprocess.p``.

    The file is read from the current working directory and is closed before
    returning.

    NOTE: unpickling executes arbitrary code from the file; only load a
    ``preprocess.p`` that this project produced itself.
    """
    with open('preprocess.p', mode='rb') as handle:
        return pickle.load(handle)

In [None]:
def get_data():
    """Run the whole preprocessing pipeline and persist its outputs.

    Loads the comments from the ``test_data`` directory, splits them by
    whether they contain an emoji, separates text from emoji for the emoji
    comments, and hands those to ``preprocess_and_save_data``. The comments
    without emoji are pickled to ``no_emojis.p`` in the current working
    directory.
    """
    comments = get_comments(test_data)
    # A threshold of len(comments) means every comment is considered.
    emoji_comments, no_emoji_comments = split_emoji_data(comments, len(comments))
    inputs, labels = extract_emoji_target(emoji_comments)
    preprocess_and_save_data(inputs, labels)
    # Context manager guarantees the file is flushed and closed.
    with open('no_emojis.p', 'wb') as handle:
        pickle.dump(no_emoji_comments, handle)

In [None]:
# Execute the preprocessing pipeline when this cell runs; it writes
# 'preprocess.p' and 'no_emojis.p' to the current working directory.
get_data()