## Overview

This notebook provides some simple tools to extract the most frequently occurring keywords from comments.

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import unicodedata as ud
import string 
from nltk.corpus import stopwords
import nltk

%matplotlib inline

### Helper functions and utilities

In [2]:
def is_username(text):
    """Return True when `text` begins with an @-mention.

    A mention is "@" followed by at least one ASCII letter, digit,
    underscore or period. Only the start of `text` is inspected, so
    trailing characters (e.g. punctuation) do not affect the result.
    """
    return re.match("@[A-Za-z0-9_.]+", text) is not None
    
def is_hashtag(text):
    """Return True when `text` begins with a hashtag.

    A hashtag is "#" followed by at least one word character or period.
    `\\w` is used instead of an ASCII-only class so that tags starting
    with accented or non-Latin letters (e.g. "#été") are recognised;
    everything the previous ASCII-only pattern accepted is still accepted.
    Only the start of `text` is inspected, so trailing characters such as
    emoji or punctuation do not affect the result.
    """
    # For str patterns in Python 3, \w covers Unicode letters/digits and "_",
    # but not emoji or punctuation.
    return re.match(r"#[\w.]+", text) is not None

## Regular expressions for emoji handling.
# unicode_re matches a run of one or more characters drawn from the
# code-point ranges listed below.
unicode_re = re.compile(
    '[{}]+'.format(''.join([
        '\U0001F300-\U0001F64F',
        '\U0001F680-\U0001F6FF',
        '\u2600-\u26FF',
        '\u2700-\u27BF',
    ])),
    flags=re.UNICODE,
)

# skintone_re matches a run of characters in U+1F3FB..U+1F3FF. That range
# lies inside the first unicode_re range, so unicode_re matches them too.
skintone_re = re.compile('[\U0001f3fb-\U0001f3ff]+', flags=re.UNICODE)

def strip_unicode(text):
    """Return `text` with every match of `unicode_re` removed."""
    cleaned = re.sub(unicode_re, '', text)
    return cleaned

In [3]:
# Build the stopword list: NLTK stopwords for several languages,
# every single ASCII letter, and a few extras specific to this corpus.

languages = ['english', 'spanish', 'french']
stops = []
for language in languages:
    stops.extend(stopwords.words(language))

# Each single ASCII letter (upper and lower case) is treated as a stopword.
stops.extend(string.ascii_letters)

username = 'lorealmakeup'
# Blank tokens and the `username` value are filtered out as well.
additional_stopwords = ['', ' ', username]
stops.extend(additional_stopwords)

In [4]:
# Translation table for str.translate that deletes every character in
# string.punctuation from a comment token.
translator = {ord(mark): None for mark in string.punctuation}

### Data Imports and Cleaning

In [5]:
# Load the comments CSV and order the rows by their `created_at` column.
post_df = pd.read_csv('loreal_comments.csv').sort_values('created_at')

In [6]:
# Turn the comment column into one flat list of cleaned words.
# Missing comments (NaN, e.g. from empty CSV cells) are dropped first:
# a float NaN has no .split() and would otherwise abort the whole cell.
comments = post_df.text.dropna().astype(str).tolist()
comments_split = [comment.split() for comment in comments]
# Flatten the per-comment token lists, skipping @-mentions, then for each
# token: remove emojis, lowercase, delete punctuation, and trim whitespace.
words = [
    strip_unicode(item).lower().translate(translator).strip()
    for sublist in comments_split
    for item in sublist
    if not is_username(item)
]
# Remove stopwords and tokens shorter than two characters. The stopword
# list is copied into a set so each membership test is O(1) instead of a
# scan of the whole list; the filtered result is identical.
stop_set = set(stops)
clean_words = [word for word in words if len(word) >= 2 and word not in stop_set]

### Key words

In [7]:
# The 15 most frequent cleaned words, most common first.
pd.Series(clean_words).value_counts().iloc[:15]

love         19
shade        13
colour       10
color        10
beautiful    10
jadore        5
perfect       5
need          4
lovely        4
pretty        4
couleur       4
nice          4
like          4
wow           4
gadhi         3
dtype: int64

### Hash tags

In [8]:
# Every whitespace-delimited token that is_hashtag accepts, lower-cased.
# Tokens are kept whole, so anything glued to the tag stays part of it.
hash_tags = [
    token.lower()
    for tokens in comments_split
    for token in tokens
    if is_hashtag(token)
]

In [9]:
# The 15 most frequent hashtags, most common first.
pd.Series(hash_tags).value_counts().iloc[:15]

#lovecampaign    1
#euquero         1
#simplissime     1
#want            1
#loveit❤️        1
#efficace        1
dtype: int64

### Emojis

In [10]:
## Grab just the comment text. Missing values (NaN) are dropped because
## re.findall raises a TypeError on non-string input.
comments = post_df.text.dropna().astype(str)
## Keep only the characters matched by unicode_re, one joined string per comment
emoji_strings = [''.join(unicode_re.findall(comment)) for comment in comments]
## Split into individual characters and drop the skin-tone modifiers,
## so a modified emoji is counted together with its base emoji
emojis = [
    char
    for emoji_string in emoji_strings
    for char in emoji_string
    if not skintone_re.match(char)
]

In [11]:
# The 10 most frequent emoji characters, most common first.
pd.Series(emojis).value_counts().iloc[:10]

😍    108
❤     27
👌     14
💗      9
👍      8
💓      8
😭      7
👏      6
💄      6
🌸      6
dtype: int64