## Overview

This notebook uses the Detect Language Python API to determine the language of each comment.

### Imports

In [None]:
import json
import pandas as pd
import numpy as np
import detectlanguage
import config
import string
import re
from nltk.corpus import stopwords

%matplotlib inline

### Configs

In [None]:
# Placeholder value: substitute a real Detect Language API key before running.
# NOTE(review): `config` is imported in the Imports cell but not used here —
# presumably the key was meant to be read from it; confirm.
detectlanguage.configuration.api_key = "your detectlanguage_api_key"

### Helpers

In [None]:
def is_emoji(text):
    """Return True if ``text`` contains an emoji or other pictographic symbol.

    A character is treated as an emoji when its Unicode general category is
    ``So`` (Symbol, other). Accented and non-Latin letters are not flagged,
    so words such as 'Sabía' are kept and stay available to the language
    detector instead of being thrown away with the emoji.

    Note: this is an approximation. Emoji built from punctuation, digits or
    math characters (e.g. keycap sequences) are not in ``So`` and will not
    be flagged.

    Args:
        text: The string to inspect (typically one whitespace-delimited word).

    Returns:
        bool: True if any character of ``text`` is in category ``So``.
    """
    # Function-scope import keeps this helper self-contained.
    import unicodedata

    return any(unicodedata.category(char) == 'So' for char in text)

def is_username(text):
    """Tell whether ``text`` starts with an @-mention.

    A mention is an ``@`` followed by at least one ASCII letter, digit,
    underscore or period. Only the beginning of ``text`` is examined.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when ``text`` begins with a mention, otherwise False.
    """
    mention = re.match("@[A-Za-z0-9_.]+", text)
    return mention is not None
    
def is_hashtag(text):
    """Return True if ``text`` starts with a hashtag.

    A hashtag is a ``#`` followed by at least one word character or period.
    ``\\w`` is Unicode-aware for ``str`` patterns, so hashtags written with
    accented or non-Latin letters (e.g. ``#été``) are recognised as well as
    plain ASCII ones. Only the beginning of ``text`` is examined.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when ``text`` begins with a hashtag, otherwise False.
    """
    return re.match(r"#[\w.]+", text) is not None

def cleaned_text_for_langdect(word_list):
    """Build the lower-cased text that is sent for language detection.

    Every word for which ``is_emoji``, ``is_username`` or ``is_hashtag``
    returns True is left out; the remaining words are lower-cased and
    joined with single spaces.

    Args:
        word_list: Iterable of word strings from one comment.

    Returns:
        str: The surviving words joined by spaces ('' if none survive).
    """
    kept_words = []
    for word in word_list:
        # Same check order as before: emoji, then username, then hashtag.
        if is_emoji(word) or is_username(word) or is_hashtag(word):
            continue
        kept_words.append(word.lower())
    return ' '.join(kept_words)

def get_language(text):
    """Detect the language of ``text`` with the detectlanguage client.

    Empty or whitespace-only text is never sent to the service: there is
    nothing to detect, and cleaned comments that consisted solely of
    filtered-out words would otherwise each cost a remote request.

    Args:
        text: The cleaned comment text.

    Returns:
        dict | None: A copy of the first result returned by
        ``detectlanguage.detect`` with an extra ``'text'`` key holding the
        input, or None when the text is blank or the service returns no
        results.
    """
    if not text or not text.strip():
        return None

    lang_output = detectlanguage.detect(text)
    if not lang_output:
        return None

    # Copy so the client's own result object is not mutated.
    # NOTE(review): presumably results are ordered best-first — confirm
    # against the client documentation.
    lang_dict = dict(lang_output[0])
    lang_dict['text'] = text
    return lang_dict

In [None]:
# Load the mapping that is later looked up by detected language code.
# The handle is named `key_file` so the builtin `input` is not overwritten
# for the rest of the session, and the encoding is pinned because the
# platform default is not guaranteed to be UTF-8.
with open('detect_language_key.json', 'r', encoding='utf-8') as key_file:
    language_key = json.load(key_file)

### Data Imports and Cleaning

In [None]:
# Read the comments file, then order the rows by their `created_at` column.
post_df = pd.read_csv('loreal_comments.csv')
post_df = post_df.sort_values(by='created_at')

In [None]:
## Prepping comments for language detection.
## A row with no text is read by pandas as NaN (a float), which has no
## .split(); keep only real strings so one blank comment cannot abort the run.
comments = [comment for comment in post_df.text if isinstance(comment, str)]
comments_split = [comment.split() for comment in comments]
text_list = [cleaned_text_for_langdect(word_list) for word_list in comments_split]

### Detecting language in comments

In [None]:
## Run every cleaned comment through the detector and collect the results.
## Comments for which nothing comes back are skipped.
comment_languages_found = []
for text in text_list:
    lang_dict = get_language(text)
    if not lang_dict:
        continue
    comment_languages_found.append(lang_dict)

In [None]:
## Keep only the detections whose 'isReliable' flag is set, then pull out
## the language code of each one.
reliable_langs_found = [lang_dict for lang_dict in comment_languages_found if lang_dict['isReliable']]
languages_found = [lang_dict['language'] for lang_dict in reliable_langs_found]

In [None]:
# Look each detected code up in language_key, then count comments per entry.
language_names = [language_key[lang_initials] for lang_initials in languages_found]
langs_series = pd.Series(language_names).value_counts()

### Interpreting results

In [None]:
# Turn the counts Series into a two-column frame: the former index
# (the language) followed by its count.
langs_df = langs_series.reset_index()
langs_df.columns = ['language', 'comment_count']

In [None]:
# Rank once and slice both pieces from the same ordering, so the top four
# and the "other" remainder can never overlap or skip a row.
ranked_langs_df = langs_df.sort_values(by=['comment_count'], ascending=False)
top4_df = ranked_langs_df.iloc[:4]
other_count = ranked_langs_df.iloc[4:]['comment_count'].sum()

In [None]:
# Add an "Other" row holding the remainder, then compute each row's share.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# on both old and new pandas versions.
other_row = pd.DataFrame([{'language': 'Other', 'comment_count': other_count}])
top5_df = pd.concat([top4_df, other_row], ignore_index=True)
total = top5_df['comment_count'].sum()
top5_df['normalized'] = top5_df['comment_count'] / float(total)

In [None]:
# Display the summary table: the top languages plus the "Other" row, with counts and shares.
top5_df

In [None]:
# Scratch check with hard-coded counts — presumably copied from an earlier
# top5_df result; TODO confirm or remove.
x = np.array([54,3,2,2,5])

In [None]:
# Each entry's share of the total, mirroring the `normalized` column of top5_df.
x/x.sum()