By: Motlaq Almajhool

In [2]:
import pandas as pd
import json
from collections import Counter
from googletrans import Translator
from emoji import demojize
import re
import time

# Function: Create DataFrame from JSON

In [59]:
def scrape_info(data):
    """Flatten scraped post JSON into a DataFrame with one row per comment.

    Every post in ``data['GraphImages']`` contributes one row per comment.
    A post without comments contributes a single row whose
    ``comments_content`` is ``'no comment'`` and whose ``comment_user`` is
    ``None``, so every column always has the same length and comment authors
    stay aligned with their comments.

    Parameters
    ----------
    data : dict
        Parsed JSON containing a ``'GraphImages'`` list. Each entry must
        provide ``comments.data``, ``__typename``,
        ``edge_media_preview_like.count``, ``edge_media_to_comment.count``,
        ``id`` and ``username``; the caption is optional.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``usernames``, ``post_captions``, ``like_counts``,
        ``comments_counts``, ``comments_content``, ``comment_user``,
        ``types``, ``id``.

    Raises
    ------
    KeyError
        If a required (non-caption) field is missing from a post or comment.
    """
    columns = ['usernames', 'post_captions', 'like_counts', 'comments_counts',
               'comments_content', 'comment_user', 'types', 'id']
    records = []

    for image in data['GraphImages']:
        # Posts without a caption have an empty ``edges`` list (or lack the
        # key entirely); only those lookup failures fall back to the placeholder.
        try:
            caption = image['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError, TypeError):
            caption = 'no caption'

        # Fields shared by every row generated from this post.
        post_fields = {
            'usernames': image['username'],
            'post_captions': caption,
            'like_counts': image['edge_media_preview_like']['count'],
            'comments_counts': image['edge_media_to_comment']['count'],
            'types': image['__typename'],
            'id': image['id'],
        }

        comments = image['comments']['data']
        if comments:
            for comment in comments:
                records.append({
                    **post_fields,
                    'comments_content': comment['text'],
                    'comment_user': comment['owner']['username'],
                })
        else:
            # Keep the post in the frame, with an explicit empty author so the
            # comment_user column stays aligned with the other columns.
            records.append({
                **post_fields,
                'comments_content': 'no comment',
                'comment_user': None,
            })

    return pd.DataFrame(records, columns=columns)


# Detect and count posts' languages

In [60]:

def count_posts_langs(df):
    """Detect the language of each unique post caption and tally the results.

    Posts are de-duplicated on ``id``. Before detection, emojis are removed
    (via ``demojize`` followed by stripping ``:name:`` spans) and ASCII
    hashtags are dropped. Captions that are missing (the ``'no caption'``
    placeholder), not strings, or blank after cleaning are counted as
    "emoji or no caption" without calling the detector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and ``post_captions``.

    Returns
    -------
    tuple
        ``(ar_percent, posts_languages_count, emoji_posts)`` where
        ``ar_percent`` is the percentage of Arabic posts among the posts with
        a detected language (``0.0`` when there are none),
        ``posts_languages_count`` is a ``Counter`` of detected language codes,
        and ``emoji_posts`` is the number of posts with no classifiable text.
    """
    no_text_label = 'Emoji or No Caption'
    unique_posts = df.drop_duplicates(subset=['id'])
    translator = Translator()  # one client reused for every caption
    post_languages = []

    for caption in unique_posts['post_captions']:
        text = ''
        # The 'no caption' placeholder is English text; sending it to the
        # detector would wrongly count caption-less posts as English.
        if isinstance(caption, str) and caption != 'no caption':
            text = demojize(caption)
            # NOTE: this also removes any ordinary text enclosed between two colons.
            text = re.sub(r':.+?:', '', text)
            text = ' '.join(
                re.sub(r'(#[A-Za-z0-9_.]+)', '', word) for word in text.split(' ')
            )

        if not text.strip():
            post_languages.append(no_text_label)
            continue

        detected = translator.detect(text)
        # English with zero confidence is treated as "nothing to detect".
        if detected.confidence == 0.0 and detected.lang == 'en':
            post_languages.append(no_text_label)
        else:
            post_languages.append(detected.lang)

    posts_languages_count = Counter(post_languages)
    emoji_posts = posts_languages_count.pop(no_text_label, 0)
    classified_total = sum(posts_languages_count.values())
    # Guard against division by zero when no post could be classified.
    if classified_total:
        ar_percent = (posts_languages_count['ar'] / classified_total) * 100
    else:
        ar_percent = 0.0

    return ar_percent, posts_languages_count, emoji_posts

# Detect and count comments' languages

In [61]:
def count_comments_langs(df, sample_size=200, random_state=None):
    """Detect the language of a random sample of comments and tally the results.

    Before detection, emojis (via ``demojize`` followed by stripping
    ``:name:`` spans), ``@mentions`` and Western/Arabic-Indic digits are
    removed. Comments that are missing (the ``'no comment'`` placeholder),
    not strings, or blank after cleaning are counted as "emoji or no text"
    without calling the detector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``comments_content``.
    sample_size : int, optional
        Maximum number of rows to sample (default 200). If ``df`` has fewer
        rows, all of them are used instead of raising ``ValueError``.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible sampling.

    Returns
    -------
    tuple
        ``(ar_percent, comments_languages_count, emoji_comments)`` where
        ``ar_percent`` is the percentage of Arabic comments among the sampled
        comments with a detected language (``0.0`` when there are none),
        ``comments_languages_count`` is a ``Counter`` of detected language
        codes, and ``emoji_comments`` is the number of sampled comments with
        no classifiable text.
    """
    no_text_label = 'Emoji or No Caption'
    # Never request more rows than exist; DataFrame.sample raises otherwise.
    n_rows = min(sample_size, len(df))
    sampled_comments = df.sample(n_rows, random_state=random_state)['comments_content']
    translator = Translator()  # one client reused for every comment
    comment_languages = []

    for comment in sampled_comments:
        text = ''
        # The 'no comment' placeholder is English text; sending it to the
        # detector would wrongly count comment-less posts as English.
        if isinstance(comment, str) and comment != 'no comment':
            text = demojize(comment)
            text = re.sub(r'@([A-Za-z0-9_.]+)', '', text)
            # NOTE: this also removes any ordinary text enclosed between two colons.
            text = re.sub(r':.+?:', '', text)
            text = re.sub(r'[0-9]', '', text)
            text = re.sub(r'[٠-٩]', '', text)

        if not text.strip():
            comment_languages.append(no_text_label)
            continue

        detected = translator.detect(text)
        # English with zero confidence is treated as "nothing to detect".
        if detected.confidence == 0.0 and detected.lang == 'en':
            comment_languages.append(no_text_label)
        else:
            comment_languages.append(detected.lang)

    comments_languages_count = Counter(comment_languages)
    emoji_comments = comments_languages_count.pop(no_text_label, 0)
    classified_total = sum(comments_languages_count.values())
    # Guard against division by zero when no comment could be classified.
    if classified_total:
        ar_percent = (comments_languages_count['ar'] / classified_total) * 100
    else:
        ar_percent = 0.0

    return ar_percent, comments_languages_count, emoji_comments

# Read JSON

In [54]:
# Load the scraped account JSON. The encoding is given explicitly so non-ASCII
# (e.g. Arabic) text is decoded the same way regardless of the platform's
# default locale encoding.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
with open('/Users/motlaq/ig2/tobysestatekw.json', encoding='utf-8') as f:
    data = json.load(f)

In [122]:
# Flatten the parsed JSON into a DataFrame using scrape_info.
df = scrape_info(data)

In [123]:
# count_posts_langs returns (ar_percent, language Counter, emoji/no-caption count);
# despite its name, `languages_count` therefore holds the Arabic percentage of posts.
languages_count, posts_languages, emoji_posts = count_posts_langs(df)

In [3]:
# Pause for a minute before the next batch of detection requests; presumably this
# avoids being rate-limited by the translation service — TODO confirm the limit.
time.sleep(60)

In [125]:
# count_comments_langs returns (ar_percent, language Counter, emoji/no-text count)
# computed on a random sample of the rows in df.
ar_comments, comments_languages, emoji_comments = count_comments_langs(df)

In [126]:
# Percentage of Arabic among classified posts, then among classified sampled comments.
print(languages_count)
print(ar_comments)



40.476190476190474
66.22516556291392


In [127]:
# Per-language counts for posts and comments, followed by the number of
# posts and comments for which no language could be detected.
print(posts_languages)
print(comments_languages)
print(emoji_posts)
print(emoji_comments)


Counter({'en': 50, 'ar': 34})
Counter({'ar': 100, 'en': 45, 'sd': 3, 'arfa': 1, 'ceb': 1, 'ja': 1})
0
49


---

#### The code below is similar to the functions used above. The only difference is that it returns a DataFrame with columns showing the detected language of each observation (post/comment). Maybe I should have used this code in the first place. Anyway, this code was used in the notebook 'Analysing specific coffeeshops', which I used to analyse specific coffeeshops (the last part of the article).

---

In [62]:
def detect_post(post):
    """Return the detected language code of a single post caption.

    Emojis (via ``demojize`` followed by stripping ``:name:`` spans) and
    ASCII hashtags are removed before detection.

    Parameters
    ----------
    post : str
        Caption text. The ``'no caption'`` placeholder, non-string values and
        captions that are blank after cleaning are not sent to the detector.

    Returns
    -------
    str
        The language code reported by the detector, or
        ``'Emoji or No Caption'`` when there is no text to classify.
    """
    no_text_label = 'Emoji or No Caption'

    # The 'no caption' placeholder is English text; detecting it would
    # mislabel caption-less posts as English.
    if not isinstance(post, str) or post == 'no caption':
        return no_text_label

    text = demojize(post)
    # NOTE: this also removes any ordinary text enclosed between two colons.
    text = re.sub(r':.+?:', '', text)
    text = ' '.join(
        re.sub(r'(#[A-Za-z0-9_.]+)', '', word) for word in text.split(' ')
    )

    # Nothing left to classify: skip the network call entirely.
    if not text.strip():
        return no_text_label

    detected = Translator().detect(text)
    # English with zero confidence is treated as "nothing to detect".
    if detected.confidence == 0.0 and detected.lang == 'en':
        return no_text_label
    return detected.lang
    

In [63]:
def detect_comment(comment):
    """Return the detected language code of a single comment.

    Emojis (via ``demojize`` followed by stripping ``:name:`` spans),
    ``@mentions`` and Western/Arabic-Indic digits are removed before
    detection.

    Parameters
    ----------
    comment : str
        Comment text. The ``'no comment'`` placeholder, non-string values and
        comments that are blank after cleaning are not sent to the detector.

    Returns
    -------
    str
        The language code reported by the detector, or
        ``'Emoji or No Caption'`` when there is no text to classify. The
        label is spelled the same way as the one returned by ``detect_post``
        so both language columns can be filtered with a single value.
    """
    no_text_label = 'Emoji or No Caption'

    # The 'no comment' placeholder is English text; detecting it would
    # mislabel comment-less posts as English.
    if not isinstance(comment, str) or comment == 'no comment':
        return no_text_label

    text = demojize(comment)
    text = re.sub(r'@([A-Za-z0-9_.]+)', '', text)
    # NOTE: this also removes any ordinary text enclosed between two colons.
    text = re.sub(r':.+?:', '', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'[٠-٩]', '', text)

    # Nothing left to classify: skip the network call entirely.
    if not text.strip():
        return no_text_label

    detected = Translator().detect(text)
    # English with zero confidence is treated as "nothing to detect".
    if detected.confidence == 0.0 and detected.lang == 'en':
        return no_text_label
    return detected.lang

In [64]:
# Load the scraped account JSON (same file path as the earlier load cell). The
# encoding is given explicitly so non-ASCII (e.g. Arabic) text is decoded the
# same way regardless of the platform's default locale encoding.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
with open('/Users/motlaq/ig2/tobysestatekw.json', encoding='utf-8') as f:
    data = json.load(f)

In [65]:
def createdf_withlang(data, delay=5):
    """Build the scraped DataFrame and add per-row language columns.

    Calls ``scrape_info(data)`` and appends two columns: ``post_lang`` (the
    language of the row's post caption) and ``comment_lang`` (the language of
    the row's comment).

    A post's caption is repeated on every row belonging to that post, so the
    caption language is detected once per distinct caption and reused,
    instead of issuing the same detection request for each comment row.

    Parameters
    ----------
    data : dict
        Parsed JSON accepted by ``scrape_info``.
    delay : float, optional
        Seconds to wait before each comment detection (default 5), spacing
        out consecutive detection requests.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``scrape_info`` with ``post_lang`` and
        ``comment_lang`` columns added.
    """
    df = scrape_info(data)
    caption_lang_cache = {}
    post_langs = []
    comment_langs = []

    for caption, comment in zip(df['post_captions'], df['comments_content']):
        if caption not in caption_lang_cache:
            caption_lang_cache[caption] = detect_post(caption)
        post_langs.append(caption_lang_cache[caption])

        # Pause even on a cache hit so successive comment detections stay spaced out.
        time.sleep(delay)
        comment_langs.append(detect_comment(comment))

    df['post_lang'] = post_langs
    df['comment_lang'] = comment_langs
    return df


In [66]:
# Rebuild the frame with post_lang / comment_lang columns added.
# Note: this rebinds `df`, replacing the frame created by the earlier scrape_info cell.
df = createdf_withlang(data)

In [68]:
# Write the language-annotated frame to tobys.csv in the working directory,
# without the index column.
df.to_csv('tobys.csv', index=False)