In [1]:
import pandas as pd
import json
import pickle
import os

In [16]:
# Directory holding one Face API response per post, stored as "<shortcode>.json".
in_path = "/mnt/e/Data/selfies/face_api_json"

# Only consider JSON files (a stray non-JSON entry would otherwise be turned
# into a bogus shortcode), and sort them because os.listdir returns entries
# in arbitrary order, which would make the row order non-reproducible.
shortcodes_analysed_json = sorted(
    name for name in os.listdir(in_path) if name.endswith(".json")
)
# Strip the extension explicitly instead of slicing off a fixed number of characters.
shortcodes_analysed = [os.path.splitext(name)[0] for name in shortcodes_analysed_json]

data_list = []
for shortcode in shortcodes_analysed:
    # Read the file and close it right away; the processing below needs no open handle.
    with open(os.path.join(in_path, shortcode + ".json"), "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # A response with no entries contributes no row.
    if len(data) == 0:
        continue

    # Selfie: the first entry is used as the main data.
    # NOTE(review): this assumes the API lists the biggest rectangle first - confirm.
    face = data[0]
    face["shortcode"] = shortcode
    face["num_faces"] = len(data)

    attributes = face["faceAttributes"]

    # Start from fixed defaults so these three accessory keys are always present,
    # then overwrite them with the reported confidences.
    accessories = {
        "glasses": 0.0,
        "headwear": 0.0,
        "mask": 0.0,
    }
    accessories.update(
        {item["type"]: item["confidence"] for item in attributes["accessories"]}
    )
    attributes["accessories"] = accessories

    # Turn the list of {color, confidence} dicts into a {color: confidence} mapping
    # so json_normalize expands it into one column per colour.
    attributes["hair"]["hairColor"] = {
        item["color"]: item["confidence"] for item in attributes["hair"]["hairColor"]
    }

    data_list.append(face)

# pd.json_normalize is the public entry point; the pd.io.json alias is deprecated.
df_face = pd.json_normalize(data_list, sep="_")

df_face = df_face.drop(columns="faceId")

In [34]:
# to_csv does not create missing parent directories, so make sure "data" exists
# before writing (otherwise a fresh checkout fails here).
os.makedirs("data", exist_ok=True)
df_face.to_csv(os.path.join("data", "face_api.csv"), index=False)

In [17]:
# Directory holding one Instagram post payload per file, stored as "<shortcode>.json".
in_path = "/mnt/e/Data/selfies/instagram_json"

# Only consider JSON files, and sort them because os.listdir returns entries
# in arbitrary order, which would make the row order non-reproducible.
shortcodes_analysed_json = sorted(
    name for name in os.listdir(in_path) if name.endswith(".json")
)
# Strip the extension explicitly instead of slicing off a fixed number of characters.
shortcodes_analysed = [os.path.splitext(name)[0] for name in shortcodes_analysed_json]

data_list = []
for shortcode in shortcodes_analysed:
    # Decode explicitly as UTF-8 rather than relying on the platform default encoding.
    with open(os.path.join(in_path, shortcode + ".json"), "r", encoding="utf-8") as fin:
        data = json.load(fin)

    # Keep only the media node of each payload.
    data_list.append(data["graphql"]["shortcode_media"])

# pd.json_normalize is the public entry point; the pd.io.json alias is deprecated.
df_instagram = pd.json_normalize(data_list, sep="_")

In [18]:
# Columns removed from the Instagram frame, with the reason for each.
remove_cols = [
    "dash_info_is_dash_eligible",                    # Too little data
    "dash_info_number_of_qualities",                 # Too little data
    "dash_info_video_dash_manifest",                 # Too little data
    "display_resources",                             # display_url is enough
    "edge_media_preview_like_edges",                 # Same across whole column
    "edge_media_to_caption_edges",                   # Remove for simplicity
    "edge_media_to_comment_edges",                   # Remove for simplicity
    "edge_media_to_comment_page_info_end_cursor",    # Too little data
    "edge_media_to_comment_page_info_has_next_page", # Remove for simplicity
    "edge_media_to_sponsor_user_edges",              # Same across whole column
    "edge_media_to_tagged_user_edges",               # Remove for simplicity
    "edge_sidecar_to_children_edges",                # We are only dealing with single images
    "edge_web_media_to_related_media_edges",         # Same across whole column
    "encoding_status",                               # Too little data
    "gating_info",                                   # Too little data
    "has_ranked_comments",                           # Same across whole column
    "id",                                            # Not needed
    "is_ad",                                         # Same across whole column
    "is_published",                                  # is_published == is_video??
    "is_video",                                      # We are only dealing with images
    "location",                                      # Too little data
    "location_address_json",                         # Negligible | Low data
    "location_has_public_page",
    "location_id",
    "location_name",
    "location_slug",
    "media_preview",                                 # Irrelevant
    "owner_blocked_by_viewer",                       # Same across whole column
    "owner_followed_by_viewer",                      # Same across whole column
    "owner_full_name",                               # We will use owner_username
    "owner_has_blocked_viewer",                      # Same across whole column
    "owner_is_private",                              # Same across whole column
    "owner_is_unpublished",                          # Same across whole column
    "owner_profile_pic_url",                         # Remove for simplicity
    "owner_requested_by_viewer",                     # Same across whole column
    "product_type",                                  # May be for videos only?
    "should_log_client_event",                       # Same across whole column
    "thumbnail_src",                                 # Videos only
    "title",                                         # Too little data
    "tracking_token",                                # Not needed
    "video_duration",                                # Videos only
    "video_url",                                     # Videos only
    "video_view_count",
    "viewer_can_reshare",                            # Same across whole column
    "viewer_has_liked",
    "viewer_has_saved",
    "viewer_has_saved_to_collection",
    "viewer_in_photo_of_you"
]

# json_normalize only creates a column when at least one record carries the key,
# so some of these sparse columns may be absent depending on the scraped posts.
# errors="ignore" drops the ones that exist instead of raising KeyError.
df_instagram = df_instagram.drop(columns=remove_cols, errors="ignore")

# Keep single-image posts whose comments are enabled; once the filter is applied
# both helper columns are constant, so they are dropped as well.
is_single_image = df_instagram["__typename"] == "GraphImage"
comments_enabled = df_instagram["comments_disabled"].eq(False)
df_instagram = df_instagram[is_single_image & comments_enabled].drop(
    columns=["__typename", "comments_disabled"]
)

In [33]:
# to_csv does not create missing parent directories, so make sure "data" exists
# before writing (otherwise a fresh checkout fails here).
os.makedirs("data", exist_ok=True)
df_instagram.to_csv(os.path.join("data", "instagram.csv"), index=False)

In [23]:
# Inner join on shortcode: only posts present in both frames are kept.
df_merge = pd.merge(df_instagram, df_face, how="inner", on="shortcode")

In [32]:
# Summarise the merged rows whose like count is above 50.
above_50_likes = df_merge["edge_media_preview_like_count"] > 50
df_merge.loc[above_50_likes].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 933 entries, 0 to 1640
Columns: 114 entries, accessibility_caption to num_faces
dtypes: bool(8), float64(83), int64(10), object(13)
memory usage: 787.2+ KB


In [27]:
# Write the merged frame to its own file. It previously went to
# data/instagram.csv, the same path used for the df_instagram export, so
# whichever cell ran last silently overwrote the other's output.
# to_csv does not create missing parent directories, hence the makedirs.
os.makedirs("data", exist_ok=True)
df_merge.to_csv(os.path.join("data", "instagram_face_merged.csv"), index=False)

0                             Image may contain: 1 person
1                             Image may contain: 1 person
2                             Image may contain: 1 person
3                         No photo description available.
4                         No photo description available.
5                             Image may contain: 1 person
6                             Image may contain: 2 people
7                             Image may contain: 3 people
8                             Image may contain: 2 people
9                             Image may contain: 1 person
10                            Image may contain: 1 person
11                            Image may contain: 1 person
12      Image may contain: 1 person, hat, tree, outdoo...
13                            Image may contain: 1 person
14                            Image may contain: 1 person
15                   Image may contain: 1 person, closeup
16      Image may contain: 2 people, people smiling, c...
17            