# Project TT Instagram Database
- This notebook shows how we turn a batch of scraped Instagram posts into a database

In [None]:
# Notebook plumbing: enable IPython's autoreload extension so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
sys.path.append('..')
from time import sleep

from igramscraper.instagram import Instagram
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
%matplotlib inline  

# import previously defined functions
from core.utils import get_thumbnail, show_thumbnail, imresize
from core.instagram import get_media_by_url
from core.envs import DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR

In [None]:
# login
# Credentials are read from the environment (IG_USERNAME / IG_PASSWORD) so that
# real account details are never typed into, or committed with, this notebook.
# A missing variable raises KeyError naming the variable that must be set.
instagram = Instagram(sleep_between_requests=15)
# NOTE(review): the third argument is presumably the session cache folder -- confirm
instagram.with_credentials(os.environ['IG_USERNAME'], os.environ['IG_PASSWORD'], 'cache')
instagram.login()

Our MVP spec says:
"We will store our data as a [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) file. We will start with fixed data"

Eventually we want a proper database. Options are:
- SQL
- NoSQL
- GraphQL
- etc.

But we don't want to worry about that now.

Let's create a dataset with 1000 images!

In [None]:
# Fetch media for the 'newyork' tag, requesting 1000 items, using the
# logged-in `instagram` client created above.
medias = instagram.get_medias_by_tag('newyork', count=1000)  # this will take some time!

Now that they are scraped, let's save them in a database!
- We will create a `pandas.DataFrame`
- We will then use pandas functionality to generate a CSV file

Let's first review which attributes to save (note that this is a list of limited media content, which doesn't have the location tag).

In [None]:
def media_to_row(media):
    """Convert a media object into a flat row (list) for the dataset.

    The media object is only read, never modified, so calling this function
    repeatedly on the same object (e.g. when re-running a cell) always gives
    the same row and leaves the scraped data untouched.

    Args:
        media: object exposing the attributes read below (``identifier``,
            ``short_code``, ``link``, ``owner``, ...). ``thumbnail_src`` and
            the ``carousel_*`` attributes are optional.

    Returns:
        list: 21 values ordered as: media id, short code, link, owner id,
        owner username, owner full name, type, created time, likes count,
        thumbnail url, high-resolution url, the four carousel fields,
        caption, comments count, comments, location id, name and slug.

    Raises:
        AttributeError: if a non-optional attribute is missing from ``media``.
    """
    # `thumbnail_src`, when present, takes precedence over `image_thumbnail_url`
    if hasattr(media, 'thumbnail_src'):
        thumbnail_url = media.thumbnail_src
    else:
        thumbnail_url = media.image_thumbnail_url

    # media without extracted carousels get empty-string defaults for all
    # four carousel fields; presence of `carousel_ids` is the marker
    if hasattr(media, 'carousel_ids'):
        carousel_fields = [
            media.carousel_ids,
            media.carousel_types,
            media.carousel_thumbnail_urls,
            media.carousel_image_highres_urls,
        ]
    else:
        carousel_fields = ['', '', '', '']

    owner = media.owner
    return [
        media.identifier,
        media.short_code,
        media.link,
        owner.identifier,
        owner.username,
        owner.full_name,
        media.type,
        media.created_time,
        media.likes_count,
        thumbnail_url,
        media.image_high_resolution_url,
        *carousel_fields,
        media.caption,
        media.comments_count,
        media.comments,
        media.location_id,
        media.location_name,
        media.location_slug,
    ]

In [None]:
# keep the first scraped media as a sample; it is converted to a row and
# printed further down to inspect the column layout
m = medias[0]

In [None]:
# Column names for the dataset, in the exact order media_to_row() emits values.
columns = [
    # post identity
    'media_id', 'media_code', 'media_link',
    # author
    'user_id', 'username', 'user_full_name',
    # post metadata
    'type', 'created_time', 'likes_count',
    # image urls
    'img_thumbnail_url', 'img_highres_url',
    # carousel (multi-image post) fields
    'carousel_ids', 'carousel_types',
    'carousel_thumbnail_urls', 'carousel_highres_urls',
    # text and engagement
    'caption', 'comments_count', 'comments',
    # location
    'location_id', 'location_name', 'location_slug',
]

sample_row = media_to_row(m)
# zip() silently drops trailing items when lengths differ, so fail loudly if
# the column list and the row layout ever drift out of sync
assert len(columns) == len(sample_row), (
    f'{len(columns)} column names but {len(sample_row)} row values'
)
for name, value in zip(columns, sample_row):
    print(f'[{name}]: {value}')

Now let's create a data spreadsheet using pandas

In [None]:
# Convert every scraped media into a row. A comprehension is used so the loop
# variable stays local and does not overwrite the sample `m` defined earlier
# (which would change what the sample cell prints if it is re-run).
rawdata = [media_to_row(media) for media in medias]

In [None]:
# Assemble the converted rows into a table, one column per name in `columns`,
# then preview the first three rows.
df = pd.DataFrame(data=rawdata, columns=columns)
df.head(n=3)

In [None]:
# Write the table to a CSV file under DATA_DIR (resolved relative to the
# parent directory). Fields are quoted with single quotes, so the file must
# be read back with the same quotechar.
csv_path = os.path.join('..', DATA_DIR, 'newyork_20191124.csv')
df.to_csv(csv_path, quotechar="'")

Done! 

For fun... let's check what the most liked photos are among these 1000 posts

In [None]:
# Show the thumbnails of the most-liked posts, best first.
TOP_N = 3  # how many of the most-liked posts to display

df_sorted = df.sort_values(by=['likes_count'], ascending=False).head(TOP_N)
# Loop over the rows actually present: with fewer than TOP_N posts in the
# dataset a fixed range(TOP_N) would raise IndexError on .iloc
for i in range(len(df_sorted)):
    curr_data = df_sorted.iloc[i]
    url = curr_data.img_thumbnail_url
    best_thumbnail = get_thumbnail(url)
    show_thumbnail(best_thumbnail, f'#{i+1}: {curr_data.likes_count} likes')

... I don't get people!