### Import Modules

In [None]:
import pandas as pd
import numpy as np

### Prepare columns of interest and specify their types

In [None]:
# Account-level columns to load, mapped to the nullable pandas dtype each should be parsed as.
DTYPE_DICT = {
    "user_id": pd.Int64Dtype(),
    "status": pd.StringDtype(),
    "timestamp": pd.Int64Dtype(),
    "data.user_data.meta.is_private": pd.BooleanDtype(),
    "data.user_data.meta.is_verified": pd.BooleanDtype(),
    "data.user_data.meta.is_business_account": pd.BooleanDtype(),
    "data.user_data.meta.is_professional_account": pd.BooleanDtype(),
    "data.user_data.meta.has_reel": pd.BooleanDtype(),
    "data.user_data.meta.total_posts_count": pd.Int64Dtype(),
    "data.user_data.meta.followers_count": pd.Int64Dtype(),
    "data.user_data.meta.followings_count": pd.Int64Dtype(),
}

# Number of numbered post / reel slots whose columns are requested (slots are 1-based).
N_POSTS = 12
N_REELS = 36

# Per-post metrics; the pattern is filled with (slot number, metric name).
post_column_pattern = "data.user_data.post.{}.{}"
post_metrics = {
    "is_video": pd.BooleanDtype(),
    "video_view_count": pd.Int64Dtype(),
    "comments_count": pd.Int64Dtype(),
    "comments_disabled": pd.BooleanDtype(),
    "taken_at_timestamp": pd.Int64Dtype(),
    "likes_count": pd.Int64Dtype(),
    "is_pinned": pd.BooleanDtype(),
}

# Per-reel metrics; the pattern is filled with (slot number, metric name).
reel_column_pattern = "data.user_data.reel.{}.{}"
reel_metrics = {
    "has_audio": pd.BooleanDtype(),
    "video_view_count": pd.Int64Dtype(),
    "comments_count": pd.Int64Dtype(),
    "comments_disabled": pd.BooleanDtype(),
    "taken_at_timestamp": pd.Int64Dtype(),
    "likes_count": pd.Int64Dtype(),
    "video_duration": pd.Float64Dtype(),
}

# Register one column per (post slot, metric): all posts first, slot by slot.
DTYPE_DICT.update({
    post_column_pattern.format(slot, metric_name): metric_dtype
    for slot in range(1, N_POSTS + 1)
    for metric_name, metric_dtype in post_metrics.items()
})

# Then one column per (reel slot, metric), again slot by slot.
DTYPE_DICT.update({
    reel_column_pattern.format(slot, metric_name): metric_dtype
    for slot in range(1, N_REELS + 1)
    for metric_name, metric_dtype in reel_metrics.items()
})

# Column names in insertion order, passed to `read_csv()` as `usecols`.
columns_to_read = list(DTYPE_DICT)

### Read Data

In [None]:
# Load only the columns listed in `columns_to_read`, parsing each with the
# nullable dtype declared for it in `DTYPE_DICT`.
read_options = dict(
    encoding='utf-8',
    index_col=False,  # do not use the first column as the index
    usecols=columns_to_read,
    dtype=DTYPE_DICT,
)
data = pd.read_csv('.././data/input/instagram.csv', **read_options)

Keep only Instagram accounts whose status is `successful`, that have reels, and that are not private.

In [None]:
# Row filter: keep accounts whose status is 'successful', that have reels, and that are not private.
# The two flag columns are loaded with pandas' nullable boolean dtype, so a missing flag
# compares to <NA> rather than True/False.
# NOTE(review): boolean indexing is expected to treat <NA> as False (row dropped) - confirm on the pinned pandas version.
is_successful = data['status'] == 'successful'
has_reel = data['data.user_data.meta.has_reel'] == True
is_public = data['data.user_data.meta.is_private'] == False
data = data.loc[is_successful & has_reel & is_public]

# Column filter: drop every per-post column so that only reel and account-level columns remain.
# regex=False makes this a literal substring match; by default `str.contains` interprets the
# pattern as a regular expression, in which each '.' would match any character.
is_post_column = data.columns.str.contains('data.user_data.post', regex=False)
data = data.loc[:, ~is_post_column]

# Drop the status/meta columns that were only needed for the row filter above
# (or are not used downstream).
data = data.drop(columns=[
    'status',
    'data.user_data.meta.is_private',
    'data.user_data.meta.is_verified',
    'data.user_data.meta.is_business_account',
    'data.user_data.meta.is_professional_account',
    'data.user_data.meta.has_reel',
    'data.user_data.meta.total_posts_count',
])


Melt the dataframe so that each row contains the data for a single reel.

In [None]:
# Columns that identify one account row; every other remaining column is a per-reel metric
# named like 'data.user_data.reel.<n>.<field>'.
id_columns = [
    'user_id',
    'data.user_data.meta.followers_count',
    'data.user_data.meta.followings_count',
    'timestamp',
]

# Wide -> long: one row per (account row, reel metric column).
melted = data.melt(id_vars=id_columns, var_name='variable', value_name='value')

# Split the column name into the reel number and the metric name.
# NOTE(review): 'reels no.' is extracted as text, so any ordering on it is lexicographic ('1', '10', '11', ...).
melted[['reels no.', 'field']] = melted['variable'].str.extract(r'reel\.(\d+)\.(.+)')

# Long -> one row per (account row, reel) with one column per metric.
# No explicit sort is needed beforehand: pivot builds its result from the index labels,
# not from the order of the input rows.
stacked = melted.pivot(
    index=[
        'user_id',
        'reels no.',
        'data.user_data.meta.followers_count',
        'data.user_data.meta.followings_count',
        'timestamp',
    ],
    columns='field',
    values='value',
).reset_index()

cleaned_data = stacked.rename(columns={
    'data.user_data.meta.followers_count': 'followers',
    'data.user_data.meta.followings_count': 'followings',
})

# Drop every reel row that has at least one missing value in any column.
cleaned_data = cleaned_data.dropna()


Inspect the reel timestamps so that samples that are too old or too recent can be filtered out.

In [None]:
# Interpret both epoch columns as seconds and derive each reel's age as
# (row timestamp - time the reel was taken).
taken_at = pd.to_datetime(cleaned_data['taken_at_timestamp'], unit='s')
row_time = pd.to_datetime(cleaned_data['timestamp'], unit='s')
cleaned_data['posted_date'] = taken_at
cleaned_data['timestamp'] = row_time
cleaned_data['reel_age'] = row_time - taken_at

# Whole days of age, computed once and reused for every bucket below.
age_in_days = cleaned_data['reel_age'].dt.days
print("Total reels", len(age_in_days))

# (label, boolean mask) per age bucket; `between` is inclusive on both ends.
age_buckets = [
    ("Reels less than 7 days old: ", age_in_days < 7),
    ("Reels 7-30 days old: ", age_in_days.between(7, 30)),
    ("Reels 1-3 months: ", age_in_days.between(31, 90)),
    ("Reels 3-12 months: ", age_in_days.between(91, 365)),
    ("Reels 1-2 years: ", (age_in_days > 365) & (age_in_days <= 365 * 2)),
    ("Reels > 2 years: ", age_in_days > 365 * 2),
]
for label, in_bucket in age_buckets:
    print(label, int(in_bucket.sum()))

Take out reels that are less than 7 days old

In [None]:
# Minimum age, in whole days, a reel must have to be kept.
MIN_REEL_AGE_DAYS = 7

# Keep only reels that are at least MIN_REEL_AGE_DAYS old.
cleaned_data = cleaned_data[cleaned_data['reel_age'].dt.days >= MIN_REEL_AGE_DAYS]

# The raw epoch column is redundant once 'posted_date' exists. errors='ignore' keeps this
# cell re-runnable: on a second run the column is already gone and drop() would otherwise raise KeyError.
cleaned_data = cleaned_data.drop(columns=['taken_at_timestamp'], errors='ignore')
cleaned_data