# Preliminary Playaround

## Loading data

### Prepare columns of interest and specify their types

In [25]:
import pandas as pd

# Column name -> nullable pandas dtype for every field we want out of the CSV.
# Start with the account-level fields; per-post and per-reel fields are appended below.
DTYPE_DICT = {
    "user_id": pd.Int64Dtype(),
    "status": pd.StringDtype(),
    "timestamp": pd.Int64Dtype(),
    "data.user_data.meta.is_private": pd.BooleanDtype(),
    "data.user_data.meta.is_verified": pd.BooleanDtype(),
    "data.user_data.meta.is_business_account": pd.BooleanDtype(),
    "data.user_data.meta.is_professional_account": pd.BooleanDtype(),
    "data.user_data.meta.has_reel": pd.BooleanDtype(),
    "data.user_data.meta.total_posts_count": pd.Int64Dtype(),
    "data.user_data.meta.followers_count": pd.Int64Dtype(),
    "data.user_data.meta.followings_count": pd.Int64Dtype(),
}

# Post columns are named "data.user_data.post.<index>.<metric>"
post_column_pattern = "data.user_data.post.{}.{}"
post_metrics = {
    "is_video": pd.BooleanDtype(),
    "video_view_count": pd.Int64Dtype(),
    "comments_count": pd.Int64Dtype(),
    "comments_disabled": pd.BooleanDtype(),
    "taken_at_timestamp": pd.Int64Dtype(),
    "likes_count": pd.Int64Dtype(),
    "is_pinned": pd.BooleanDtype(),
}

# Register every metric for posts 1..12 (post index outermost, so columns stay grouped per post)
DTYPE_DICT.update(
    {
        post_column_pattern.format(post_no, field): field_dtype
        for post_no in range(1, 13)
        for field, field_dtype in post_metrics.items()
    }
)

# Reel columns are named "data.user_data.reel.<index>.<metric>"
reel_column_pattern = "data.user_data.reel.{}.{}"
reel_metrics = {
    "has_audio": pd.BooleanDtype(),
    "video_view_count": pd.Int64Dtype(),
    "comments_count": pd.Int64Dtype(),
    "comments_disabled": pd.BooleanDtype(),
    "taken_at_timestamp": pd.Int64Dtype(),
    "likes_count": pd.Int64Dtype(),
    "video_duration": pd.Float64Dtype(),
}

# Register every metric for reels 1..36, same layout as the posts
DTYPE_DICT.update(
    {
        reel_column_pattern.format(reel_no, field): field_dtype
        for reel_no in range(1, 37)
        for field, field_dtype in reel_metrics.items()
    }
)

# The dict keys, in insertion order, double as the `usecols` list for `read_csv()`
columns_to_read = [*DTYPE_DICT]

# Preview
# TODO: Later may copy the dict output directly into the constant module, or to make reproducible, keep the above code as is
DTYPE_DICT

{'user_id': Int64Dtype(),
 'status': string[python],
 'timestamp': Int64Dtype(),
 'data.user_data.meta.is_private': BooleanDtype,
 'data.user_data.meta.is_verified': BooleanDtype,
 'data.user_data.meta.is_business_account': BooleanDtype,
 'data.user_data.meta.is_professional_account': BooleanDtype,
 'data.user_data.meta.has_reel': BooleanDtype,
 'data.user_data.meta.total_posts_count': Int64Dtype(),
 'data.user_data.meta.followers_count': Int64Dtype(),
 'data.user_data.meta.followings_count': Int64Dtype(),
 'data.user_data.post.1.is_video': BooleanDtype,
 'data.user_data.post.1.video_view_count': Int64Dtype(),
 'data.user_data.post.1.comments_count': Int64Dtype(),
 'data.user_data.post.1.comments_disabled': BooleanDtype,
 'data.user_data.post.1.taken_at_timestamp': Int64Dtype(),
 'data.user_data.post.1.likes_count': Int64Dtype(),
 'data.user_data.post.1.is_pinned': BooleanDtype,
 'data.user_data.post.2.is_video': BooleanDtype,
 'data.user_data.post.2.video_view_count': Int64Dtype(),
 '

### Read and check

In [None]:
# Read only the selected columns, parsing each one directly into its nullable dtype
df = pd.read_csv(
    '.././data/input/instagram.csv',
    encoding='utf-8',
    index_col=False,  # do not use the first column as the index
    usecols=columns_to_read,
    dtype=DTYPE_DICT,
)

# Get infos
# `DataFrame.info()` prints its report itself and returns None, so call it bare:
# wrapping it in print() added a stray "None" line to the output.
df.info()

# Check types
print(df.dtypes)

# Check all cols for the first 10 rows (column limit lifted only inside this block)
with pd.option_context('display.max_columns', None):
    print(df.head(n=10))

# One strange value found during playaround:
# this timestamp was written in scientific notation in the raw CSV; decide what to do about it later.
# df[df["user_id"] == 1098748]["data.user_data.reel.13.taken_at_timestamp"].values

<IntegerArray>
[1719000000]
Length: 1, dtype: Int64

### Optional: Creating a subset for faster manipulations

In [28]:
# Take the leading 100 rows of the full frame as a small working sample
subset_df = df.head(n=100)

# Write the sample next to the source CSV, leaving the row index out of the file
subset_df.to_csv(
    '.././data/input/instagram_subset.csv',
    index=False,
)