# Foodvocate: Instagram data cleaning
Author: [Meng-Chieh Liu](https://github.com/MengChiehLiu)  
Date: 2023/5/22

In [41]:
import pandas as pd
import os
from collections import defaultdict
from tqdm import tqdm

## Filter

In [42]:
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `paths` (and of everything derived from it) reproducible across runs and machines.
paths = sorted(os.listdir('data/posts'))

# username -> number of posts that passed the like filter (filled by clean_bloggers)
bloggers_filter = defaultdict(int)

def clean_bloggers(path, min_likes=100, counts=None):
    """Tally, per username, the posts in one CSV that reach a like threshold.

    Reads ``'data/posts/' + path``, keeps the rows whose ``like_count`` is at
    least ``min_likes`` and adds one to the tally of that row's ``username``
    for every row kept.

    Parameters
    ----------
    path : str
        File name relative to ``data/posts/``.
    min_likes : int, default 100
        Inclusive lower bound on ``like_count`` for a post to be counted.
    counts : dict-like, optional
        Mapping that is updated in place. When omitted, the module-level
        ``bloggers_filter`` is updated (the original behaviour).

    Returns
    -------
    None
        The result is the in-place update of the tally.
    """
    tally = bloggers_filter if counts is None else counts
    posts = pd.read_csv('data/posts/' + path)
    popular = posts[posts['like_count'] >= min_likes]
    for user in popular['username']:
        # .get keeps this working for plain dicts as well as defaultdicts
        tally[user] = tally.get(user, 0) + 1

In [43]:
# NOTE(review): `train_data` is not referenced anywhere else in the visible notebook.
train_data = []

# Start from an empty tally so that re-running this cell on its own does not
# add every file's counts to `bloggers_filter` a second time.
bloggers_filter.clear()

for path in tqdm(paths):
    clean_bloggers(path)

100%|██████████| 108/108 [00:00<00:00, 294.00it/s]


In [44]:
# Row count of the blogger table before any filtering is applied
pd.read_csv('data/bloggers.csv', index_col='Unnamed: 0').shape[0]

3409

In [45]:
# Load the blogger table, attach each blogger's tally from `bloggers_filter`
# (0 when the blogger never appears in it), keep the bloggers whose tally is
# greater than one, and save the result.
bloggers_df = (
    pd.read_csv('data/bloggers.csv', index_col='Unnamed: 0')
    .assign(frequency=lambda frame: [bloggers_filter.get(name, 0) for name in frame.index])
    .loc[lambda frame: frame['frequency'] > 1]
)
bloggers_df.to_csv('data/new_bloggers.csv')

In [46]:
# Number of bloggers left once the frequency filter has been applied
bloggers_df.shape[0]

578

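* After filtering, run `get_fans.py` before continuing with the sections below (the `fans_count` section reads `data/new_bloggers_with_fans.csv`, presumably produced by that script).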

## datasets

In [47]:
def reformat(path, usernames=None):
    """Return one posts file as a time-ordered list of ``(username, taken_at)``.

    Reads ``'data/posts/' + path``, keeps only the rows whose ``username`` is
    one of the accepted names, sorts them by ``taken_at`` and returns the two
    columns as a list of tuples.

    Parameters
    ----------
    path : str
        File name relative to ``data/posts/``.
    usernames : list-like, optional
        Accepted usernames. When omitted, the index of the module-level
        ``bloggers_df`` is used (the original behaviour).

    Returns
    -------
    list of tuple
        ``(username, taken_at)`` pairs in ascending ``taken_at`` order; an
        empty list when no row is kept.
    """
    allowed = bloggers_df.index if usernames is None else usernames
    posts = pd.read_csv('data/posts/' + path)
    # isin is vectorised and always yields a boolean mask, including for a
    # file that has a header but no rows.
    kept = posts[posts['username'].isin(allowed)]
    ordered = kept.sort_values('taken_at')[['username', 'taken_at']]
    return list(ordered.itertuples(index=False, name=None))

In [48]:
# Collect the reformatted posts of every file, skipping files that
# contribute 10 rows or fewer.
datasets = []
for path in tqdm(paths):
    data = reformat(path)
    if len(data) <= 10:
        continue
    datasets.append(data)

100%|██████████| 108/108 [00:00<00:00, 267.36it/s]


* Write the resulting `datasets` to `Instagram.data.datasets.py`.

## bloggers

In [49]:
# Positional id -> blogger name, in the row order of `bloggers_df`
bloggers = dict(enumerate(bloggers_df.index))

* Write the resulting `bloggers` to `Instagram.data.datasets.py`.

## fans_count

In [54]:
# Index label -> value of the 'fans' column, read from the fans CSV
fans = (
    pd.read_csv('data/new_bloggers_with_fans.csv', index_col='Unnamed: 0')
    .loc[:, 'fans']
    .to_dict()
)

In [56]:
# Re-key the fans lookup by the positional ids used in `bloggers`;
# a blogger missing from `fans` raises KeyError.
fans_count = {idx: fans[name] for idx, name in bloggers.items()}

* Write the resulting `fans_count` to `Instagram.data.datasets.py`.

## frequency_count

In [63]:
# Positional id -> frequency, in the row order of `bloggers_df`
frequency_count = dict(enumerate(bloggers_df['frequency'].tolist()))

* Write the resulting `frequency_count` to `Instagram.data.datasets.py`.