## **Libraries**

In [1]:
import glob
import os
import zipfile

In [4]:
from datetime import datetime

In [2]:
import pandas as pd

In [3]:
from bs4 import BeautifulSoup

## **Environment Variables**

In [16]:
# Root of the data directory. Stored without a trailing slash because every
# consumer adds its own separator; a trailing slash here yields '../data//...'.
data_path = '../data'

## **Extract Data**

In [5]:
# Current local year and month as a YYYYMM integer (e.g. 202405).
# strftime states the format explicitly instead of slicing str(datetime).
month = int(datetime.now().strftime('%Y%m'))

In [None]:
# Archive for the current month and the folder its contents are unpacked into.
# os.path.join gives a clean path whether or not data_path ends with a
# separator (plain f'{data_path}/raw/...' doubles the slash when it does).
original_path = os.path.join(data_path, 'raw', f'juanfeds-{month}.zip')

destination_path = os.path.join(data_path, 'processed', str(month))
os.makedirs(destination_path, exist_ok=True)

In [22]:
# Unpack every member of the archive into the destination folder.
with zipfile.ZipFile(original_path, mode='r') as archive:
    archive.extractall(path=destination_path)

## **Read Data**

In [43]:
def build_df(path: str, category: str) -> pd.DataFrame:
    """Parse an exported connections HTML page into a DataFrame.

    Every element matched by the CSS selector ``div._a705 > main > div`` is
    treated as one entry. The entry's visible text is split on whitespace and
    read positionally as ``<username> <month> <day> <year> ...``; the URL is
    the ``href`` of the first ``<a>`` found inside the entry.

    Args:
        path: Location of the HTML file; it is read as UTF-8.
        category: Prefix used for the column names (e.g. ``'following'``).

    Returns:
        A DataFrame with one row per entry and the columns
        ``{category}_username``, ``{category}_url``, ``{category}_year``,
        ``{category}_month`` and ``{category}_day``. Date parts are kept as
        strings with commas removed from day and year. A field that cannot
        be found in an entry is ``None``; entries without any text are
        skipped because they carry no username.
    """
    columns = [
        f'{category}_username',
        f'{category}_url',
        f'{category}_year',
        f'{category}_month',
        f'{category}_day',
    ]

    # Read and parse the HTML file
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    rows = []
    for entry in soup.select('div._a705 > main > div'):
        # Split once on any whitespace run so repeated spaces cannot shift
        # the positional fields.
        tokens = entry.get_text(separator=' ', strip=True).split()
        if not tokens:
            continue

        # Pad short entries so missing fields become None instead of raising
        # IndexError.
        username, month_name, day, year = (tokens + [None] * 4)[:4]

        # An entry without a link (or a link without href) gets a None URL
        # rather than aborting the whole parse.
        link = entry.find('a')
        url = link.get('href') if link is not None else None

        rows.append([
            username,
            url,
            year.replace(',', '') if year is not None else None,
            month_name,
            day.replace(',', '') if day is not None else None,
        ])

    return pd.DataFrame(rows, columns=columns)


In [44]:
# Load the "following" list from the extracted export.
following_path = f'{destination_path}/connections/followers_and_following/following.html'
df_following = build_df(path=following_path, category='following')

In [45]:
# The numeric suffix in followers_1.html suggests the export can be split into
# several parts (followers_2.html, ...) -- TODO confirm against a multi-part
# export. Every matching part is loaded so that followers listed in a later
# part are not reported as unfollowers. With a single part the result is the
# same as reading followers_1.html alone.
followers_pattern = f'{destination_path}/connections/followers_and_following/followers_*.html'
followers_paths = sorted(glob.glob(followers_pattern))
if not followers_paths:
    raise FileNotFoundError(f'No followers file matches {followers_pattern}')

# Kept for any later cell that still refers to the single-file variable.
followers_path = followers_paths[0]

df_followers = pd.concat(
    [build_df(part_path, 'followers') for part_path in followers_paths],
    ignore_index=True,
)

In [52]:
def unfollowers(df_following: pd.DataFrame, df_followers: pd.DataFrame) -> pd.DataFrame:
    """Return the followed accounts that have no matching follower row.

    The two frames are outer-merged on ``following_username`` /
    ``followers_username``; rows whose ``followers_username`` is missing after
    the merge are kept.

    Args:
        df_following: Frame with at least ``following_username`` and
            ``following_url`` columns.
        df_followers: Frame with at least a ``followers_username`` column.

    Returns:
        A frame with the columns ``following_username`` and ``following_url``
        and a fresh 0-based index.
    """
    merged = df_following.merge(
        df_followers,
        how='outer',
        left_on='following_username',
        right_on='followers_username',
    )

    # A missing follower-side key means nobody with that username follows back.
    not_following_back = merged['followers_username'].isna()
    kept_columns = ['following_username', 'following_url']

    return merged.loc[not_following_back, kept_columns].reset_index(drop=True)


In [54]:
# Find the followed accounts that do not follow back and save them as this
# month's CSV under <data_path>/clean.
df_unfollowers = unfollowers(df_following, df_followers)

clean_dir = os.path.join(data_path, 'clean')
os.makedirs(clean_dir, exist_ok=True)  # to_csv does not create missing folders
df_unfollowers.to_csv(os.path.join(clean_dir, f'unfollowers_{month}.csv'), index=False)