In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import re
from difflib import SequenceMatcher

In [None]:
# Headers of the June file mapped to the shared column names used by every
# monthly frame. "Month" is the column added by .assign() below.
june_column_names = {
    "channel name": "username",
    "youTuber":"name",
    "Category": "category1",
    "Category_2": "category2",
    "Subscribers count": "subs",
    "Country": "country",
    "Views avg.": "views",
    "Likes avg": "likes",
    "Comments avg.": "comments",
    "Month": "month"
}

# Read the June ranking, tag each row with its month, harmonise the headers
# and discard every row that has a missing value in any column.
y_june = (
    pd.read_csv('DATA/youtubejune2022.csv')
    .assign(**{'Month': 'June'})
    .rename(june_column_names, axis='columns')
    .dropna(axis='index', how='any')
)
y_june.head(1)

In [None]:
# Headers of the September file mapped to the shared column names. The keys
# must match the CSV headers exactly (note the leading space in " Subscribers"
# and the trailing "\r\n" in the views header).
# NOTE(review): unlike the other months, nothing is mapped to "category1"
# here -- confirm whether the September file has a primary category column.
sep_column_names = {
    "Name": "username",
    "Youtuber":"name",
    "Category_2": "category2",
    " Subscribers": "subs",
    "Country": "country",
    "Avg. views\r\n": "views",
    "Avg. likes": "likes",
    "Avg Comments": "comments",
    "Category_3": "category3",
    "Month": "month"
}

# Read the September ranking, tag each row with its month, harmonise the
# headers, remove the serial-number column and discard incomplete rows.
y_sep = (
    pd.read_csv('DATA/Youtubesep2022.csv')
    .assign(**{'Month': 'September'})
    .rename(sep_column_names, axis='columns')
    .drop(columns="S.no")
    .dropna(axis='index', how='any')
)
y_sep.head(1)

In [None]:
# Headers of the November file mapped to the shared column names. The views
# header starts with a newline character, so the key must include it.
nov_column_names = {
    "Youtube channel": "username",
    "youtuber name":"name",
    "Category": "category1",
    "Followers": "subs",
    "Country": "country",
    "\nViews (Avg.)": "views",
    "Likes (Avg.)": "likes",
    "Comments (Avg.)": "comments",
    "Category-2": "category2",
    "Month": "month"
}

# Read the November ranking, tag each row with its month, harmonise the
# headers, remove the serial-number column and discard incomplete rows.
y_nov = (
    pd.read_csv('DATA/youtubenov2022.csv')
    .assign(**{'Month': 'November'})
    .rename(nov_column_names, axis='columns')
    .drop(columns="s.no")
    .dropna(axis='index', how='any')
)
y_nov.head(1)

In [None]:
# Read the December ranking once (the file was previously parsed twice, with
# the first result overwritten immediately), tag each row with its month,
# rename the headers to the shared column names, remove the serial-number
# column and discard every row that has a missing value in any column.
y_dec = (
    pd.read_csv('DATA/YOUTUBEDEC2022.csv')
    .assign(Month='December')
    .rename(columns={
        "Youtube channel": "username",
        "youtuber name":"name",
        "Category": "category1",
        "Followers": "subs",
        "Country": "country",
        # The views header starts with a newline character in the file.
        "\nViews (Avg.)": "views",
        "Likes (Avg.)": "likes",
        "Comments (Avg.)": "comments",
        "Category-2": "category2",
        "Month": "month"
    })
    .drop("s.no", axis=1)
    .dropna()
)
y_dec.head(1)

In [None]:
numeric_text_cols = ['subs', 'views', 'likes', 'comments']

def delete_rows_with_value(df, value):
    """Return a copy of ``df`` in which every cell equal to ``value`` is NaN.

    Despite the name, no rows are removed: matching cells are only blanked so
    that later NaN-aware steps can skip them. The input frame is left
    untouched, which keeps the calling cell safe to re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    value : scalar
        Placeholder to treat as missing (for example ``"N/A'"``).

    Returns
    -------
    pandas.DataFrame
        New frame with the placeholder replaced by ``numpy.nan``.
    """
    return df.replace(value, np.nan)

# Replace the "N/A'" placeholder string in the June frame with NaN.
y_june = y_june.pipe(delete_rows_with_value, "N/A'")
y_june.head(n=1)

In [None]:
def cientific_int(column: pd.Series) -> pd.Series:
    """Convert abbreviated counts such as ``'1.5M'`` or ``'750'`` to integers.

    Each value must be a plain number optionally followed by one of the
    suffixes ``K`` (thousand), ``M`` (million) or ``G`` (billion). Values are
    parsed with a strict pattern instead of being evaluated as expressions,
    so arbitrary text coming from the CSV is never executed.

    Parameters
    ----------
    column : pandas.Series
        Text (or already numeric) values. Missing values are allowed.

    Returns
    -------
    pandas.Series
        Integer counts. Missing inputs and zeros come back as NaN, in which
        case the series is float64; otherwise it is int64.

    Raises
    ------
    ValueError
        If any non-missing value is not a number with an optional K/M/G
        suffix.
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'G': 1e9}
    pattern = (
        r'^(?P<number>[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
        r'\s*(?P<suffix>[KMG]?)$'
    )

    # Missing values are parsed as zero and turned back into NaN at the end.
    missing = column.isna()
    text = column.astype(str).str.strip().mask(missing, '0')
    parts = text.str.extract(pattern)

    unparsed = parts['number'].isna()
    if unparsed.any():
        examples = text[unparsed].unique()[:5].tolist()
        raise ValueError(f'cannot parse numeric text values: {examples}')

    scale = (
        parts['suffix']
        .map(lambda suffix: multipliers.get(suffix, 1.0))
        .astype(float)
    )
    values = parts['number'].astype(float) * scale

    # Round before casting: products such as 1.001 * 1e3 land just below the
    # intended integer in floating point and would otherwise be truncated.
    return values.round().astype('int64').replace(0, np.nan)

# Convert the abbreviated text counts of every monthly frame to numbers.
# The columns are assigned with df[cols] = ... rather than df.loc[:, cols] = ...
# because, since pandas 2.0, the .loc form writes into the existing object
# columns in place and leaves their dtype as object instead of numeric.
for monthly_frame in (y_june, y_sep, y_nov, y_dec):
    monthly_frame[numeric_text_cols] = (
        monthly_frame[numeric_text_cols].apply(cientific_int)
    )

In [None]:
# Stack the four monthly frames into one table with a fresh 0..n-1 index,
# then remove rows that are exact duplicates of an earlier row.
monthly_frames = [y_june, y_sep, y_nov, y_dec]
y_months = pd.concat(monthly_frames, ignore_index=True).drop_duplicates()

y_months

# Análisis

## 1. Número total de cuentas distintas

In [None]:
# Number of distinct usernames across all months.
len(y_months['username'].unique())

## 2. Promedio de seguidores, views, likes y comments

In [None]:
# Average each metric per account over the months in which it appears.
# The "mean" aggregation skips NaN by default.
mean_per_metric = dict.fromkeys(['subs', 'views', 'likes', 'comments'], 'mean')
per_account = y_months.groupby('username').agg(mean_per_metric)

# Bring username back as a column and list the largest accounts first.
y_means = per_account.reset_index().sort_values(by='subs', ascending=False)

In [None]:
# Display the per-account averages.
y_means

## 3. Métricas de valor de cada cuenta

### Métricas de valor

In [None]:
# Express each engagement metric relative to the account's subscriber count,
# adding the columns views_ratio, likes_ratio and comments_ratio.
for metric in ('views', 'likes', 'comments'):
    y_means[f'{metric}_ratio'] = y_means[metric] / y_means['subs']

In [None]:
# Display y_means again, now including the ratio columns.
y_means

## 4. Histograma del número de cuentas que inicien por cada letra del alfabeto

### Gráficas

In [None]:
def get_inicial(name: str) -> str:
    """Return the upper-cased first character of ``name`` if it is an ASCII
    letter, otherwise the label ``'other'``.

    The argument is converted with ``str()`` first, so non-string values are
    classified by their string representation.
    """
    text = str(name)
    if re.search(r'^[a-zA-Z]', text) is None:
        return 'other'
    return text[0].upper()

# Count accounts per initial. Each account is taken once (first row per
# username) so that channels listed in several months are not counted
# several times.
initials = (
    y_months
    .drop_duplicates(subset='username')['name']
    .map(get_inicial)
    .value_counts()
    # Name the index explicitly so the resulting column is called "letter"
    # regardless of how the installed pandas version labels value_counts().
    .rename_axis('letter')
    .reset_index(name='frequency')
    .sort_values('letter')
)

fig, ax = plt.subplots()
ax.bar(initials['letter'], initials['frequency'])
ax.set_xlabel('Initial')
ax.set_ylabel('Number of accounts')
ax.set_title('Accounts by initial letter')
plt.show()

## 5. ¿Cuántas cuentas tienen, ya sea en su username o en su name, las letras de las iniciales de cada uno de los integrantes?

In [None]:
# Count distinct accounts whose username OR name contains one of the team
# members' initials. Both columns are checked (the name column was previously
# tested twice), each account is counted once, and non-text values count as
# "no match" instead of propagating NaN.
initials_pattern = r'[GgIiMm]'
accounts = y_months.drop_duplicates(subset='username')
(
    accounts['username'].str.contains(initials_pattern, na=False)
    | accounts['name'].str.contains(initials_pattern, na=False)
).sum()

## 6. Defina una métrica que identifique las 10 cuentas más importantes de youtube

In [None]:
# Importance metric: views per subscriber, weighted by the account's size
# relative to the largest account.
# Series.max() ignores NaN; the builtin max() returns NaN whenever the first
# element happens to be NaN.
largest_subs = y_means['subs'].max()

y_means = (
    y_means
    .assign(
        engagement_size=(y_means['views'] / y_means['subs'])
        * (y_means['subs'] / largest_subs)
    )
    # Rank by the metric itself rather than by raw views.
    .sort_values('engagement_size', ascending=False)
)
y_means[['username', 'views', 'engagement_size']].head(10)

## 7. Grafique un histograma del número de seguidores por país

In [None]:
# List the columns currently available in y_means.
y_means.columns

In [None]:
# One (username, country) pair per account, so that accounts listed in
# several months are not weighted several times in their country's average.
account_countries = y_months[['username', 'country']].drop_duplicates()

# Mean of the per-account average subscriber count, by country.
y_country_means = (
    y_means[['username', 'subs']]
    .merge(account_countries, on='username', how='left')
    .groupby('country', as_index=False)['subs']
    .mean()
    .rename(columns={'subs': 'mean_subs'})
    .sort_values('mean_subs', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(y_country_means['country'], y_country_means['mean_subs'])
ax.tick_params(axis='x', labelrotation=90)  # keep the country labels readable
ax.set_xlabel('Country')
ax.set_ylabel('Mean Subscribers')
ax.set_title('Mean Subscribers by Country')

plt.show()

## 8. Crea una serie con todas las categorías existentes en todos los archivos de youtube

In [None]:
# Gather the values of every category column into one series of distinct
# names. The columns are stacked instead of being concatenated as text, so
# no category is lost or glued to another one, and names containing
# characters such as "&" or "/" are kept intact.
category_columns = [
    col for col in y_months.columns
    if str(col).lower().startswith('category')
]

all_category_values = pd.concat(
    [y_months[col] for col in category_columns],
    ignore_index=True,
).dropna()

y_categories = (
    all_category_values
    .astype(str)
    .str.strip()
    .loc[lambda values: values != '']  # ignore blank cells
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .rename('category')
)
y_categories

## 9. Crea una serie con todos los países existentes en todos los archivos de youtube

In [None]:
# Keep the first occurrence of every non-missing country, preserving the
# original row labels.
country_column = y_months['country']
is_first_occurrence = ~country_column.duplicated()
all_countries = country_column[country_column.notna() & is_first_occurrence]
all_countries