# Importación de librerías

In [4]:
import pandas as pd
import numpy as np
import re

# Data Viz
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Display configuration: show at most 20 columns of a frame and render every
# float with exactly four decimal places.
pd.set_option('display.max_columns', 20)
pd.set_option('display.float_format', '{:.4f}'.format)

# Funciones útiles

In [88]:
def freq_discrete(df, features):
    """Print and display a frequency table for each requested column.

    For every feature the table holds, per distinct value (missing values are
    counted as their own row), the absolute frequency, the relative
    frequency, and the running totals of both. All four columns are rendered
    as formatted strings (thousands separators / percentages).

    Args:
        df: DataFrame that contains the columns listed in ``features``.
        features: Iterable of column labels to tabulate.

    Returns:
        None. Each table is shown with ``display`` (the IPython built-in
        available inside a notebook).
    """
    formats = (
        ("Absolute frequency", "{:,.0f}"),
        ("Relative frequency", "{:,.2%}"),
        ("Accumulated frequency", "{:,.0f}"),
        ("Accumulated %", "{:,.2%}"),
    )
    for feature in features:
        print(f"Feature: {feature}")
        # Name the Series explicitly instead of renaming the auto-generated
        # "count"/"proportion" columns: those names only exist from pandas 2.0
        # on, and with older versions the rename silently did nothing.
        counts = df[feature].value_counts(dropna=False).rename("Absolute frequency")
        freq = counts.to_frame()
        # Same numbers as value_counts(normalize=True), without a second pass
        # over the data and without joining two frames on the index.
        freq["Relative frequency"] = counts / counts.sum()
        freq["Accumulated frequency"] = freq["Absolute frequency"].cumsum()
        freq["Accumulated %"] = freq["Relative frequency"].cumsum()
        # Format only after the cumulative sums are computed: from here on the
        # columns hold display strings, not numbers.
        for column, fmt in formats:
            freq[column] = freq[column].map(fmt.format)
        display(freq)

In [5]:
def make_specs( rows : int, cols : int, n : int) -> list:
    """Build the ``specs`` grid that ``make_subplots`` expects for ``n`` figures.

    Cells are numbered row by row starting at zero. The first ``n`` cells get
    an empty dict (a default subplot) and every remaining cell gets ``None``
    (no subplot), which is what lets a grid hold a prime or otherwise awkward
    number of figures.

    Args:
        rows (int): Number of rows for the grid
        cols (int): Number of columns for the grid
        n (int): Total number of figures

    Returns:
        list: ``rows`` lists of ``cols`` entries each, every entry being either
        ``{}`` or ``None``.
    """
    # Strict `<`: cell indices are zero-based, so cell number `n` is already
    # the first unused one (`<=` produced one stray empty subplot).
    return [
        [{} if r * cols + c < n else None for c in range(cols)]
        for r in range(rows)
    ]

In [6]:
def hist_matrix(df : pd.DataFrame, columns : list = None, rows : int = None, cols : int = None) -> None:
    """Draw one histogram per column on a grid of subplots and show the figure.

    Args:
        df: Frame that holds the data to plot.
        columns: Column labels to plot; defaults to every column of ``df``.
        rows: Number of grid rows; defaults to ``int(sqrt(n))`` where ``n`` is
            the number of plotted columns.
        cols: Number of grid columns; defaults to the smallest count that fits
            all plots in ``rows`` rows. It is also recomputed that way when
            ``rows * cols`` is too small to hold every plot.

    Raises:
        ValueError: If there is no column to plot.
    """
    # `is None`, not `== None`: comparing an Index/array with `==` is
    # element-wise and makes the `if` raise on an ambiguous truth value.
    columns = list(df.columns if columns is None else columns)

    n = len(columns)
    if n == 0:
        raise ValueError("hist_matrix needs at least one column to plot")

    if rows is None:
        rows = int(n**0.5)
    if cols is None or rows*cols < n:
        # Exact integer ceiling of n / rows (the `+ 0.99` trick rounds down
        # whenever the fractional part is below 0.01).
        cols = -(-n // rows)

    specs = make_specs(rows, cols, n)
    fig1 = make_subplots(rows = rows, cols = cols,
                        specs= specs, subplot_titles= [str(c) for c in columns])

    for i, col in enumerate(columns):
        # Row-major placement; plotly grid positions are 1-based.
        row, colum = divmod(i, cols)
        fig1.add_trace(go.Histogram(x=df[col], name=col) , row=row + 1, col=colum + 1)

    fig1.update_layout(title_text='Histogramas', showlegend=False)
    fig1.show()

In [7]:
def hist_box(df : pd.DataFrame, columns : list = None, rows : int = None, cols : int = None) -> None:
    """Draw one box plot per column on a grid of subplots and show the figure.

    Args:
        df: Frame that holds the data to plot.
        columns: Column labels to plot; defaults to every column of ``df``.
        rows: Number of grid rows; defaults to ``int(sqrt(n))`` where ``n`` is
            the number of plotted columns.
        cols: Number of grid columns; defaults to the smallest count that fits
            all plots in ``rows`` rows. It is also recomputed that way when
            ``rows * cols`` is too small to hold every plot.

    Raises:
        ValueError: If there is no column to plot.
    """
    # `is None`, not `== None`: comparing an Index/array with `==` is
    # element-wise and makes the `if` raise on an ambiguous truth value.
    columns = list(df.columns if columns is None else columns)

    n = len(columns)
    if n == 0:
        raise ValueError("hist_box needs at least one column to plot")

    if rows is None:
        rows = int(n**0.5)
    if cols is None or rows*cols < n:
        # Exact integer ceiling of n / rows (the `+ 0.99` trick rounds down
        # whenever the fractional part is below 0.01).
        cols = -(-n // rows)

    specs = make_specs(rows, cols, n)
    fig1 = make_subplots(rows = rows, cols = cols,
                        specs= specs, subplot_titles= [str(c) for c in columns])

    for i, col in enumerate(columns):
        # Row-major placement; plotly grid positions are 1-based.
        row, colum = divmod(i, cols)
        fig1.add_trace(go.Box(y=df[col], name=col) , row=row + 1, col=colum + 1)

    # The figure holds box plots; the previous title ('Histogramas') was a
    # leftover from the histogram helper this function was copied from.
    fig1.update_layout(title_text='Diagramas de caja', showlegend=False)
    fig1.show()

# Lectura de datos

In [None]:
# Relative path of the folder that holds the CSV tables. Keep the trailing
# slash: file names are appended directly to this string.
ruta = '../TABLAS/'

In [33]:
# Load every CSV table found under `ruta`, one DataFrame per file. The names
# on the left line up, in order, with the file stems listed below.
(
    df_animes,
    df_anime_relation,
    df_classification,
    df_genre_anime,
    df_genre,
    df_source,
    df_types,
) = (
    pd.read_csv(f'{ruta}{stem}.csv')
    for stem in (
        'ANIME',
        'ANIME_RELATION',
        'CLASSIFICATION',
        'GENRE_ANIME',
        'GENRE',
        'SOURCE',
        'TYPES',
    )
)

Unir dataframes

In [34]:
# Preview the anime table as loaded, before the lookup tables are joined in.
df_animes

Unnamed: 0,ANIME_ID,ANIME_NAME,EPISODES,URL_IMAGE,DURATION,SCORED,SCORED_BY,RANK,POPULARITY,FAVORITES,SYNOPSIS,STATUS,START_DATE,FINISH_DATE,ID_CLASSIFICATION,ID_SOURCE,ID_TYPE
0,32281,Kimi no Na wa.,1,https://myanimelist.cdn-dena.com/images/anime/...,1 hr. 46 min.,9.1900,471398,2.0000,33,34912,"Mitsuha Miyamizu, a high school girl, yearns t...",Finished Airing,2016-08-26,2016-08-26,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,64,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.2500,733592,1.0000,4,106895,After a horrific alchemy experiment goes wrong...,Finished Airing,2009-04-05,2010-07-04,1,1,1
2,28977,Gintama°,51,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1500,71751,3.0000,383,5668,"Gintoki, Shinpachi, and Kagura return as the f...",Finished Airing,2015-04-08,2016-03-30,1,1,1
3,9253,Steins;Gate,24,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1400,563857,5.0000,8,92423,Eccentric scientist Rintarou Okabe has a never...,Finished Airing,2011-04-06,2011-09-14,0,2,1
4,9969,Gintama&#039;,51,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1100,92025,7.0000,342,4549,"After a one-year hiatus, Shinpachi Shimura ret...",Finished Airing,2011-04-04,2012-03-26,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12189,9316,Toushindai My Lover: Minami tai Mecha-Minami,1,https://myanimelist.cdn-dena.com/images/anime/...,30 min.,4.9400,68,,10297,0,A young man receives a life-size nude girl in ...,Finished Airing,2001-10-26,2001-10-26,5,9,2
12190,5543,Under World,1,https://myanimelist.cdn-dena.com/images/anime/...,1 hr. 4 min.,5.1000,62,,10456,0,Based on the cartoon by one of the greatest Ko...,Finished Airing,2000-11-18,2000-11-18,5,9,2
12191,5621,Violence Gekiga David no Hoshi,4,https://myanimelist.cdn-dena.com/images/anime/...,45 min. per ep.,5.1800,79,,10164,2,Tetsuya was the product of his mother being ra...,Finished Airing,1989-12-05,1990-06-05,5,9,2
12192,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,1,https://myanimelist.cdn-dena.com/images/anime/...,45 min.,5.3100,67,,10510,0,Based on the manga by Satou Masaaki,Finished Airing,1991-03-08,1991-03-08,5,9,2


In [35]:
# Left-join the classification lookup table on its key (every anime row is
# kept), then discard the key column, which is no longer needed.
df_animes = (
    df_animes
    .merge(df_classification, how='left', on='ID_CLASSIFICATION')
    .drop(columns='ID_CLASSIFICATION')
)

In [36]:
# Left-join the source lookup table (every anime row is kept), then discard
# the key column.
# The key in df_animes is 'ID_SOURCE' (singular), as the column list of the
# frame displayed above shows; merging on 'ID_SOURCES' raises a KeyError.
# NOTE(review): assumes SOURCE.csv names its key 'ID_SOURCE' as well — confirm.
df_animes = (
    df_animes
    .merge(df_source, on='ID_SOURCE', how='left')
    .drop(columns='ID_SOURCE')
)

In [38]:
# Left-join the type lookup table (every anime row is kept), then discard
# the key column.
# The key in df_animes is 'ID_TYPE' (singular), as the column list of the
# frame displayed above shows; merging on 'ID_TYPES' raises a KeyError.
# NOTE(review): assumes TYPES.csv names its key 'ID_TYPE' as well — confirm.
df_animes = (
    df_animes
    .merge(df_types, on='ID_TYPE', how='left')
    .drop(columns='ID_TYPE')
)

In [39]:
# Inspect the table after the three lookup joins.
df_animes

Unnamed: 0,ANIME_ID,ANIME_NAME,EPISODES,URL_IMAGE,DURATION,SCORED,SCORED_BY,RANK,POPULARITY,FAVORITES,SYNOPSIS,STATUS,START_DATE,FINISH_DATE,CLASSIFICATION,SOURCE,TYPE
0,32281,Kimi no Na wa.,1,https://myanimelist.cdn-dena.com/images/anime/...,1 hr. 46 min.,9.1900,471398,2.0000,33,34912,"Mitsuha Miyamizu, a high school girl, yearns t...",Finished Airing,2016-08-26,2016-08-26,PG-13 - Teens 13 or older,Original,Movie
1,5114,Fullmetal Alchemist: Brotherhood,64,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.2500,733592,1.0000,4,106895,After a horrific alchemy experiment goes wrong...,Finished Airing,2009-04-05,2010-07-04,R - 17+ (violence & profanity),Manga,TV
2,28977,Gintama°,51,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1500,71751,3.0000,383,5668,"Gintoki, Shinpachi, and Kagura return as the f...",Finished Airing,2015-04-08,2016-03-30,R - 17+ (violence & profanity),Manga,TV
3,9253,Steins;Gate,24,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1400,563857,5.0000,8,92423,Eccentric scientist Rintarou Okabe has a never...,Finished Airing,2011-04-06,2011-09-14,PG-13 - Teens 13 or older,Visual novel,TV
4,9969,Gintama&#039;,51,https://myanimelist.cdn-dena.com/images/anime/...,24 min. per ep.,9.1100,92025,7.0000,342,4549,"After a one-year hiatus, Shinpachi Shimura ret...",Finished Airing,2011-04-04,2012-03-26,PG-13 - Teens 13 or older,Manga,TV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12189,9316,Toushindai My Lover: Minami tai Mecha-Minami,1,https://myanimelist.cdn-dena.com/images/anime/...,30 min.,4.9400,68,,10297,0,A young man receives a life-size nude girl in ...,Finished Airing,2001-10-26,2001-10-26,Rx - Hentai,Unknown,OVA
12190,5543,Under World,1,https://myanimelist.cdn-dena.com/images/anime/...,1 hr. 4 min.,5.1000,62,,10456,0,Based on the cartoon by one of the greatest Ko...,Finished Airing,2000-11-18,2000-11-18,Rx - Hentai,Unknown,OVA
12191,5621,Violence Gekiga David no Hoshi,4,https://myanimelist.cdn-dena.com/images/anime/...,45 min. per ep.,5.1800,79,,10164,2,Tetsuya was the product of his mother being ra...,Finished Airing,1989-12-05,1990-06-05,Rx - Hentai,Unknown,OVA
12192,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,1,https://myanimelist.cdn-dena.com/images/anime/...,45 min.,5.3100,67,,10510,0,Based on the manga by Satou Masaaki,Finished Airing,1991-03-08,1991-03-08,Rx - Hentai,Unknown,OVA


# EDA

## Animes

### Clasificación

In [40]:
# List the dtype pandas assigned to each column.
df_animes.dtypes

ANIME_ID            int64
ANIME_NAME         object
EPISODES           object
URL_IMAGE          object
DURATION           object
SCORED            float64
SCORED_BY           int64
RANK              float64
POPULARITY          int64
FAVORITES           int64
SYNOPSIS           object
STATUS             object
START_DATE         object
FINISH_DATE        object
CLASSIFICATION     object
SOURCE             object
TYPE               object
dtype: object

In [44]:
# Columns that are parsed into datetimes in the next cell.
dates = ['START_DATE', 'FINISH_DATE']

In [45]:
# Parse every column listed in `dates` into datetime values, column by column.
df_animes[dates] = df_animes[dates].apply(pd.to_datetime)

In [49]:
def to_numeric(x: str) -> float:
    """Convert a value to ``float``, mapping anything unparseable to NaN.

    Args:
        x: Value to convert; typically a string, but anything ``float()``
            accepts works.

    Returns:
        ``float(x)`` when the conversion succeeds, ``numpy.nan`` otherwise.
    """
    try:
        return float(x)
    # Catch only what float() raises for bad input. A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [50]:
# Turn the episode counts into floats element by element; entries that cannot
# be parsed become NaN (see to_numeric).
df_animes['EPISODES'] = df_animes['EPISODES'].map(to_numeric)

In [51]:
# Names of the columns stored as int64 or float64, in column order.
var_num_animes = [
    name
    for name, dtype in df_animes.dtypes.items()
    if dtype in ('int64', 'float64')
]

In [52]:
# Names of the columns stored with the generic `object` dtype, in column order.
var_disc_animes = [
    name
    for name, dtype in df_animes.dtypes.items()
    if dtype == 'object'
]

### Discreta

In [54]:
# Number of distinct values in each object-typed column.
df_animes[var_disc_animes].nunique()

ANIME_NAME        12193
URL_IMAGE         12124
SYNOPSIS          11689
STATUS                3
CLASSIFICATION        6
SOURCE               16
TYPE                  6
dtype: int64

In [55]:
# Low-cardinality ("catalogue") columns: those with fewer than 20 distinct
# values, kept in column order.
var_catalogos_animes = df_animes.columns[
    (df_animes.nunique() < 20).to_numpy()
].tolist()

#### Gráficos de barras

In [57]:
def others(df1, col, threshold=0.05, label='Otros'):
    """Return a copy of ``df1`` with the rare categories of ``col`` pooled.

    A category is rare when its share of the non-missing values of ``col`` is
    below ``threshold``. Rows that belong to a rare category get ``label`` in
    ``col``; every other row, including rows where ``col`` is missing, is left
    as it was. When exactly one category is rare the copy is returned
    untouched, since pooling a single category would merely rename it.

    Args:
        df1: Input DataFrame; it is not modified.
        col: Label of the categorical column to process.
        threshold: Minimum relative frequency a category needs to keep its
            own name.
        label: Replacement value written for the pooled categories.

    Returns:
        A new DataFrame with the same rows and columns as ``df1``.
    """
    df1 = df1.copy()

    is_rare = df1[col].value_counts(normalize=True) < threshold
    if is_rare.sum() == 1:
        return df1
    rare_labels = is_rare[is_rare].index
    # isin() is simply False for NaN, which value_counts() leaves out of its
    # index; the previous per-row lookup raised KeyError on those rows.
    df1[col] = df1[col].where(~df1[col].isin(rare_labels), label)
    return df1

In [68]:
# Replace missing classifications with the literal label 'Unknown'.
df_animes['CLASSIFICATION'] = df_animes['CLASSIFICATION'].fillna('Unknown')

In [76]:
# Helper column holding the constant 1 on every row.
df_animes['AUX'] = 1

In [81]:
# One bar chart per catalogue column, with the bars coloured by category and
# the rare categories pooled by `others`.
for col in var_catalogos_animes:
    print(col)
    display(
        px.histogram(
            data_frame=others(df_animes, col),
            x=col,
            color=col,
            title=col,
        )
    )

STATUS


CLASSIFICATION


SOURCE


TYPE


#### Gráficos de Pie

In [83]:
# One pie chart per catalogue column, with the rare categories pooled by
# `others`.
for col in var_catalogos_animes:
    print(col)
    # Weight every row by the constant AUX column (1 per row) so each slice is
    # the number of animes in the category. Using ANIME_ID as the value summed
    # the identifiers, which says nothing about frequency.
    display(px.pie(others(df_animes, col), names=col, values='AUX', title=col))

STATUS


CLASSIFICATION


SOURCE


TYPE


#### Tabla de frecuencias

In [89]:
# Frequency table (absolute, relative and accumulated) for each catalogue column.
freq_discrete(df_animes, var_catalogos_animes)

Feature: STATUS


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
STATUS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finished Airing,12082,99.08%,12082,99.08%
Currently Airing,90,0.74%,12172,99.82%
Not yet aired,22,0.18%,12194,100.00%


Feature: CLASSIFICATION


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
CLASSIFICATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PG-13 - Teens 13 or older,4444,36.44%,4444,36.44%
G - All Ages,3422,28.06%,7866,64.51%
Rx - Hentai,1141,9.36%,9007,73.86%
PG - Children,1133,9.29%,10140,83.16%
R - 17+ (violence & profanity),895,7.34%,11035,90.50%
R+ - Mild Nudity,838,6.87%,11873,97.37%
Unknown,321,2.63%,12194,100.00%


Feature: SOURCE


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
SOURCE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unknown,4088,33.52%,4088,33.52%
Manga,2742,22.49%,6830,56.01%
Original,2401,19.69%,9231,75.70%
Visual novel,826,6.77%,10057,82.47%
Light novel,487,3.99%,10544,86.47%
Game,460,3.77%,11004,90.24%
Novel,272,2.23%,11276,92.47%
Other,259,2.12%,11535,94.60%
Music,202,1.66%,11737,96.25%
4-koma manga,186,1.53%,11923,97.78%


Feature: TYPE


Unnamed: 0_level_0,Absolute frequency,Relative frequency,Accumulated frequency,Accumulated %
TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TV,3778,30.98%,3778,30.98%
OVA,3292,27.00%,7070,57.98%
Movie,2327,19.08%,9397,77.06%
Special,1640,13.45%,11037,90.51%
ONA,653,5.36%,11690,95.87%
Music,484,3.97%,12174,99.84%
,20,0.16%,12194,100.00%


### Continuo

#### Histogramas

In [91]:
# Grid of histograms, one per numeric column.
hist_matrix(df_animes, var_num_animes)

#### Cajas

In [92]:
# Grid of box plots, one per numeric column.
hist_box(df_animes, var_num_animes)

#### Estadísticos

In [93]:
# Summary statistics of the table's columns.
df_animes.describe()

Unnamed: 0,ANIME_ID,EPISODES,SCORED,SCORED_BY,RANK,POPULARITY,FAVORITES,START_DATE,FINISH_DATE,AUX
count,12194.0,11867.0,12194.0,12194.0,11000.0,12194.0,12194.0,10989,10881,12194.0
mean,14000.5555,12.4408,6.4015,13198.5178,6049.428,6739.6311,361.379,2003-07-03 16:47:26.191646208,2003-11-03 04:08:55.980148864,1.0
min,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1917-04-28 00:00:00,1917-04-28 00:00:00,1.0
25%,3467.25,1.0,5.76,90.0,2951.75,3283.0,0.0,1997-02-15 00:00:00,1997-07-12 00:00:00,1.0
50%,10214.5,2.0,6.48,774.0,5927.5,6708.5,3.0,2007-04-03 00:00:00,2007-07-22 00:00:00,1.0
75%,24680.5,12.0,7.13,5466.25,8985.25,10161.75,30.0,2013-01-06 00:00:00,2013-04-01 00:00:00,1.0
max,34527.0,1818.0,9.25,1009477.0,12867.0,14116.0,106895.0,2018-12-11 00:00:00,2019-08-16 00:00:00,1.0
std,11437.6719,47.0154,1.0364,46540.3215,3604.6961,3959.6568,2843.0535,,,0.0


### Multivariado

#### Mapas de calor

In [95]:
# Heat map of the pairwise correlation matrix of the numeric columns.
px.imshow(df_animes[var_num_animes].corr())

In [100]:
# Scatter-plot matrix of the numeric columns, coloured by POPULARITY. The
# slice [1:] leaves out the first numeric column (ANIME_ID in the dtypes
# listing above).
px.scatter_matrix(df_animes, var_num_animes[1:], color='POPULARITY')

In [131]:
# Number of animes per start date (the ANIME_ID column holds that count),
# then a histogram of those counts over time.
df_animes_fecha_inicio = (
    df_animes.groupby('START_DATE')['ANIME_ID']
    .count()
    .reset_index()
)
# y + histfunc='sum' makes every bar add up the per-date counts. With only
# `x`, each distinct date counted once and the aggregation above was ignored.
px.histogram(df_animes_fecha_inicio, x='START_DATE', y='ANIME_ID', histfunc='sum')

In [134]:
# Number of animes per finish date (the ANIME_ID column holds that count),
# then a histogram of those counts over time.
df_animes_fecha_final = (
    df_animes.groupby('FINISH_DATE')['ANIME_ID']
    .count()
    .reset_index()
)
# y + histfunc='sum' makes every bar add up the per-date counts. With only
# `x`, each distinct date counted once and the aggregation above was ignored.
px.histogram(df_animes_fecha_final, x='FINISH_DATE', y='ANIME_ID', histfunc='sum')