# Liberías

In [1]:
import pandas as pd
import zipfile
from bs4 import BeautifulSoup as bs
import requests as req

In [2]:
zip_file_path = '../data/archive.zip'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    netflix = pd.read_csv(zip_ref.extract('Netflix Userbase.csv'))

%rm "Netflix Userbase.csv"

In [3]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   User ID            2500 non-null   int64 
 1   Subscription Type  2500 non-null   object
 2   Monthly Revenue    2500 non-null   int64 
 3   Join Date          2500 non-null   object
 4   Last Payment Date  2500 non-null   object
 5   Country            2500 non-null   object
 6   Age                2500 non-null   int64 
 7   Gender             2500 non-null   object
 8   Device             2500 non-null   object
 9   Plan Duration      2500 non-null   object
dtypes: int64(3), object(7)
memory usage: 195.4+ KB


In [4]:
netflix.head()

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device,Plan Duration
0,1,Basic,10,15-01-22,10-06-23,United States,28,Male,Smartphone,1 Month
1,2,Premium,15,05-09-21,22-06-23,Canada,35,Female,Tablet,1 Month
2,3,Standard,12,28-02-23,27-06-23,United Kingdom,42,Male,Smart TV,1 Month
3,4,Standard,12,10-07-22,26-06-23,Australia,51,Female,Laptop,1 Month
4,5,Basic,10,01-05-23,28-06-23,Germany,33,Male,Smartphone,1 Month


In [5]:
# Nombres columnas:

netflix.columns = [i.lower().strip().replace(' ','_') for i in netflix.columns]

netflix.head()

Unnamed: 0,user_id,subscription_type,monthly_revenue,join_date,last_payment_date,country,age,gender,device,plan_duration
0,1,Basic,10,15-01-22,10-06-23,United States,28,Male,Smartphone,1 Month
1,2,Premium,15,05-09-21,22-06-23,Canada,35,Female,Tablet,1 Month
2,3,Standard,12,28-02-23,27-06-23,United Kingdom,42,Male,Smart TV,1 Month
3,4,Standard,12,10-07-22,26-06-23,Australia,51,Female,Laptop,1 Month
4,5,Basic,10,01-05-23,28-06-23,Germany,33,Male,Smartphone,1 Month


In [6]:
# Cambiamos el formato de la fecha para ordenarla al revés:

netflix.join_date = [pd.to_datetime(i) for i in netflix.join_date]
netflix.last_payment_date = [pd.to_datetime(i) for i in netflix.last_payment_date]

netflix.head()

Unnamed: 0,user_id,subscription_type,monthly_revenue,join_date,last_payment_date,country,age,gender,device,plan_duration
0,1,Basic,10,2022-01-15,2023-10-06,United States,28,Male,Smartphone,1 Month
1,2,Premium,15,2021-05-09,2023-06-22,Canada,35,Female,Tablet,1 Month
2,3,Standard,12,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,1 Month
3,4,Standard,12,2022-10-07,2023-06-26,Australia,51,Female,Laptop,1 Month
4,5,Basic,10,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,1 Month


In [7]:
netflix.plan_duration.unique() # Esta columna sobra, el resto están bien

array(['1 Month'], dtype=object)

In [8]:
netflix.drop(columns = 'plan_duration', inplace = True)

netflix.head()

Unnamed: 0,user_id,subscription_type,monthly_revenue,join_date,last_payment_date,country,age,gender,device
0,1,Basic,10,2022-01-15,2023-10-06,United States,28,Male,Smartphone
1,2,Premium,15,2021-05-09,2023-06-22,Canada,35,Female,Tablet
2,3,Standard,12,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV
3,4,Standard,12,2022-10-07,2023-06-26,Australia,51,Female,Laptop
4,5,Basic,10,2023-01-05,2023-06-28,Germany,33,Male,Smartphone


# Puliendo la information: precio de cada suscripción en dólares estadounidenses

La columna de ingresos debería contener la tarifa de suscripción de cada tipo de suscripción. Sin embargo, estos números no tienen unidades y no son muy precisos, por lo que los cambiaré por su precio real en 2023.

In [9]:
netflix.subscription_type.unique() # Hay sólo tres tipos

array(['Basic', 'Premium', 'Standard'], dtype=object)

In [10]:
url = 'https://beebom.com/how-much-netflix-costs-each-country-worldwide/'

soup = bs(req.get(url).content,'html.parser')

rows = soup.find_all('table')[7].find_all('tr')

rows[1]

<tr><td>Afghanistan</td><td class="has-text-align-center" data-align="center"></td><td class="has-text-align-center" data-align="center">$3.99</td><td class="has-text-align-center" data-align="center">$7.99</td><td class="has-text-align-center" data-align="center">$9.99</td></tr>

In [11]:
table = []

for i in rows:
    row = []
    for j in i:
        row.append(j.text)
        
    table.append(row)

table = pd.DataFrame(table)

table.head()

Unnamed: 0,0,1,2,3,4
0,Country,Mobile,Basic,Standard,Premium
1,Afghanistan,,$3.99,$7.99,$9.99
2,Åland Islands,,7.99€,11.99€,15.99€
3,Albania,,4.99€,7.99€,9.99€
4,Algeria,,$3.99,$7.99,$9.99


In [12]:
col_names = list(table.loc[0])

col_names

['Country', 'Mobile', 'Basic', 'Standard', 'Premium']

In [13]:
table.drop(index = 0, inplace = True)
table.reset_index(drop = True, inplace = True)
table.columns = col_names

table.head()

Unnamed: 0,Country,Mobile,Basic,Standard,Premium
0,Afghanistan,,$3.99,$7.99,$9.99
1,Åland Islands,,7.99€,11.99€,15.99€
2,Albania,,4.99€,7.99€,9.99€
3,Algeria,,$3.99,$7.99,$9.99
4,American Samoa,,$9.99,$15.49,$19.99


In [14]:
# Pasemos todo a dólares. Veamos qué símbolos tenemos:

table.Premium.unique()

array(['$9.99', '15.99€', '9.99€', '$19.99', '17.99€', '$15.99',
       '1899 ARS', '11.99€', 'AU$22.99', '$15.79', '$18.79', '$7.99',
       '$11.99', '159kr', 'R$55,90', '$13.99', '20.99 CAD', '$10,700 CLP',
       '38,900 COP', 'NZ$24.99', '13.99€', '319,00 Kč', '149kr', '$10.99',
       '165 EGP', '129kr', '£ 15.99', '93 HKD', '4490 Ft', '649 INR',
       '186,000 IDR', '20.99€', '60.90 ILS', '1,980 JPY', '1,100 KES',
       '$14.99', '24,90 CHF', '55 MYR', '$299 MXN', '95 MAD', '4,400 NGN',
       'Rs 1,100', '$12.99', 'PEN 44.90', '549 PHP', '60 PLN', '61 SAR',
       '21.98 SGD', '199 ZAR', '17,000 KRW', '179 kr', '390 TWD',
       '419 THB', '130.99 TRY', '\xa0$9.99', '56 AED', '260,000 VND'],
      dtype=object)

In [15]:
# Sacado de Bard (actualizado a fecha de 11 agosto de 2023):

dictio = {
    '$': 1.0,
    '€': 1.099,
    'ARS': 0.00344,
    'NZ$': 0.602,
    'CAD': 0.786,
    'CLP': 0.00146,
    'COP': 0.000255,
    'AU$': 0.731,
    'kr': 0.117,  # Assuming SEK, NOK, DKK, and ISK
    'R$': 0.189,
    'EGP': 0.0633,
    '£': 1.372,
    'HKD': 0.128,
    'Ft': 0.00304,
    'INR': 0.0134,
    'IDR': 0.000069,
    'ILS': 0.302,
    'JPY': 0.00906,
    'KES': 0.0090,
    'CHF': 1.09,
    'MYR': 0.234,
    'MXN': 0.0496,
    'MAD': 0.111,
    'NGN': 0.00241,
    'PEN': 0.259,
    'PHP': 0.0195,
    'PLN': 0.262,
    'SAR': 0.265,
    'SGD': 0.728,
    'ZAR': 0.0671,
    'KRW': 0.000079,
    'TWD': 0.0356,
    'THB': 0.0299,
    'TRY': 0.115,
    'AED': 0.271,
    'VND': 0.000042,
    'CA$': 0.785
}

In [16]:
def cambio_moneda(x):
    
    res = 0
    a = x.replace(',','.').replace('Rs','$').replace('\xa0$','$')
    for i in dictio.keys():
        if i in a:
            try:
                res = round(float(a.replace(i,'').replace('$','').strip())*dictio[i],2)
            except:
                continue
    return res

In [17]:
# Cambiemos las columnas de precio:

tochange = list(table.columns)[1:]

tochange

['Mobile', 'Basic', 'Standard', 'Premium']

In [18]:
for i in tochange:
    table[i] = table[i].apply(cambio_moneda)
    
table

Unnamed: 0,Country,Mobile,Basic,Standard,Premium
0,Afghanistan,0.00,3.99,7.99,9.99
1,Åland Islands,0.00,8.78,13.18,17.57
2,Albania,0.00,5.48,8.78,10.98
3,Algeria,0.00,3.99,7.99,9.99
4,American Samoa,0.00,9.99,15.49,19.99
...,...,...,...,...,...
239,Wallis & Futuna,2.99,3.99,7.99,9.99
240,Western Sahara,0.00,7.99,9.99,11.99
241,Yemen,0.00,3.99,7.99,9.99
242,Zambia,2.99,3.99,7.99,9.99


In [19]:
# Quitemos la columna de Mobile, que no está contemplada en la otra tabla:

table.drop(columns = 'Mobile', inplace = True)

table.head()

Unnamed: 0,Country,Basic,Standard,Premium
0,Afghanistan,3.99,7.99,9.99
1,Åland Islands,8.78,13.18,17.57
2,Albania,5.48,8.78,10.98
3,Algeria,3.99,7.99,9.99
4,American Samoa,9.99,15.49,19.99


In [20]:
countries = netflix.country.unique() # Países de nuestra tabla de Netflix

countries

array(['United States', 'Canada', 'United Kingdom', 'Australia',
       'Germany', 'France', 'Brazil', 'Mexico', 'Spain', 'Italy'],
      dtype=object)

In [21]:
exists = [i in list(table.Country) for i in countries]

exists # Están todos los países, y con el mismo nombre aparentemente.

[True, True, True, True, True, True, True, True, True, True]

In [22]:
# Añadamos la columna:

netflix['subscription_price'] = [table[table.Country == netflix.loc[i]['country']][netflix.loc[i]['subscription_type']].iloc[0] \
                                 for i in range(netflix.shape[0])]
# Nos deshacemos de la otra:

netflix.drop(columns = 'monthly_revenue', inplace = True)

netflix.head()

Unnamed: 0,user_id,subscription_type,join_date,last_payment_date,country,age,gender,device,subscription_price
0,1,Basic,2022-01-15,2023-10-06,United States,28,Male,Smartphone,9.99
1,2,Premium,2021-05-09,2023-06-22,Canada,35,Female,Tablet,16.5
2,3,Standard,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,15.08
3,4,Standard,2022-10-07,2023-06-26,Australia,51,Female,Laptop,12.42
4,5,Basic,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,8.78


# Expandiendo la información: Purchasing Power Parity (PPP)

Para poder comparar los precios de Netflix entre los países de forma relativa, sería interesante conocer el valor del PPP de cada país. Al multiplicar el precio de un bien de consumo por el valor de este parámetro, ajustamos el precio a al contexto económico concreto de cada país, de manera que podemos entender mejor si ese bien es más o menos caro teniendo en cuenta su realidad económica.

Para obtener estos datos nos descargamos un csv de la siguiente url, lo guardamos en la carpeta 'data' y lo importamos: 

https://worldpopulationreview.com/country-rankings/purchasing-power-parity-by-country

In [23]:
ppp = pd.read_csv('../data/purchasing-power-parity-by-country-2023.csv')

ppp.head()

Unnamed: 0,country,pppInt,pppUsd,pop2023
0,Afghanistan,18.46,,42239854.0
1,Albania,42.79,42.513,2832439.0
2,Algeria,41.1,,45606480.0
3,Angola,204.06,,36684202.0
4,Antigua and Barbuda,2.03,,94298.0


In [24]:
# Observamos que la columna que contiene el dato que nos interesa es la primera después de la de países. 
# Comprobamos que el PPP de USA es 1:

ppp[ppp.country == 'United States']

Unnamed: 0,country,pppInt,pppUsd,pop2023
188,United States,1.0,1.0,339996563.0


In [25]:
# También es interesante la columna de la derecha del todo, que representa la población total de cada país en 2023.
# Eliminemos la de en medio, cambiemos los nombres y filtremos la parte de la tabla que nos interesa:

ppp.drop(columns = 'pppUsd', inplace = True)
ppp.columns = ['country','PPP','population']

ppp.head()

Unnamed: 0,country,PPP,population
0,Afghanistan,18.46,42239854.0
1,Albania,42.79,2832439.0
2,Algeria,41.1,45606480.0
3,Angola,204.06,36684202.0
4,Antigua and Barbuda,2.03,94298.0


In [26]:
# Quedémonos solo con los países que nos interesan esta vez, sin liarla tanto

countries = netflix.country.unique()

In [27]:
exists = [i in list(ppp.country) for i in countries]

exists # De nuevo, están todos los países.

[True, True, True, True, True, True, True, True, True, True]

In [28]:
# Añadamos las dos nuevas columnas:

netflix['PPP'] = [ppp[ppp.country == netflix.loc[i]['country']]['PPP'].iloc[0] \
                                 for i in range(netflix.shape[0])]

netflix['population'] = [int(ppp[ppp.country == netflix.loc[i]['country']]['population'].iloc[0]) \
                                 for i in range(netflix.shape[0])]

netflix.head()

Unnamed: 0,user_id,subscription_type,join_date,last_payment_date,country,age,gender,device,subscription_price,PPP,population
0,1,Basic,2022-01-15,2023-10-06,United States,28,Male,Smartphone,9.99,1.0,339996563
1,2,Premium,2021-05-09,2023-06-22,Canada,35,Female,Tablet,16.5,1.25,38781291
2,3,Standard,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,15.08,0.69,67736802
3,4,Standard,2022-10-07,2023-06-26,Australia,51,Female,Laptop,12.42,1.44,26439111
4,5,Basic,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,8.78,0.74,83294633


In [29]:
# Y añadamos también el precio relativo, resultante de multiplicar el precio en dólares por el PPP del país
# correspondiente:

netflix['relative_PPP_price'] = round(netflix.subscription_price*netflix.PPP,2)

netflix = netflix.reindex(columns = ['user_id','join_date','last_payment_date','country',
                                     'age','gender','device','subscription_type','subscription_price',
                                     'PPP','relative_PPP_price','population'])

netflix.head()

Unnamed: 0,user_id,join_date,last_payment_date,country,age,gender,device,subscription_type,subscription_price,PPP,relative_PPP_price,population
0,1,2022-01-15,2023-10-06,United States,28,Male,Smartphone,Basic,9.99,1.0,9.99,339996563
1,2,2021-05-09,2023-06-22,Canada,35,Female,Tablet,Premium,16.5,1.25,20.62,38781291
2,3,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,Standard,15.08,0.69,10.41,67736802
3,4,2022-10-07,2023-06-26,Australia,51,Female,Laptop,Standard,12.42,1.44,17.88,26439111
4,5,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,Basic,8.78,0.74,6.5,83294633


# Guardado en csv

Ahora que ya tenemos todo listo, guardamos la tabla en un csv para que sea accesible desde Tableau o Power BI.

In [30]:
# Vamos a cambiar los puntos de los campos decimales por comas 
# y dejar esas columnas en formato string:

netflix.subscription_price = netflix.subscription_price.apply(lambda x: str(x).replace('.',','))
netflix.PPP = netflix.PPP.apply(lambda x: str(x).replace('.',','))
netflix.relative_PPP_price = netflix.relative_PPP_price.apply(lambda x: str(x).replace('.',','))


netflix.head()

Unnamed: 0,user_id,join_date,last_payment_date,country,age,gender,device,subscription_type,subscription_price,PPP,relative_PPP_price,population
0,1,2022-01-15,2023-10-06,United States,28,Male,Smartphone,Basic,999,10,999,339996563
1,2,2021-05-09,2023-06-22,Canada,35,Female,Tablet,Premium,165,125,2062,38781291
2,3,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,Standard,1508,69,1041,67736802
3,4,2022-10-07,2023-06-26,Australia,51,Female,Laptop,Standard,1242,144,1788,26439111
4,5,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,Basic,878,74,65,83294633


In [31]:
netflix.to_csv('../data/netflix_clean.csv')