In [1]:
import numpy as np
import pandas as pd
from statistics import mean

1.- Uno de los primeros pasos para realizar un buen análisis de datos es familiarizarnos
con los datos que contiene el fichero a analizar. Para ello, calcularemos los estadísticos
descriptivos elementales de las variables del fichero. Una vez cargados los datos en
nuestro programa Python (utilizando la librería Pandas), calcula los siguientes valores
para cada una de las variables:
• Número de muestras (valores distintos de missing)
• Media y desviación estándar de aquellas variables en las que tenga sentido
(numéricas)
• Valor mínimo y valor máximo de aquellas variables en las que tenga sentido
(numéricas)

In [2]:
data = pd.read_csv("C:/Users/Juanjo/Escritorio/MasterDataScience/ProgramacionOrientada/PracticaPy/Fitness_trackers.csv")
df = pd.DataFrame(data)

#Corrección comas
df.loc[:,'Selling Price'] = df.loc[:,'Selling Price'].str.replace(',','')
df.loc[:,'Original Price'] = df.loc[:,'Original Price'].str.replace(',','')
#Visualización del dataframe
df.dtypes

Brand Name                         object
Device Type                        object
Model Name                         object
Color                              object
Selling Price                      object
Original Price                     object
Display                            object
Rating (Out of 5)                 float64
Strap Material                     object
Average Battery Life (in days)      int64
Reviews                            object
dtype: object

In [3]:
#Corregimos tipos de variables.
df['Selling Price'] = df['Selling Price'].astype('float64')
df['Original Price'] = df['Original Price'].astype('float64')
df.dtypes

Brand Name                         object
Device Type                        object
Model Name                         object
Color                              object
Selling Price                     float64
Original Price                    float64
Display                            object
Rating (Out of 5)                 float64
Strap Material                     object
Average Battery Life (in days)      int64
Reviews                            object
dtype: object

In [4]:
#Numero de muestras
df.count()

Brand Name                        565
Device Type                       565
Model Name                        565
Color                             565
Selling Price                     565
Original Price                    565
Display                           565
Rating (Out of 5)                 514
Strap Material                    565
Average Battery Life (in days)    565
Reviews                            78
dtype: int64

In [5]:
#Visualizamos los NAs.
df.isna().sum()

Brand Name                          0
Device Type                         0
Model Name                          0
Color                               0
Selling Price                       0
Original Price                      0
Display                             0
Rating (Out of 5)                  51
Strap Material                      0
Average Battery Life (in days)      0
Reviews                           487
dtype: int64

In [6]:
num = pd.DataFrame(df, columns=['Selling Price','Original Price','Rating (Out of 5)','Average Battery Life (in days)'])
num.head()

Unnamed: 0,Selling Price,Original Price,Rating (Out of 5),Average Battery Life (in days)
0,2499.0,2999.0,4.1,14
1,2099.0,2499.0,4.2,14
2,1722.0,2099.0,3.5,14
3,2469.0,2999.0,4.1,14
4,1799.0,2199.0,4.3,7


In [7]:
#Media de variables numéricas
num.mean()

Selling Price                     22110.373451
Original Price                    25365.361062
Rating (Out of 5)                     4.229961
Average Battery Life (in days)        9.026549
dtype: float64

In [8]:
#STD variables numéricas
num.std()

Selling Price                     19914.926066
Original Price                    20384.028759
Rating (Out of 5)                     0.390827
Average Battery Life (in days)        7.868670
dtype: float64

2.- Hay datos que nos interesa analizar basándonos en agrupaciones, para darle un 
sentido a nuestro análisis en base a esa agrupación. Basándonos en las siguientes 
agrupaciones:
• Por tipo de dispositivo
• Por precio de venta. Estableceremos cuatro grupos en base a la media del precio 
de venta de cada tipo de dispositivo:
o Smartwatches con un precio inferior o igual a la media de precios de 
venta de estos dispositivos
o Smartwatches con un precio superior a la media de precios de venta de 
estos dispositivos
o Fitnessbands con un precio inferior o igual a la media de precios de venta 
de estos dispositivos
o Fitnessbands con un precio superior a la media de precios de venta de 
estos dispositivos
• Por marca


In [9]:
#Agrupación por Device Type
df_ordenado = df.sort_values('Device Type')
df_tipo = df_ordenado.set_index(['Device Type', 'Brand Name'])
df_tipo

Unnamed: 0_level_0,Unnamed: 1_level_0,Model Name,Color,Selling Price,Original Price,Display,Rating (Out of 5),Strap Material,Average Battery Life (in days),Reviews
Device Type,Brand Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FitnessBand,Xiaomi,Smart Band 5,Black,2499.0,2999.0,AMOLED Display,4.1,Thermoplastic polyurethane,14,
FitnessBand,Huawei,Band 2 Pro Activity,"Pink, Black",2199.0,5999.0,PMOLED Display,4.1,Silicone,7,
FitnessBand,Huawei,Band 6,"Pink, Black, Teal, Orange",4490.0,9990.0,PMOLED Display,4.4,Silicone,7,
FitnessBand,Huawei,Band 2,Black,4599.0,4599.0,PMOLED Display,4.1,Silicone,7,
FitnessBand,Huawei,Band 2,Blue,6999.0,6999.0,PMOLED Display,4.2,Silicone,7,
...,...,...,...,...,...,...,...,...,...,...
Smartwatch,FOSSIL,FB-01 Hybrid HR,Black,12396.0,14995.0,AMOLED Display,4.0,Silicone,14,
Smartwatch,FOSSIL,Gen 5 Garrett HR,Black,14995.0,22995.0,AMOLED Display,4.1,Stainless Steel,2,
Smartwatch,FOSSIL,Latitude Hybrid HR,Black,12396.0,14995.0,AMOLED Display,4.3,Stainless Steel,14,
Smartwatch,FOSSIL,Gen 5 Julianna HR,Gold,18495.0,18495.0,AMOLED Display,3.8,Stainless Steel,2,


In [10]:
#Media Fitnessband
df_fb = df_tipo.loc['FitnessBand']
media_fb = df_fb['Selling Price'].mean()
media_fb

5479.213333333333

In [11]:
#Agrego nueva columna 'Comparativa'
#df_fb.loc['Comparativa'] = df_fb.loc['Selling Price'] 
df_tipo['Comparativa'] = np.nan


In [12]:
#Media Smartwatch
df_sw = df_tipo.loc['Smartwatch']
media_sw = df_sw['Selling Price'].mean()
media_sw
sw_altos = df_sw.loc[df_sw['Selling Price'] >= media_sw]


In [13]:
#Agrupación por marca
df_ordenado = df.sort_values('Brand Name')
df_marca = df.set_index(['Brand Name', 'Device Type'])
df_marca

Unnamed: 0_level_0,Unnamed: 1_level_0,Model Name,Color,Selling Price,Original Price,Display,Rating (Out of 5),Strap Material,Average Battery Life (in days),Reviews
Brand Name,Device Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Xiaomi,FitnessBand,Smart Band 5,Black,2499.0,2999.0,AMOLED Display,4.1,Thermoplastic polyurethane,14,
Xiaomi,FitnessBand,Smart Band 4,Black,2099.0,2499.0,AMOLED Display,4.2,Thermoplastic polyurethane,14,
Xiaomi,FitnessBand,HMSH01GE,Black,1722.0,2099.0,LCD Display,3.5,Leather,14,
Xiaomi,FitnessBand,Smart Band 5,Black,2469.0,2999.0,AMOLED Display,4.1,Thermoplastic polyurethane,14,
Xiaomi,FitnessBand,Band 3,Black,1799.0,2199.0,OLED Display,4.3,Plastic,7,
...,...,...,...,...,...,...,...,...,...,...
Huawei,Smartwatch,Watch 36456,Black,55000.0,55000.0,AMOLED Display,4.1,Silicone,14,
Huawei,Smartwatch,GT Fortuna-B19S Sport,Black,13990.0,20990.0,AMOLED Display,4.1,Elastomer,14,
GOQii,FitnessBand,HR,Black,1999.0,1999.0,OLED Display,3.8,Silicone,7,
GOQii,FitnessBand,Vital,Black,3499.0,3499.0,OLED Display,3.7,Thermoplastic polyurethane,7,
