In [1]:
import pandas as pd
import numpy as np


In [2]:
## Importe de datos

headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]
datos = pd.read_csv("https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv", names = headers) 

In [3]:
## Limpieza de datos

datos = datos.replace('?', np.nan)

nor_lss_promedio = datos["normalized-losses"].astype(float).mean(axis=0)
ske_promedio = datos["stroke"].astype(float).mean(axis=0)
bre_promedio = datos["bore"].astype(float).mean(axis=0)
hs_pw_promedio = datos["horsepower"].astype(float).mean(axis=0)
pk_rpm_promedio = datos["peak-rpm"].astype(float).mean(axis=0)

datos["normalized-losses"].replace(np.nan, nor_lss_promedio, inplace = True)
datos["stroke"].replace(np.nan, ske_promedio , inplace = True)
datos["bore"].replace(np.nan, bre_promedio, inplace = True)
datos["horsepower"].replace(np.nan, hs_pw_promedio , inplace = True)
datos["peak-rpm"].replace(np.nan, pk_rpm_promedio , inplace = True)

datos["num-of-doors"].replace(np.nan, "four" , inplace = True)

datos.dropna(subset = ["price"] , axis = 0 , inplace = True)

datos[["bore", "stroke", "price", "peak-rpm"]] = datos[["bore", "stroke", "price", "peak-rpm"]].astype(float)
datos[["normalized-losses"]] = datos[["normalized-losses"]].astype(int)


datos["city-L/100km"] = 235/datos["city-mpg"]  
datos["highway-L/100km"] = 235/datos["highway-mpg"] 

datos["length"] = datos["length"]/datos["length"].max()
datos["width"] = datos["width"]/datos["width"].max()
datos["height"] = datos["height"]/datos["height"].max()

In [4]:
## Como vemos son números en su mayoría enteros, sin embargo el tipo de dato es object
## por lo que lo cambiaremos a int

datos.horsepower = datos.horsepower.astype(int)

## Separemos los datos en tres bins iguales

bins = np.linspace( datos["horsepower"].min() , datos["horsepower"].max() , 4)  ## Separamos en tres bins
nombre_grupos = ["Bajo", "Medio", "Alto"]               ## Nombres de cada bin

## Agregamos una columna adicional con los datos bineados

datos["horsepower-bineado"] = pd.cut(datos["horsepower"], bins, labels = nombre_grupos, include_lowest=True) 


In [5]:
## Vamos a separar los datos en grupos para poder estudiar la información

grupo1 = datos.groupby(["make", "num-of-doors"])  ## agrupamos por hecho y número de puertas

In [6]:
grupo1.groups.keys()  ## Vemos las formas en las que se han agrupado


dict_keys([('alfa-romero', 'two'), ('audi', 'four'), ('audi', 'two'), ('bmw', 'four'), ('bmw', 'two'), ('chevrolet', 'four'), ('chevrolet', 'two'), ('dodge', 'four'), ('dodge', 'two'), ('honda', 'four'), ('honda', 'two'), ('isuzu', 'four'), ('isuzu', 'two'), ('jaguar', 'four'), ('jaguar', 'two'), ('mazda', 'four'), ('mazda', 'two'), ('mercedes-benz', 'four'), ('mercedes-benz', 'two'), ('mercury', 'two'), ('mitsubishi', 'four'), ('mitsubishi', 'two'), ('nissan', 'four'), ('nissan', 'two'), ('peugot', 'four'), ('plymouth', 'four'), ('plymouth', 'two'), ('porsche', 'two'), ('renault', 'four'), ('renault', 'two'), ('saab', 'four'), ('saab', 'two'), ('subaru', 'four'), ('subaru', 'two'), ('toyota', 'four'), ('toyota', 'two'), ('volkswagen', 'four'), ('volkswagen', 'two'), ('volvo', 'four')])

In [7]:
## Probemos agrupando también por el precio calculando su promedio

grupo1.mean()["price"]


make           num-of-doors
alfa-romero    two             15498.333333
audi           four            18381.000000
               two             15250.000000
bmw            four            26047.000000
               two             26238.333333
chevrolet      four             6575.000000
               two              5723.000000
dodge          four             7601.800000
               two              8217.500000
honda          four             9335.000000
               two              7465.750000
isuzu          four             6785.000000
               two             11048.000000
jaguar         four            33900.000000
               two             36000.000000
mazda          four            11436.750000
               two              9956.111111
mercedes-benz  four            32108.800000
               two             36210.666667
mercury        two             16503.000000
mitsubishi     four             8434.000000
               two              9597.888889
niss

In [8]:
datos.groupby(["make", "horsepower-bineado" , "drive-wheels"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,engine-location,wheel-base,length,width,...,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,highway-L/100km
make,horsepower-bineado,drive-wheels,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
alfa-romero,Bajo,4wd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alfa-romero,Bajo,fwd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alfa-romero,Bajo,rwd,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
alfa-romero,Medio,4wd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alfa-romero,Medio,fwd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
volvo,Medio,fwd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
volvo,Medio,rwd,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
volvo,Alto,4wd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
volvo,Alto,fwd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
datos.groupby(["make", "horsepower-bineado" , "drive-wheels"]).mean()["price"]

make         horsepower-bineado  drive-wheels
alfa-romero  Bajo                4wd                 NaN
                                 fwd                 NaN
                                 rwd             14997.5
             Medio               4wd                 NaN
                                 fwd                 NaN
                                                  ...   
volvo        Medio               fwd                 NaN
                                 rwd             19475.0
             Alto                4wd                 NaN
                                 fwd                 NaN
                                 rwd                 NaN
Name: price, Length: 198, dtype: float64

In [10]:
datos.groupby(["make", "horsepower-bineado" , "drive-wheels"]).std()["stroke"]

make         horsepower-bineado  drive-wheels
alfa-romero  Bajo                4wd              NaN
                                 fwd              NaN
                                 rwd             0.00
             Medio               4wd              NaN
                                 fwd              NaN
                                                 ... 
volvo        Medio               fwd              NaN
                                 rwd             0.14
             Alto                4wd              NaN
                                 fwd              NaN
                                 rwd              NaN
Name: stroke, Length: 198, dtype: float64

In [11]:
## Trabajemos con otro DataFrame

datos2 = pd.DataFrame({"Persona": ["John", "Myla", "Lewis", "John", "Myla", "Lewis", "John", "Myla"], 
                    "Edad": [24, 55, 24, 21, 26, 54, 64, 55], "Mes":[1,1,1,3,3,3,5,5]})
datos2.head(8)

Unnamed: 0,Persona,Edad,Mes
0,John,24,1
1,Myla,55,1
2,Lewis,24,1
3,John,21,3
4,Myla,26,3
5,Lewis,54,3
6,John,64,5
7,Myla,55,5


In [12]:
datos2.dtypes

Persona    object
Edad        int64
Mes         int64
dtype: object

In [13]:
datos2[datos2.Persona.str.startswith('J')]  ## nos muestra las entradas de persona que comiencen por J

Unnamed: 0,Persona,Edad,Mes
0,John,24,1
3,John,21,3
6,John,64,5


In [14]:
datos2[datos2.Persona.str.endswith('s')]  ## nos muestra las entradas de persona que terminan por s

Unnamed: 0,Persona,Edad,Mes
2,Lewis,24,1
5,Lewis,54,3


In [15]:
datos2.groupby("Persona").min()['Edad']   ## Nos muestra la edad mínima de las personas con cada nombre

Persona
John     21
Lewis    24
Myla     26
Name: Edad, dtype: int64

In [16]:
datos2.groupby("Persona").max()['Edad']   ## Nos muestra la edad máxima de las personas con cada nombre

Persona
John     64
Lewis    54
Myla     55
Name: Edad, dtype: int64

In [17]:
datos2.groupby("Persona").mean()['Edad']   ## Nos muestra la edad promedio de las personas con cada nombre

Persona
John     36.333333
Lewis    39.000000
Myla     45.333333
Name: Edad, dtype: float64

In [18]:
datos2.groupby("Persona").std()['Mes']   ## Nos muestra la desviación estándar del mes por las personas con cada nombre

Persona
John     2.000000
Lewis    1.414214
Myla     2.000000
Name: Mes, dtype: float64

In [19]:
datos2.groupby('Persona').agg([min, max]) ## nos permite agrupar por persona y calcular el máximo y el mínimo

# del mes y la edad

Unnamed: 0_level_0,Edad,Edad,Mes,Mes
Unnamed: 0_level_1,min,max,min,max
Persona,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
John,21,64,1,5
Lewis,24,54,1,3
Myla,26,55,1,5


In [20]:
datos2.groupby("Persona").count()  ## Cuenta la cantidad de personas con el mismo nombre

Unnamed: 0_level_0,Edad,Mes
Persona,Unnamed: 1_level_1,Unnamed: 2_level_1
John,3,3
Lewis,2,2
Myla,3,3


In [21]:
## Podemos agrupar por persona y aplicar estadísticas diferentes a las demás columnas

datos2.groupby("Persona").agg({"Edad": "mean" , "Mes": "max"})


Unnamed: 0_level_0,Edad,Mes
Persona,Unnamed: 1_level_1,Unnamed: 2_level_1
John,36.333333,5
Lewis,39.0,3
Myla,45.333333,5


In [22]:
## Incluso tener varias operaciones

datos2.groupby("Persona").agg({"Edad": ["min", "max"] , "Mes": ["mean", "std" , np.sum]})

Unnamed: 0_level_0,Edad,Edad,Mes,Mes,Mes
Unnamed: 0_level_1,min,max,mean,std,sum
Persona,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
John,21,64,3.0,2.0,9
Lewis,24,54,2.0,1.414214,4
Myla,26,55,3.0,2.0,9


In [23]:
## Ahora probaremos la concatenación

## Creamos los DataFrames

raw_data_1 = {
        'subject_id': ['1', '2', '3', '4', '5'],
        'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], 
        'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}

raw_data_2 = {
        'subject_id': ['4', '5', '6', '7', '8'],
        'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], 
        'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}

raw_data_3 = {
        'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}

data1 = pd.DataFrame(raw_data_1, columns = ['subject_id', 'first_name', 'last_name'])
data2 = pd.DataFrame(raw_data_2, columns = ['subject_id', 'first_name', 'last_name'])
data3 = pd.DataFrame(raw_data_3, columns = ['subject_id','test_id'])

In [24]:
## Concatenemos por filas

datos_concatenados1 = pd.concat([data1, data2])  ## Tenemos las mismas columnas
datos_concatenados1

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [25]:
datos_concatenados2 = pd.concat([data1,data2,data3])  ## tenemos diferentes columnas
datos_concatenados2

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,
1,2,Amy,Ackerman,
2,3,Allen,Ali,
3,4,Alice,Aoni,
4,5,Ayoung,Atiches,
0,4,Billy,Bonder,
1,5,Brian,Black,
2,6,Bran,Balwner,
3,7,Bryce,Brice,
4,8,Betty,Btisan,


In [26]:
## Organicemos los índices del último DataFrame



In [27]:
## Concatenemos por columnas

datos_concatenados3 = pd.concat([data1, data2, data3], axis = 1 , keys = ["1", "2", "3"])
datos_concatenados3

Unnamed: 0_level_0,1,1,1,2,2,2,3,3
Unnamed: 0_level_1,subject_id,first_name,last_name,subject_id,first_name,last_name,subject_id,test_id
0,1.0,Alex,Anderson,4.0,Billy,Bonder,1,51
1,2.0,Amy,Ackerman,5.0,Brian,Black,2,15
2,3.0,Allen,Ali,6.0,Bran,Balwner,3,15
3,4.0,Alice,Aoni,7.0,Bryce,Brice,4,61
4,5.0,Ayoung,Atiches,8.0,Betty,Btisan,5,16
5,,,,,,,7,14
6,,,,,,,8,15
7,,,,,,,9,1
8,,,,,,,10,61
9,,,,,,,11,16


In [28]:
datos_concatenados3.shape

(10, 8)

In [29]:
datos_concatenados3.columns  ## Veamos los nombres de las columnas

MultiIndex([('1', 'subject_id'),
            ('1', 'first_name'),
            ('1',  'last_name'),
            ('2', 'subject_id'),
            ('2', 'first_name'),
            ('2',  'last_name'),
            ('3', 'subject_id'),
            ('3',    'test_id')],
           )

In [30]:
## Ahora, vamos a combinar no concatenar todos los datos y data3 a lo largo del valor de subject_id
## Como vemos todos los DataFrames tienen en común esta columna

pd.merge(datos_concatenados2 , data3 , on = 'subject_id')


Unnamed: 0,subject_id,first_name,last_name,test_id_x,test_id_y
0,1,Alex,Anderson,,51
1,1,,,51.0,51
2,2,Amy,Ackerman,,15
3,2,,,15.0,15
4,3,Allen,Ali,,15
5,3,,,15.0,15
6,4,Alice,Aoni,,61
7,4,Billy,Bonder,,61
8,4,,,61.0,61
9,5,Ayoung,Atiches,,16


In [31]:
datos_concatenados2

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,
1,2,Amy,Ackerman,
2,3,Allen,Ali,
3,4,Alice,Aoni,
4,5,Ayoung,Atiches,
0,4,Billy,Bonder,
1,5,Brian,Black,
2,6,Bran,Balwner,
3,7,Bryce,Brice,
4,8,Betty,Btisan,


In [32]:
data3

Unnamed: 0,subject_id,test_id
0,1,51
1,2,15
2,3,15
3,4,61
4,5,16
5,7,14
6,8,15
7,9,1
8,10,61
9,11,16


In [33]:
pd.merge(datos_concatenados1 , data3 , on = 'subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


In [34]:
## Hagamos una intersección entre DataFrames

pd.merge(data1, data2, on='subject_id', how='inner')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [35]:
## Hagamos los unión entre DataFrames

pd.merge(data1, data2, on='subject_id', how='outer')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan


In [36]:
(data1.columns == data2.columns).all()

True

In [37]:
(data1.columns == data3.columns).all()

ValueError: Lengths must match to compare

In [38]:
data1.shape

(5, 3)

In [39]:
data3.shape

(10, 2)

In [42]:
raw_data_4 = {
        'subject_id': ['1', '2', '3', '4', '5' , '6'],
        'test_id': [51, 15, 15, 61, 16, 21],
        'lo_que_sea': [3, 2, 4, 4, 5 , 6]}

data4 = pd.DataFrame(raw_data_4, columns = ['subject_id','test_id', 'lo_que_sea'])

In [43]:
(data1.columns == data4.columns).all()

False