# Fusión de datasets  

Importemos pandas

In [1]:
import pandas as pd

## Fusión uno a uno sobre columnas comunes 

### Creemos los dataframe para poder hacer el merge

## Dataset 1

In [2]:
%%writefile dataset/clients.csv
clientId,key,name
11,B,Ferris Q. Le
10,A,Omar Y. Fletcher
11,A,Mira N. Franklin
10,B,Buffy W. Vincent

Overwriting dataset/clients.csv


## Dataset 2

In [3]:
%%writefile dataset/bonus.csv
clientId,key,bonus
10,A,138
10,B,227
11,A,279
11,B,160

Overwriting dataset/bonus.csv


## Inicialicemos los dataframes y hagamos el merge

In [4]:
# Load the two CSV files written by the preceding cells.
df_clientes = pd.read_csv("dataset/clients.csv")
df_bonus = pd.read_csv("dataset/bonus.csv")

# No `on` argument is given, so the merge joins on the column names the two
# frames have in common; sort=True orders the result by those join keys.
merged_sorted = df_clientes.merge(df_bonus, sort=True)

# Print both inputs (ordered by key so they are easy to compare) and then
# the merged frame, with one blank line between tables.
key_cols = ["clientId", "key"]
print(
    df_clientes.sort_values(key_cols),
    df_bonus.sort_values(key_cols),
    merged_sorted,
    sep="\n\n",
)


   clientId key              name
1        10   A  Omar Y. Fletcher
3        10   B  Buffy W. Vincent
2        11   A  Mira N. Franklin
0        11   B      Ferris Q. Le

   clientId key  bonus
0        10   A    138
1        10   B    227
2        11   A    279
3        11   B    160

   clientId key              name  bonus
0        10   A  Omar Y. Fletcher    138
1        10   B  Buffy W. Vincent    227
2        11   A  Mira N. Franklin    279
3        11   B      Ferris Q. Le    160


## Fusión uno a uno con registros incompletos

creemos los datasets

In [5]:
%%writefile dataset/clients.csv
clientId,name
10,Ferris Q. Le
11,Mira N. Franklin
12,Baker C. Hurst

Overwriting dataset/clients.csv


In [6]:
%%writefile dataset/bonus.csv
clientId,bonus
10,279
11,160
20,169
21,263

Overwriting dataset/bonus.csv


hagamos el merge 

In [7]:
df_clients = pd.read_csv("dataset/clients.csv")
df_bonus = pd.read_csv("dataset/bonus.csv")

# Join explicitly on the clientId column. Only clientId values present in
# both frames appear in the result; rows without a counterpart are left out.
complete_rows = df_clients.merge(df_bonus, on="clientId")

# A "\n\n" separator leaves one blank line between the three printed tables.
print(df_clients, df_bonus, complete_rows, sep="\n\n")

   clientId              name
0        10      Ferris Q. Le
1        11  Mira N. Franklin
2        12    Baker C. Hurst

   clientId  bonus
0        10    279
1        11    160
2        20    169
3        21    263

   clientId              name  bonus
0        10      Ferris Q. Le    279
1        11  Mira N. Franklin    160


## Fusión muchos a uno 

creamos los dataset

In [8]:
%%writefile dataset/clients.csv
clientId,name
10,Ferris Q. Le
11,Mira N. Franklin
12,Baker C. Hurst

Overwriting dataset/clients.csv


In [9]:
%%writefile dataset/sales.csv
clientId, month, sales
10, jan, 1239
10, feb, 387
11, jan, 454
11, mar, 495
11, sep, 145
12, may, 4959
12, dec, 493
12, oct, 4981
12, jan, 484
15, may, 394
15, sep, 585

Overwriting dataset/sales.csv


hacemos el merge 

In [10]:
df_clients = pd.read_csv("dataset/clients.csv")
# sales.csv is written with a space after every comma. skipinitialspace=True
# strips it, so the columns are named "month" / "sales" (not " month" /
# " sales") and the month values carry no leading blank.
df_sales = pd.read_csv("dataset/sales.csv", skipinitialspace=True)

#
# df_sales repeats clientId values (several rows per client), so each sales
# row receives a copy of the matching row from the right table (df_clients).
# clientId 15 has no row in df_clients, so the default inner join drops it.
#
print(
    df_clients,
    "",
    df_sales,
    "",
    pd.merge(
        df_sales,
        df_clients,
        sort=True,
    ),
    sep="\n",
)

   clientId              name
0        10      Ferris Q. Le
1        11  Mira N. Franklin
2        12    Baker C. Hurst

    clientId  month   sales
0         10    jan    1239
1         10    feb     387
2         11    jan     454
3         11    mar     495
4         11    sep     145
5         12    may    4959
6         12    dec     493
7         12    oct    4981
8         12    jan     484
9         15    may     394
10        15    sep     585

   clientId  month   sales              name
0        10    jan    1239      Ferris Q. Le
1        10    feb     387      Ferris Q. Le
2        11    jan     454  Mira N. Franklin
3        11    mar     495  Mira N. Franklin
4        11    sep     145  Mira N. Franklin
5        12    may    4959    Baker C. Hurst
6        12    dec     493    Baker C. Hurst
7        12    oct    4981    Baker C. Hurst
8        12    jan     484    Baker C. Hurst


## Fusión muchos a muchos


creamos los dataset

In [11]:
%%writefile dataset/sales.csv
clientId,month
10,jan
10,feb
11,jan
11,mar
11,sep
12,may
12,dec
12,oct
12,jan

Overwriting dataset/sales.csv


In [12]:
%%writefile dataset/lines.csv
clientId,line
10,A
10,B
10,C
11,D
12,A
12,D
13,B
13,C
13,D

Overwriting dataset/lines.csv


hacemos el merge

In [13]:
df_sales = pd.read_csv("dataset/sales.csv")
df_lines = pd.read_csv("dataset/lines.csv")

# clientId repeats in both frames, so every sales row is paired with every
# lines row that has the same clientId.
sales_by_line = pd.merge(left=df_sales, right=df_lines)

# One blank line between the two inputs and the merged result.
print(df_sales, df_lines, sales_by_line, sep="\n\n")

   clientId month
0        10   jan
1        10   feb
2        11   jan
3        11   mar
4        11   sep
5        12   may
6        12   dec
7        12   oct
8        12   jan

   clientId line
0        10    A
1        10    B
2        10    C
3        11    D
4        12    A
5        12    D
6        13    B
7        13    C
8        13    D

    clientId month line
0         10   jan    A
1         10   jan    B
2         10   jan    C
3         10   feb    A
4         10   feb    B
5         10   feb    C
6         11   jan    D
7         11   mar    D
8         11   sep    D
9         12   may    A
10        12   may    D
11        12   dec    A
12        12   dec    D
13        12   oct    A
14        12   oct    D
15        12   jan    A
16        12   jan    D


## Parámetros left_on y right_on


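Estos parámetros sirven para indicar con qué columna de la tabla izquierda (`left_on`) y con cuál de la tabla derecha (`right_on`) se hará el merge cuando la clave tiene un nombre distinto en cada tabla.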

In [14]:
%%writefile dataset/clients.csv
clientId,name
13,Ferris Q. Le
10,Omar Y. Fletcher
12,Mira N. Franklin
11,Buffy W. Vincent

Overwriting dataset/clients.csv


In [15]:
%%writefile dataset/bonus.csv
Client-Id,bonus
10,279
11,160
12,267
13,215

Overwriting dataset/bonus.csv


hagamos el merge con las propiedades left_on y right_on

In [16]:
df_clients = pd.read_csv("dataset/clients.csv")
df_bonus = pd.read_csv("dataset/bonus.csv")

# The key column has a different name in each file, so each side names its
# own join column. Both key columns are kept in the merged frame.
merged_by_key = df_clients.merge(
    df_bonus,
    left_on="clientId",
    right_on="Client-Id",
)

print(df_clients, df_bonus, merged_by_key, sep="\n\n")

   clientId              name
0        13      Ferris Q. Le
1        10  Omar Y. Fletcher
2        12  Mira N. Franklin
3        11  Buffy W. Vincent

   Client-Id  bonus
0         10    279
1         11    160
2         12    267
3         13    215

   clientId              name  Client-Id  bonus
0        13      Ferris Q. Le         13    215
1        10  Omar Y. Fletcher         10    279
2        12  Mira N. Franklin         12    267
3        11  Buffy W. Vincent         11    160


# JOINs usando el parámetro HOW

Primero creemos los datasets a los que les haremos los diferentes JOIN

In [17]:
%%writefile dataset/clients_a.csv
clientId,name,location,amount
10,Omar Y. Fletcher,6833 Mollis. Rd.,4929
11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue",7366
12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd.",6184

Overwriting dataset/clients_a.csv


In [18]:
%%writefile dataset/clients_b.csv
clientId,name,location,bonus
11,Buffy W. Vincent,"P.O. Box 345, 8390 Ante Avenue",100
12,Mira N. Franklin,"P.O. Box 445, 323 Cursus Rd.",200
13,Lilah O. Morrison,3859 Mauris Ave,300

Overwriting dataset/clients_b.csv


Creemos los dataframes de pandas

In [19]:
df_clients_a = pd.read_csv("dataset/clients_a.csv")
df_clients_b = pd.read_csv("dataset/clients_b.csv")

# Column names that exist in one frame but not in the other.
columns_a = set(df_clients_a.columns)
columns_b = set(df_clients_b.columns)
display(
    columns_a.difference(columns_b),
    columns_b.difference(columns_a),
)

{'amount'}

{'bonus'}

## inner JOIN

In [20]:
# Inner join: keep only the rows whose shared columns match in both frames.
inner_joined = df_clients_a.merge(df_clients_b, how="inner")

# Both inputs and the join result, separated by one blank line.
print(df_clients_a, df_clients_b, inner_joined, sep="\n\n")

   clientId              name                        location  amount
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184

   clientId               name                        location  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    200
2        13  Lilah O. Morrison                 3859 Mauris Ave    300

   clientId              name                        location  amount  bonus
0        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366    100
1        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184    200


## left JOIN

In [21]:
# Left join: every row of df_clients_a is kept; columns that come only from
# df_clients_b are NaN where there is no matching row.
left_joined = df_clients_a.merge(df_clients_b, how="left")

# Both inputs and the join result, separated by one blank line.
print(df_clients_a, df_clients_b, left_joined, sep="\n\n")

   clientId              name                        location  amount
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184

   clientId               name                        location  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    200
2        13  Lilah O. Morrison                 3859 Mauris Ave    300

   clientId              name                        location  amount  bonus
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929    NaN
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366  100.0
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184  200.0


## right JOIN

In [22]:
# Right join: every row of df_clients_b is kept; columns that come only from
# df_clients_a are NaN where there is no matching row.
right_joined = df_clients_a.merge(df_clients_b, how="right")

# Both inputs and the join result, separated by one blank line.
print(df_clients_a, df_clients_b, right_joined, sep="\n\n")

   clientId              name                        location  amount
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184

   clientId               name                        location  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    200
2        13  Lilah O. Morrison                 3859 Mauris Ave    300

   clientId               name                        location  amount  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue  7366.0    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.  6184.0    200
2        13  Lilah O. Morrison                 3859 Mauris Ave     NaN    300


## outer JOIN

In [23]:
# Outer join: rows from both frames are kept; whichever side has no match
# contributes NaN for its own columns.
outer_joined = df_clients_a.merge(df_clients_b, how="outer")

# Both inputs and the join result, separated by one blank line.
print(df_clients_a, df_clients_b, outer_joined, sep="\n\n")

   clientId              name                        location  amount
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184

   clientId               name                        location  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    200
2        13  Lilah O. Morrison                 3859 Mauris Ave    300

   clientId               name                        location  amount  bonus
0        10   Omar Y. Fletcher                6833 Mollis. Rd.  4929.0    NaN
1        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue  7366.0  100.0
2        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.  6184.0  200.0
3        13  Lilah O. Morrison                 3859 Mauris Ave     NaN  300.0


Podemos agregar el parámetro `indicator=True`, que añade la columna `_merge` para señalar de qué tabla (o dataframe) proviene cada registro (`left_only`, `right_only` o `both`).

In [24]:
# indicator=True appends a `_merge` column to the outer join that records
# whether each row came from the left frame only, the right frame only, or both.
outer_with_source = df_clients_a.merge(
    df_clients_b,
    how="outer",
    indicator=True,
)

# Both inputs and the annotated join result, separated by one blank line.
print(df_clients_a, df_clients_b, outer_with_source, sep="\n\n")

   clientId              name                        location  amount
0        10  Omar Y. Fletcher                6833 Mollis. Rd.    4929
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    7366
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    6184

   clientId               name                        location  bonus
0        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue    100
1        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.    200
2        13  Lilah O. Morrison                 3859 Mauris Ave    300

   clientId               name                        location  amount  bonus  \
0        10   Omar Y. Fletcher                6833 Mollis. Rd.  4929.0    NaN   
1        11   Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue  7366.0  100.0   
2        12   Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.  6184.0  200.0   
3        13  Lilah O. Morrison                 3859 Mauris Ave     NaN  300.0   

       _merge  
0   left_only  
1        both  
2        both  
3  right_only  


## Sufijos 

Los sufijos se usan para distinguir las columnas que no son clave y que tienen el mismo nombre en ambas tablas.

In [25]:
%%writefile dataset/data_1.csv
clientId,info
10,Omar Y. Fletcher
11,Buffy W. Vincent
12,Mira N. Franklin

Writing dataset/data_1.csv


In [26]:
%%writefile dataset/data_2.csv
clientId,info
10,6833 Mollis. Rd.
11,"P.O. Box 345, 8390 Ante Avenue"
12,"P.O. Box 445, 323 Cursus Rd."

Writing dataset/data_2.csv


In [None]:
df_1 = pd.read_csv("dataset/data_1.csv")
df_2 = pd.read_csv("dataset/data_2.csv")

# Both frames have a non-key column called "info". The custom suffixes are
# appended to tell the left and right copies apart (pandas would use "_x"
# and "_y" if none were given).
custom_suffixes = ["_LD", "_RD"]
merged_suffixed = df_1.merge(df_2, on="clientId", suffixes=custom_suffixes)

# Both inputs and the merged frame, separated by one blank line.
print(df_1, df_2, merged_suffixed, sep="\n\n")

   clientId              info
0        10  Omar Y. Fletcher
1        11  Buffy W. Vincent
2        12  Mira N. Franklin

   clientId                            info
0        10                6833 Mollis. Rd.
1        11  P.O. Box 345, 8390 Ante Avenue
2        12    P.O. Box 445, 323 Cursus Rd.

   clientId           info_LD                         info_RD
0        10  Omar Y. Fletcher                6833 Mollis. Rd.
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.


Es mejor y más coherente renombrar las columnas

In [None]:
df_1 = pd.read_csv("dataset/data_1.csv")
df_2 = pd.read_csv("dataset/data_2.csv")

# No suffixes are passed, so the clashing "info" columns come out as
# "info_x" (left) and "info_y" (right); map them to descriptive names.
descriptive_names = {"info_x": "name", "info_y": "location"}
merged_renamed = pd.merge(left=df_1, right=df_2, on="clientId").rename(
    columns=descriptive_names
)

# Both inputs and the renamed merge result, separated by one blank line.
print(df_1, df_2, merged_renamed, sep="\n\n")

   clientId              info
0        10  Omar Y. Fletcher
1        11  Buffy W. Vincent
2        12  Mira N. Franklin

   clientId                            info
0        10                6833 Mollis. Rd.
1        11  P.O. Box 345, 8390 Ante Avenue
2        12    P.O. Box 445, 323 Cursus Rd.

   clientId              name                        location
0        10  Omar Y. Fletcher                6833 Mollis. Rd.
1        11  Buffy W. Vincent  P.O. Box 345, 8390 Ante Avenue
2        12  Mira N. Franklin    P.O. Box 445, 323 Cursus Rd.
