# 06 - Combining datasets

### Step 1. Import the necessary libraries

In [9]:
import pandas as pd
import numpy as np

### Step 2. Import the datasets you'll find in the folder `data` and assign each to a variable called cars1 and cars2

The following exercise uses data from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Auto+MPG)

In [10]:
cars1 = pd.read_csv('cars1.csv')
cars2 = pd.read_csv('cars2.csv')

In [11]:
cars1.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,,,,,
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,,,,,
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,,,,,
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,,,,,
4,17.0,8,302,140,3449,10.5,70,1,ford torino,,,,,


In [12]:
cars2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,33.0,4,91,53,1795,17.4,76,3,honda civic
1,20.0,6,225,100,3651,17.7,76,1,dodge aspen se
2,18.0,6,250,78,3574,21.0,76,1,ford granada ghia
3,18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj
4,17.5,6,258,95,3193,17.8,76,1,amc pacer d/l


### Step 3. Have a look at the columns of the dataset.

In [13]:
print("cars1 columns:", cars1.columns)
print("\n")
print("cars2 columns:", cars2.columns)

cars1 columns: Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model', 'origin', 'car', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')


cars2 columns: Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model', 'origin', 'car'],
      dtype='object')


### Step 4. Oops, it seems our first dataset has some unnamed blank columns, fix cars1

In [14]:
cars1 = cars1.dropna(axis = 1)

In [15]:
cars1

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
193,24.0,6,200,81,3012,17.6,76,1,ford maverick
194,22.5,6,232,90,3085,17.6,76,1,amc hornet
195,29.0,4,85,52,2035,22.2,76,1,chevrolet chevette
196,24.5,4,98,60,2164,22.1,76,1,chevrolet woody


### Step 5. What is the number of observations in each dataset?

In [16]:
print("number of observations in cars1:", len(cars1))
print("number of observations in cars2:", len(cars2))

number of observations in cars1: 198
number of observations in cars2: 200


### Step 6. Join cars1 and cars2 into a single DataFrame called cars

In [21]:
# No estoy seguro de que esto sea lo que pides, pero como dices "join" pues tire palante con esa función
cars = cars1.join(cars2, lsuffix = '_1', rsuffix = '_2')
cars

Unnamed: 0,mpg_1,cylinders_1,displacement_1,horsepower_1,weight_1,acceleration_1,model_1,origin_1,car_1,mpg_2,cylinders_2,displacement_2,horsepower_2,weight_2,acceleration_2,model_2,origin_2,car_2
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,33.0,4,91,53,1795,17.4,76,3,honda civic
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,20.0,6,225,100,3651,17.7,76,1,dodge aspen se
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,18.0,6,250,78,3574,21.0,76,1,ford granada ghia
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj
4,17.0,8,302,140,3449,10.5,70,1,ford torino,17.5,6,258,95,3193,17.8,76,1,amc pacer d/l
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,24.0,6,200,81,3012,17.6,76,1,ford maverick,36.0,4,135,84,2370,13.0,82,1,dodge charger 2.2
194,22.5,6,232,90,3085,17.6,76,1,amc hornet,27.0,4,151,90,2950,17.3,82,1,chevrolet camaro
195,29.0,4,85,52,2035,22.2,76,1,chevrolet chevette,27.0,4,140,86,2790,15.6,82,1,ford mustang gl
196,24.5,4,98,60,2164,22.1,76,1,chevrolet woody,44.0,4,97,52,2130,24.6,82,2,vw pickup


### Step 7. Oops, there is a column missing, called owners. Create a random number Series from 15,000 to 73,000.

In [31]:
array = np.random.randint(15000, 73000, size = len(cars))
s = pd.Series(array)
s.name = "Owner"
s

0      48811
1      64334
2      17180
3      29905
4      59468
       ...  
193    59416
194    37849
195    24381
196    70691
197    50672
Name: Owner, Length: 198, dtype: int64

### Step 8. Add the column owners to cars

In [33]:
cars = pd.concat([cars, s], axis = 1)

In [34]:
cars

Unnamed: 0,mpg_1,cylinders_1,displacement_1,horsepower_1,weight_1,acceleration_1,model_1,origin_1,car_1,mpg_2,cylinders_2,displacement_2,horsepower_2,weight_2,acceleration_2,model_2,origin_2,car_2,Owner
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,33.0,4,91,53,1795,17.4,76,3,honda civic,48811
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,20.0,6,225,100,3651,17.7,76,1,dodge aspen se,64334
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,18.0,6,250,78,3574,21.0,76,1,ford granada ghia,17180
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj,29905
4,17.0,8,302,140,3449,10.5,70,1,ford torino,17.5,6,258,95,3193,17.8,76,1,amc pacer d/l,59468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,24.0,6,200,81,3012,17.6,76,1,ford maverick,36.0,4,135,84,2370,13.0,82,1,dodge charger 2.2,59416
194,22.5,6,232,90,3085,17.6,76,1,amc hornet,27.0,4,151,90,2950,17.3,82,1,chevrolet camaro,37849
195,29.0,4,85,52,2035,22.2,76,1,chevrolet chevette,27.0,4,140,86,2790,15.6,82,1,ford mustang gl,24381
196,24.5,4,98,60,2164,22.1,76,1,chevrolet woody,44.0,4,97,52,2130,24.6,82,2,vw pickup,70691
