# MPG Cars

### Introduction:

The following exercise utilizes data from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Auto+MPG)

### Step 1. Import the necessary libraries

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types 
import requests

spark = SparkSession.builder\
                    .appName('mpg_cars')\
                    .getOrCreate()

### Step 2. Import the first dataset [cars1](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv) and [cars2](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv).  

   ### Step 3. Assign each to a variable called cars1 and cars2

In [50]:
import requests
import pandas as pd

def csv_generator(url, name):
    response = requests.get(url)
    
    filename = f'data_{name}.csv' 
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    
    return filename


data = csv_generator('https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv', 'cars1')
df_car1 = pd.read_csv(data)
cars1 = spark.read.csv(data, header=True,  sep=',')

data2 = csv_generator('https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv', 'cars2')
df_car2 = pd.read_csv(data2)
cars2 = spark.read.csv(data2, header=True,  sep=',')

In [51]:
cars1.show()

+----+---------+------------+----------+------+------------+-----+------+--------------------+----+----+----+----+----+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car| _c9|_c10|_c11|_c12|_c13|
+----+---------+------------+----------+------+------------+-----+------+--------------------+----+----+----+----+----+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|NULL|NULL|NULL|NULL|NULL|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|NULL|NULL|NULL|NULL|NULL|
|18.0|        8|         318|       150|  3436|        11.0|   70|     1|  plymouth satellite|NULL|NULL|NULL|NULL|NULL|
|16.0|        8|         304|       150|  3433|        12.0|   70|     1|       amc rebel sst|NULL|NULL|NULL|NULL|NULL|
|17.0|        8|         302|       140|  3449|        10.5|   70|     1|         ford torino|NULL|NULL|NULL|NULL|NULL|
|15.0|        8|         429|       198|

25/06/13 15:21:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: mpg, cylinders, displacement, horsepower, weight, acceleration, model, origin, car, , , , , 
 Schema: mpg, cylinders, displacement, horsepower, weight, acceleration, model, origin, car, _c9, _c10, _c11, _c12, _c13
Expected: _c9 but found: 
CSV file: file:///home/kevin-llanos/Proyectos/pyspark_exercises/05_Merge/Auto_MPG/data_cars1.csv


In [52]:
df_car1.head()



Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,,,,,
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,,,,,
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,,,,,
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,,,,,
4,17.0,8,302,140,3449,10.5,70,1,ford torino,,,,,


In [53]:
cars2.show()

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|33.0|        4|          91|        53|  1795|        17.4|   76|     3|         honda civic|
|20.0|        6|         225|       100|  3651|        17.7|   76|     1|      dodge aspen se|
|18.0|        6|         250|        78|  3574|        21.0|   76|     1|   ford granada ghia|
|18.5|        6|         250|       110|  3645|        16.2|   76|     1|  pontiac ventura sj|
|17.5|        6|         258|        95|  3193|        17.8|   76|     1|       amc pacer d/l|
|29.5|        4|          97|        71|  1825|        12.2|   76|     2|   volkswagen rabbit|
|32.0|        4|          85|        70|  1990|        17.0|   76|     3|        datsun b-210|
|28.0|        4|          97|        75|  2155|   

In [54]:
df_car2.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,33.0,4,91,53,1795,17.4,76,3,honda civic
1,20.0,6,225,100,3651,17.7,76,1,dodge aspen se
2,18.0,6,250,78,3574,21.0,76,1,ford granada ghia
3,18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj
4,17.5,6,258,95,3193,17.8,76,1,amc pacer d/l


### Step 4. Oops, it seems our first dataset has some unnamed blank columns, fix cars1

## 🐍 **Solución Pyspark** 

In [55]:
cars1.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in cars1.columns]).show()

+---+---------+------------+----------+------+------------+-----+------+---+---+----+----+----+----+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|car|_c9|_c10|_c11|_c12|_c13|
+---+---------+------------+----------+------+------------+-----+------+---+---+----+----+----+----+
|  0|        0|           0|         0|     0|           0|    0|     0|  0|198| 198| 198| 198| 198|
+---+---------+------------+----------+------+------------+-----+------+---+---+----+----+----+----+



25/06/13 15:21:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: mpg, cylinders, displacement, horsepower, weight, acceleration, model, origin, car, , , , , 
 Schema: mpg, cylinders, displacement, horsepower, weight, acceleration, model, origin, car, _c9, _c10, _c11, _c12, _c13
Expected: _c9 but found: 
CSV file: file:///home/kevin-llanos/Proyectos/pyspark_exercises/05_Merge/Auto_MPG/data_cars1.csv


In [56]:
cars1 = cars1.drop(F.col('_c9'), F.col('_c10'), F.col('_c11'), F.col('_c12'), F.col('_c12'), F.col('_c13'))
cars1.show()

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|
|18.0|        8|         318|       150|  3436|        11.0|   70|     1|  plymouth satellite|
|16.0|        8|         304|       150|  3433|        12.0|   70|     1|       amc rebel sst|
|17.0|        8|         302|       140|  3449|        10.5|   70|     1|         ford torino|
|15.0|        8|         429|       198|  4341|        10.0|   70|     1|    ford galaxie 500|
|14.0|        8|         454|       220|  4354|         9.0|   70|     1|    chevrolet impala|
|14.0|        8|         440|       215|  4312|   

In [57]:
df_car1.drop(columns=[	'Unnamed: 9',	'Unnamed: 10',	'Unnamed: 11',	'Unnamed: 12',	'Unnamed: 13'], inplace=True)

In [58]:
df_car1.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302,140,3449,10.5,70,1,ford torino


### Step 5. What is the number of observations in each dataset?

In [59]:
cars1.count()

198

In [60]:
cars2.count()

200

In [61]:
df_car1.shape[0]

198

In [62]:
df_car2.shape[0]

200

### Step 6. Join cars1 and cars2 into a single DataFrame called cars

In [63]:
cars1.show(3)

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|
|18.0|        8|         318|       150|  3436|        11.0|   70|     1|  plymouth satellite|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
only showing top 3 rows



In [64]:
cars2.show(3)

+----+---------+------------+----------+------+------------+-----+------+-----------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|              car|
+----+---------+------------+----------+------+------------+-----+------+-----------------+
|33.0|        4|          91|        53|  1795|        17.4|   76|     3|      honda civic|
|20.0|        6|         225|       100|  3651|        17.7|   76|     1|   dodge aspen se|
|18.0|        6|         250|        78|  3574|        21.0|   76|     1|ford granada ghia|
+----+---------+------------+----------+------+------------+-----+------+-----------------+
only showing top 3 rows



In [65]:
cars = cars1.union(cars2)

cars.show(10)

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|
|18.0|        8|         318|       150|  3436|        11.0|   70|     1|  plymouth satellite|
|16.0|        8|         304|       150|  3433|        12.0|   70|     1|       amc rebel sst|
|17.0|        8|         302|       140|  3449|        10.5|   70|     1|         ford torino|
|15.0|        8|         429|       198|  4341|        10.0|   70|     1|    ford galaxie 500|
|14.0|        8|         454|       220|  4354|         9.0|   70|     1|    chevrolet impala|
|14.0|        8|         440|       215|  4312|   

In [66]:
cars_df = pd.concat([df_car1, df_car2], axis=0)

In [68]:
cars_df.shape[0]

398

### Step 7. Oops, there is a column missing, called owners. Create a random number Series from 15,000 to 73,000. ### Step 8. Add the column owners to cars


In [76]:
import numpy as np


min_owners = 15000
max_owners = 73000
range_owners = max_owners - min_owners

cars = cars.withColumn('owner', (F.rand() * range_owners + min_owners).cast(types.IntegerType()))

cars.show()

+----+---------+------------+----------+------+------------+-----+------+--------------------+-----+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|owner|
+----+---------+------------+----------+------+------------+-----+------+--------------------+-----+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|47345|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|27939|
|18.0|        8|         318|       150|  3436|        11.0|   70|     1|  plymouth satellite|60118|
|16.0|        8|         304|       150|  3433|        12.0|   70|     1|       amc rebel sst|32434|
|17.0|        8|         302|       140|  3449|        10.5|   70|     1|         ford torino|18147|
|15.0|        8|         429|       198|  4341|        10.0|   70|     1|    ford galaxie 500|45884|
|14.0|        8|         454|       220|  4354|         9.0|   70|     1|    chevrolet impa

In [77]:

cars_df['owners'] = np.random.randint(15000, 73001, cars_df.shape[0])


cars_df


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,owners
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,15755
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,22366
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,50673
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,36338
4,17.0,8,302,140,3449,10.5,70,1,ford torino,29251
...,...,...,...,...,...,...,...,...,...,...
195,27.0,4,140,86,2790,15.6,82,1,ford mustang gl,64365
196,44.0,4,97,52,2130,24.6,82,2,vw pickup,54283
197,32.0,4,135,84,2295,11.6,82,1,dodge rampage,69044
198,28.0,4,120,79,2625,18.6,82,1,ford ranger,21483


In [78]:
spark.stop()