# Continuation of the Pandas Data Types notebook

# Import Libraries

In [1]:
import re

import numpy as np
import pandas as pd

# Read Datasets

In [2]:
# Load the 2008 and 2018 datasets into two separate frames.
fuel08, fuel18 = (
    pd.read_csv(csv_path) for csv_path in ('data_08_v3.csv', 'data_18_v3.csv')
)

# Fix 1: fixing the `cyl` data type

* For the 2008 dataset: extract int from the string
* For the 2018 dataset: convert float to int

In [3]:
# Keep the first run of digits found in each `cyl` entry and store it as an int.
# expand=False makes extract return a Series for the single capture group.
fuel08['cyl'] = (
    fuel08['cyl']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

## Confirm it extracted the integers correctly

In [4]:
# Count the rows per cylinder value to check that only plain integers remain.
fuel08['cyl'].value_counts()

cyl
6     409
4     283
8     199
5      48
12     30
10     14
2       2
16      1
Name: count, dtype: int64

In [5]:
# Show the dtype of the column after the conversion.
fuel08['cyl'].dtype

dtype('int32')

In [6]:
# Cast the 2018 `cyl` column to int; every other column is left as it is.
fuel18 = fuel18.astype({'cyl': int})

In [7]:
# Count the rows per cylinder value in the 2018 data after the cast.
fuel18['cyl'].value_counts()

cyl
4     365
6     246
8     153
3      18
12      9
5       2
16      1
Name: count, dtype: int64

# Fix 2: fixing `air_pollution_score` data types

* 2008: convert string to float
* 2018: convert int to float

In [8]:
# List the distinct raw values to see which formats the column contains.
fuel08['air_pollution_score'].unique()

array(['7', '6', '9.5', '9', '6/4'], dtype=object)

Some of the pollution scores are written in the form a/b. According to the PDF documentation in [this link](http://www.fueleconomy.gov/feg/findacarhelp.shtml#airPollutionScore):

    "If a vehicle can operate on more than one type of fuel, an estimate is provided for each fuel type."

This means that those values hold two pollution scores, one per fuel type, for the hybrid cars.

## Getting all the hybrids in 2008 dataset

In [9]:
# Select the 2008 rows whose `fuel` value contains a "/".
hb08 = fuel08.query('fuel.str.contains("/")',engine='python')
hb08  # only one row matches

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,ethanol/gas,small car,6/4,13/18,19/25,15/21,7/6,no


## Getting all the hybrids in the 2018 dataset

In [10]:
# Select the 2018 rows whose `fuel` value contains a "/" and preview the first two.
hb18 = fuel18.query('fuel.str.contains("/")',engine='python')
hb18.head(2)

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
52,BMW 330e,2.0,4,SemiAuto-8,2WD,Gasoline/Electricity,small car,3,28/66,34/78,30/71,10,Yes
78,BMW 530e,2.0,4,SemiAuto-8,2WD,Gasoline/Electricity,small car,7,27/70,31/75,29/72,10,Elite


## Splitting the hybrid row into two new rows

One row will contain the values for the first fuel type and the second row will contain values for the second fuel type (before and after the "/"). 

### Creating two copies of the 2008 hybrid data frame

In [11]:
# Two independent copies of the 2008 "/" row: df1 will keep the values before
# each "/", df2 the values after it.
df1, df2 = hb08.copy(), hb08.copy()

df1

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,ethanol/gas,small car,6/4,13/18,19/25,15/21,7/6,no


We use the pandas `apply` function, which applies a function to every value of a column. The following code splits each column of the hybrid rows that contains a "/", placing the value before the "/" in one row and the value after the "/" in another row.

In [12]:
# Columns of the 2008 "/" row that hold two values written as "first/second".
split_columns = ['fuel', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'greenhouse_gas_score']


for column_name in split_columns:
    # df1 keeps the text before the first "/", df2 the text right after it.
    df1[column_name] = [value.split("/")[0] for value in df1[column_name]]
    df2[column_name] = [value.split("/")[1] for value in df2[column_name]]

This will contain the ethanol part of the car

In [13]:
# Display the copy that kept the values before each "/".
df1

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,ethanol,small car,6,13,19,15,7,no


This will contain the gas part of the car

In [14]:
# Display the copy that kept the values after each "/".
df2

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,gas,small car,4,18,25,21,6,no


### Concatenating the new rows

In [15]:
# Stack the two split copies; both rows keep the index label of the original row.
new_rows = pd.concat([df1,df2])

new_rows

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,ethanol,small car,6,13,19,15,7,no
582,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,gas,small car,4,18,25,21,6,no


### Drop the original row

In [16]:
# Remove the combined "a/b" row now that it has been split into two rows.
# Filtering on the "/" marker (the same condition that built hb08) instead of
# dropping hb08's index labels in place keeps this cell safe to re-run: once the
# frame has been re-indexed by the concat with ignore_index=True, the old labels
# would point at unrelated rows.
fuel08 = fuel08[~fuel08['fuel'].str.contains("/", na=False)]

### Add the new rows

In [17]:
# Append the split rows, then renumber the index from 0 so the label shared by
# the two new rows is discarded.
fuel08 = pd.concat([fuel08, new_rows]).reset_index(drop=True)

### Check that the "/" are now gone

In [18]:
# An empty result means no `fuel` value in the 2008 data contains a "/" any more.
fuel08.query('fuel.str.contains("/")',engine='python')

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [19]:
# Row and column counts after replacing the combined row with its two split rows.
fuel08.shape

(987, 13)

## Splitting hybrid rows for 2018 into separate rows

### Creating two copies of the 2018 hybrid data frame

In [20]:
# Two independent copies of the 2018 "/" rows: df1 will keep the values before
# each "/", df2 the values after it.
df1, df2 = hb18.copy(), hb18.copy()

### Splitting the columns based on "/"

In [21]:
# Columns of the 2018 "/" rows that hold two values written as "first/second".
split_columns = ['fuel', 'city_mpg', 'hwy_mpg', 'cmb_mpg']

for column_name in split_columns:
    # df1 keeps the text before the first "/", df2 the text right after it.
    df1[column_name] = [value.split("/")[0] for value in df1[column_name]]
    df2[column_name] = [value.split("/")[1] for value in df2[column_name]]

### Append the two rows together

In [22]:
# Stack the two split copies; each pair of rows keeps its original index label.
new_rows = pd.concat([df1,df2])

In [23]:
# Preview the first split rows.
new_rows.head()

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
52,BMW 330e,2.0,4,SemiAuto-8,2WD,Gasoline,small car,3,28,34,30,10,Yes
78,BMW 530e,2.0,4,SemiAuto-8,2WD,Gasoline,small car,7,27,31,29,10,Elite
79,BMW 530e,2.0,4,SemiAuto-8,4WD,Gasoline,small car,7,27,31,28,10,Elite
92,BMW 740e,2.0,4,SemiAuto-8,4WD,Gasoline,large car,3,25,29,27,9,Yes
189,CHEVROLET Impala,3.6,6,SemiAuto-6,2WD,Ethanol,large car,5,14,20,16,4,No


### Drop the original hybrid rows

In [24]:
# Remove the combined "a/b" rows now that they have been split.
# Filtering on the "/" marker (the same condition that built hb18) instead of
# dropping hb18's index labels in place keeps this cell safe to re-run: once the
# frame has been re-indexed by the concat with ignore_index=True, the old labels
# would point at unrelated rows.
fuel18 = fuel18[~fuel18['fuel'].str.contains("/", na=False)]

### Adding the new hybrid rows

In [25]:
# Append the split rows, then renumber the index from 0 so the labels shared by
# each pair of new rows are discarded.
fuel18 = pd.concat([fuel18, new_rows]).reset_index(drop=True)

### Confirm that they're gone

In [27]:
# An empty result means no `fuel` value in the 2018 data contains a "/" any more.
# engine='python' is passed explicitly, as in every other "/" query in this
# notebook, so the check does not depend on the default expression engine.
fuel18.query('fuel.str.contains("/")', engine='python')

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [28]:
# Row and column counts after replacing the combined rows with their split rows.
fuel18.shape

(832, 13)

## Now the changes to the `air_pollution_score` can be applied

In [30]:
# Cast the 2008 air pollution scores to float now that no "a/b" value remains.
fuel08 = fuel08.astype({'air_pollution_score': float})

In [32]:
# Cast the 2018 air pollution scores to float so both years share one dtype.
fuel18 = fuel18.astype({'air_pollution_score': float})

# Fix 3: fixing city_mpg, hwy_mpg, cmb_mpg datatypes

* In the 2008 dataset: convert string to float
* In the 2018 dataset: convert int to float

## Getting the columns that end with mpg

In [116]:
# Collect every column whose name ends in "mpg". The `re` import now lives in
# the notebook's import cell instead of this cell.
mpg_columns = [name for name in fuel08.columns if re.search(r"(mpg)$", name)]

mpg_columns

['city_mpg', 'hwy_mpg', 'cmb_mpg']

## Converting the mpg columns to int

In [117]:
# Cast every mpg column to int in both years with one astype call per frame.
mpg_int_casts = dict.fromkeys(mpg_columns, int)
fuel08 = fuel08.astype(mpg_int_casts)
fuel18 = fuel18.astype(mpg_int_casts)

## Fix 4: Fixing the `greenhouse_gas_score` data type

* In the 2008 dataset: convert from float to int

In [125]:
# Cast the 2008 greenhouse gas scores to 64-bit integers. The numpy import now
# lives in the notebook's import cell instead of this cell.
fuel08['greenhouse_gas_score'] = fuel08['greenhouse_gas_score'].astype(np.int64)

# Confirm all the datatypes are fixed

In [126]:
# Show the dtype of every 2008 column after the fixes.
fuel08.dtypes

model                    object
displ                   float64
cyl                       int32
trans                    object
drive                    object
fuel                     object
veh_class                object
air_pollution_score     float64
city_mpg                  int32
hwy_mpg                   int32
cmb_mpg                   int32
greenhouse_gas_score      int64
smartway                 object
dtype: object

In [127]:
# Show the dtype of every 2018 column after the fixes.
fuel18.dtypes

model                    object
displ                   float64
cyl                       int32
trans                    object
drive                    object
fuel                     object
veh_class                object
air_pollution_score     float64
city_mpg                  int32
hwy_mpg                   int32
cmb_mpg                   int32
greenhouse_gas_score      int64
smartway                 object
dtype: object

In [128]:
# Compare the dtypes column by column and stop here if any pair differs, so a
# mismatch is caught before the cleaned files are written.
dtype_match = fuel08.dtypes == fuel18.dtypes
assert dtype_match.all(), f"dtype mismatch in: {list(dtype_match[~dtype_match].index)}"
dtype_match

model                   True
displ                   True
cyl                     True
trans                   True
drive                   True
fuel                    True
veh_class               True
air_pollution_score     True
city_mpg                True
hwy_mpg                 True
cmb_mpg                 True
greenhouse_gas_score    True
smartway                True
dtype: bool

# Saving changes

In [129]:
# Write each cleaned frame to its own CSV file, leaving out the index.
for cleaned_frame, csv_name in ((fuel08, 'clean_08.csv'), (fuel18, 'clean_18.csv')):
    cleaned_frame.to_csv(csv_name, index=False)