In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [27]:
numRows = 100
numColumns = 2
data = np.zeros(shape=(numRows, numColumns))
# Year
data[:, 0] = np.random.randint(low=1970, high=2010, size=numRows)

# Price
data[:, 1] = np.round(np.random.uniform(low=8000, high=49999, size=numRows), 2)

df = pd.DataFrame(data, columns=["Year", "Price"])

# Add Fuel
df["Fuel"] = np.random.choice(["Gas", "Diesel"], numRows, p=[0.75, 0.25])

originalDF = df.copy(deep=True)
originalDF.head(10)

Unnamed: 0,Year,Price,Fuel
0,1995.0,33830.17,Gas
1,2003.0,38551.6,Gas
2,1971.0,14142.3,Gas
3,1988.0,16194.56,Gas
4,1985.0,44012.93,Diesel
5,2007.0,32176.68,Gas
6,1983.0,11715.9,Gas
7,2001.0,46560.89,Diesel
8,1973.0,8567.3,Gas
9,1985.0,32552.3,Diesel


# Data Wrangling

## Data Normalization

### Simple Feature Scaling

$$x_{new} = \frac{x_{old}}{x_{max}}$$

In [19]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = df["Price"] / df["Price"].max()

print("Prices after Normalization\n", df["Price"].head(10))

# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0    47332.33
1    12892.17
2    24812.52
3    18798.21
4    27198.54
5    17286.64
6    47277.57
7    35729.30
8    41044.16
9    12501.93
Name: Price, dtype: float64
Prices after Normalization
 0    0.967389
1    0.263493
2    0.507124
3    0.384202
4    0.555890
5    0.353308
6    0.966270
7    0.730244
8    0.838870
9    0.255517
Name: Price, dtype: float64


### Min-Max Scaling

$$x_{new} = \frac{x_{old} - x_{min}}{x_{max} - x_{min}}$$

In [20]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = (df["Price"] - df["Price"].min()) / (df["Price"].max() - df["Price"].min())
print("Prices after Normalization\n", df["Price"].head(10))
# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0    47332.33
1    12892.17
2    24812.52
3    18798.21
4    27198.54
5    17286.64
6    47277.57
7    35729.30
8    41044.16
9    12501.93
Name: Price, dtype: float64
Prices after Normalization
 0    0.961004
1    0.119282
2    0.410617
3    0.263626
4    0.468931
5    0.226683
6    0.959665
7    0.677424
8    0.807320
9    0.109744
Name: Price, dtype: float64


### Z-Score Scaling

$$x_{new} = \frac{x_{old} - \mu}{\sigma}$$

In [21]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = (df["Price"] - df["Price"].mean()) / df["Price"].std()
print("Prices after Normalization\n", df["Price"].head(10))
# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0    47332.33
1    12892.17
2    24812.52
3    18798.21
4    27198.54
5    17286.64
6    47277.57
7    35729.30
8    41044.16
9    12501.93
Name: Price, dtype: float64
Prices after Normalization
 0    1.480861
1   -1.220548
2   -0.285543
3   -0.757292
4   -0.098388
5   -0.875856
6    1.476566
7    0.570745
8    0.987631
9   -1.251158
Name: Price, dtype: float64


## Binning Data

#### Create Four Bins of equal length

In [28]:
bins = np.linspace(df["Price"].min(), df["Price"].max(), 4)
binNames = ["Low", "Medium", "High"]
df["Price-Binned"] = pd.cut(df["Price"], bins, binNames, include_lowest=True)
df["Price-Binned"]

0      (22043.72, 35925.4]
1      (35925.4, 49807.08]
2     (8162.039, 22043.72]
3     (8162.039, 22043.72]
4      (35925.4, 49807.08]
              ...         
95     (22043.72, 35925.4]
96     (22043.72, 35925.4]
97    (8162.039, 22043.72]
98     (35925.4, 49807.08]
99     (22043.72, 35925.4]
Name: Price-Binned, Length: 100, dtype: category
Categories (3, interval[float64]): [(8162.039, 22043.72] < (22043.72, 35925.4] < (35925.4, 49807.08]]

## Categorical Encoding

In [35]:
# Reset dataframe
df = originalDF.copy(deep=True)

### One-Hot Encoding

In [36]:
pd.get_dummies(df["Fuel"])
df = pd.concat([df, pd.get_dummies(df["Fuel"]).reindex(df.index)], axis=1)
df

Unnamed: 0,Year,Price,Fuel,Diesel,Gas
0,1995.0,33830.17,Gas,0,1
1,2003.0,38551.60,Gas,0,1
2,1971.0,14142.30,Gas,0,1
3,1988.0,16194.56,Gas,0,1
4,1985.0,44012.93,Diesel,1,0
...,...,...,...,...,...
95,1990.0,24665.51,Diesel,1,0
96,1977.0,26828.20,Gas,0,1
97,2000.0,11709.96,Gas,0,1
98,1998.0,36872.46,Gas,0,1


# Exploratory Data Analysis

## Box Plots

<img src="boxplot.jpg" width="300" height="300">

### Importing Data

In [3]:
path='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/automobileEDA.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.84863,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.84863,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1
