In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
numRows = 100
numColumns = 2
data = np.zeros(shape=(numRows, numColumns))
# Year
data[:, 0] = np.random.randint(low=1970, high=2010, size=numRows)

# Price
data[:, 1] = np.round(np.random.uniform(low=8000, high=49999, size=numRows), 2)

df = pd.DataFrame(data, columns=["Year", "Price"])

# Add Fuel
df["Fuel"] = np.random.choice(["Gas", "Diesel"], numRows, p=[0.75, 0.25])

originalDF = df.copy(deep=True)
originalDF.head(10)

Unnamed: 0,Year,Price,Fuel
0,1987.0,9622.0,Gas
1,1974.0,16168.81,Gas
2,2002.0,12557.43,Diesel
3,1990.0,40462.89,Gas
4,1998.0,28830.71,Gas
5,2002.0,24728.16,Gas
6,2005.0,32214.86,Diesel
7,1977.0,21154.94,Gas
8,1972.0,47754.74,Gas
9,2008.0,21371.78,Diesel


# Data Wrangling

## Data Normalization

### Simple Feature Scaling

$$x_{new} = \frac{x_{old}}{x_{max}}$$

In [3]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = df["Price"] / df["Price"].max()

print("Prices after Normalization\n", df["Price"].head(10))

# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0     9622.00
1    16168.81
2    12557.43
3    40462.89
4    28830.71
5    24728.16
6    32214.86
7    21154.94
8    47754.74
9    21371.78
Name: Price, dtype: float64
Prices after Normalization
 0    0.194975
1    0.327636
2    0.254457
3    0.819917
4    0.584209
5    0.501077
6    0.652784
7    0.428672
8    0.967675
9    0.433066
Name: Price, dtype: float64


### Min-Max Scaling

$$x_{new} = \frac{x_{old} - x_{min}}{x_{max} - x_{min}}$$

In [4]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = (df["Price"] - df["Price"].min()) / (df["Price"].max() - df["Price"].min())
print("Prices after Normalization\n", df["Price"].head(10))
# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0     9622.00
1    16168.81
2    12557.43
3    40462.89
4    28830.71
5    24728.16
6    32214.86
7    21154.94
8    47754.74
9    21371.78
Name: Price, dtype: float64
Prices after Normalization
 0    0.039187
1    0.197520
2    0.110180
3    0.785067
4    0.503746
5    0.404526
6    0.585591
7    0.318109
8    0.961419
9    0.323353
Name: Price, dtype: float64


### Z-Score Scaling

$$x_{new} = \frac{x_{old} - \mu}{\sigma}$$

In [5]:
print("Prices before Normalization\n", df["Price"].head(10))
df["Price"] = (df["Price"] - df["Price"].mean()) / df["Price"].std()
print("Prices after Normalization\n", df["Price"].head(10))
# Reset the Dataframe for next section
df = originalDF.copy(deep=True)

Prices before Normalization
 0     9622.00
1    16168.81
2    12557.43
3    40462.89
4    28830.71
5    24728.16
6    32214.86
7    21154.94
8    47754.74
9    21371.78
Name: Price, dtype: float64
Prices after Normalization
 0   -1.570672
1   -1.054518
2   -1.339241
3    0.860841
4   -0.056246
5   -0.379694
6    0.210562
7   -0.661408
8    1.435735
9   -0.644313
Name: Price, dtype: float64


## Binning Data

#### Create Four Bins of equal length

In [6]:
bins = np.linspace(df["Price"].min(), df["Price"].max(), 4)
binNames = ["Low", "Medium", "High"]
df["Price-Binned"] = pd.cut(df["Price"], bins, binNames, include_lowest=True)
df["Price-Binned"]

0      (8001.669, 21784.443]
1      (8001.669, 21784.443]
2      (8001.669, 21784.443]
3      (35567.217, 49349.99]
4     (21784.443, 35567.217]
               ...          
95    (21784.443, 35567.217]
96     (8001.669, 21784.443]
97     (35567.217, 49349.99]
98    (21784.443, 35567.217]
99     (8001.669, 21784.443]
Name: Price-Binned, Length: 100, dtype: category
Categories (3, interval[float64]): [(8001.669, 21784.443] < (21784.443, 35567.217] < (35567.217, 49349.99]]

## Categorical Encoding

In [7]:
# Reset dataframe
df = originalDF.copy(deep=True)

### One-Hot Encoding

In [8]:
pd.get_dummies(df["Fuel"])
df = pd.concat([df, pd.get_dummies(df["Fuel"]).reindex(df.index)], axis=1)
df

Unnamed: 0,Year,Price,Fuel,Diesel,Gas
0,1987.0,9622.00,Gas,0,1
1,1974.0,16168.81,Gas,0,1
2,2002.0,12557.43,Diesel,1,0
3,1990.0,40462.89,Gas,0,1
4,1998.0,28830.71,Gas,0,1
...,...,...,...,...,...
95,1995.0,35538.55,Gas,0,1
96,1989.0,10213.86,Gas,0,1
97,1979.0,45312.04,Diesel,1,0
98,2005.0,27788.44,Gas,0,1


# Exploratory Data Analysis

## Box Plots

<img src="boxplot.jpg" width="300" height="300">

### Importing Data

In [9]:
path='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/automobileEDA.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.84863,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.84863,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


# Regression and Pipelines

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [12]:
Input = [("scale", StandardScaler()), ("polynomial", PolynomialFeatures(degree=2)), ("mode", LinearRegression())]
pipeline = Pipeline(Input)
pipeline.fit(df[["horsepower", "curb-weight", "engine-size", "highway-mpg"]], y)
yhat = pipeline.predict(X[["horsepower", "curb-weight", "engine-size", "highway-mpg"]])

NameError: name 'y' is not defined