In [2]:
import pandas as pd  
import matplotlib as plt  
import numpy as np

## Getting our data ready to be used with machine learning  

Three main things we have to do:  

1. Split the data into features and labels( usually `X`and `y`) 
2. Filling or imputing or disregarding missing values  
3. Feature Encoding: Non Numerical values to Numerical values

In [8]:
car_sales = pd.read_csv("car-sales-extended.csv")  
car_sales.head()  

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [9]:
len(car_sales)

1000

In [10]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [11]:
from sklearn.model_selection import train_test_split   

X = car_sales.drop("Price", axis=1) 
y = car_sales["Price"]  

# Split into training and tesing  
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)  

In [7]:
# Build Machine Learning Model  

from sklearn.ensemble import RandomForestRegressor  

model = RandomForestRegressor() 
model.fit(X_train, y_train) 
model.score(X_test, y_test)

ValueError: could not convert string to float: 'Nissan'

In [12]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

categorical_features = ["Make", "Colour", "Doors"] 
one_hot = OneHotEncoder()  
transformer = ColumnTransformer([("one_hot", one_hot,categorical_features)], remainder="passthrough")  

transformed_X = transformer.fit_transform(X)  
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [13]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [14]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]]) 
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [15]:
# Let's refit the model  

np.random.seed(42) 
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y ,test_size= 0.2)   
model.fit(X_train, y_train) 

In [16]:
model.score(X_test, y_test)

0.3235867221569877

## What if there are missing values  

1. Fill them with some value  
2. Remove the samples with missing data altogether  

In [17]:
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")  
car_sales_missing.head() 

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"


In [18]:
car_sales_missing.isna().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [19]:
X = car_sales_missing.drop("Price", axis=1) 
y = car_sales_missing["Price"]  

X

Unnamed: 0,Make,Colour,Odometer,Doors
0,Toyota,White,150043.0,4.0
1,Honda,Red,87899.0,4.0
2,Toyota,Blue,,3.0
3,BMW,Black,11179.0,5.0
4,Nissan,White,213095.0,4.0
5,Toyota,Green,,4.0
6,Honda,,,4.0
7,Honda,Blue,,4.0
8,Toyota,White,60000.0,
9,,White,31600.0,4.0


In [20]:
categorical_features = ["Make", "Colour", "Doors"] 
one_hot = OneHotEncoder()  
transformer = ColumnTransformer([("one_hot", one_hot,categorical_features)], remainder="passthrough")  

transformed_X = transformer.fit_transform(X)  
transformed_X

<10x16 sparse matrix of type '<class 'numpy.float64'>'
	with 40 stored elements in Compressed Sparse Row format>

In [21]:
transformed_X

<10x16 sparse matrix of type '<class 'numpy.float64'>'
	with 40 stored elements in Compressed Sparse Row format>

### 1. Fill missing data with pandas   


In [22]:
car_sales_missing.dropna(inplace= True)

In [23]:
car_sales_missing.isna().sum()

Make        0
Colour      0
Odometer    0
Doors       0
Price       0
dtype: int64

In [24]:
len(car_sales_missing)

4

In [25]:
X = car_sales_missing.drop("Price", axis=1) 
y = car_sales_missing["Price"]  

X

Unnamed: 0,Make,Colour,Odometer,Doors
0,Toyota,White,150043.0,4.0
1,Honda,Red,87899.0,4.0
3,BMW,Black,11179.0,5.0
4,Nissan,White,213095.0,4.0


In [26]:
categorical_features = ["Make", "Colour", "Doors"] 
one_hot = OneHotEncoder()  
transformer = ColumnTransformer([("one_hot", one_hot,categorical_features)], remainder="passthrough")  

transformed_X = transformer.fit_transform(X)  
transformed_X

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00, 1.50043e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 8.78990e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 1.11790e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00, 2.13095e+05]])

### Picking up a machine learning algorithm/ estimator  

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [27]:
## Picking up a machine learning model for a regression problem 

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [28]:
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [29]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])

In [30]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [31]:
housing_df["target"] = housing["target"] 
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [32]:
housing_df = housing_df.drop("MedHouseVal", axis = 1)

KeyError: "['MedHouseVal'] not found in axis"

In [33]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [34]:
np.random.seed(42) 

X = housing_df.drop("target", axis = 1)   
y = housing_df["target"] 


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)  

from sklearn.linear_model import Ridge  

model = Ridge()  
model.fit(X_train, y_train) 

model.score(X_test, y_test)

0.5758549611440126

In [36]:
from sklearn.ensemble import RandomForestRegressor  

np.random.seed(42) 

X = housing_df.drop("target", axis = 1) 
y = housing_df["target"] 

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2)  

model = RandomForestRegressor()   
model.fit(X_train, y_train) 

model.score(X_test, y_test)

0.8059809073051385