## Import the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Import the dataset

In [2]:
# Load the company records; the path is relative to the notebook's working directory.
csv_path = './data/1000_Companies.csv'
dataset = pd.read_csv(filepath_or_buffer=csv_path)
# Preview the first five rows as a quick sanity check of the columns.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Features: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()
# Target: the last column only.
y = dataset.iloc[:, -1].to_numpy()

In [4]:
# Peek at the first five feature rows (all columns).
X[:5]

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

In [5]:
# Peek at the first five target values.
y[0:5]

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94])

## Preprocess data

SVM (Support Vector Machine) models are sensitive to the scale of the data, so both X and y are standardized before training. Without scaling, the model will not fit properly.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Feature preprocessing:
#   * columns 0-2 (numeric) are standardized;
#   * column 3 (categorical) is one-hot encoded, dropping the first category
#     so the dummy columns are not perfectly collinear.
X_preprocessor = ColumnTransformer(
    remainder='passthrough',        # Keep data that isn't processed by the transformer
    sparse_threshold=0,             # Always return a dense array: the one-hot encoder can emit a
                                    # sparse matrix, and wrapping a sparse result in np.array()
                                    # downstream would not yield a usable 2-D numeric array
    transformers=[
        ('numerical', StandardScaler(), [0, 1, 2]),
        ('categorical', OneHotEncoder(drop='first'), [3])
    ])

# The target gets its own scaler so predictions can later be mapped back to
# the original units with inverse_transform.
y_preprocessor = StandardScaler()

In [8]:
# Fit the feature preprocessor on the training split only, then reuse the
# fitted transformer (no refitting) on the test split.
X_train_encoded = np.array(
    X_preprocessor.fit_transform(X_train)
)
X_test_encoded = np.array(
    X_preprocessor.transform(X_test)
)

# Show the first five encoded training rows.
display(X_train_encoded[:5])

array([[ 1.11613906,  0.52659346,  1.09509759,  0.        ,  0.        ],
       [-1.0800658 , -0.59994177, -1.0511859 ,  0.        ,  0.        ],
       [-1.19702383, -0.65993497, -1.16548539,  0.        ,  0.        ],
       [-1.08861687, -0.60432802, -1.0595426 ,  1.        ,  0.        ],
       [ 0.69789023,  0.31205425,  0.68635586,  0.        ,  0.        ]])

In [9]:
# NOTE(review): this cell overwrites y_train / y_test and refits y_preprocessor.
# Running it a second time would refit the scaler on already-scaled targets,
# so a later inverse_transform would no longer return the original units.
# Run it once per kernel session.

# The scaler works on 2-D input, so each target vector becomes a single column.
# It is fitted on the training targets and only applied to the test targets.
y_train = y_preprocessor.fit_transform(y_train.reshape(-1, 1))
y_test = y_preprocessor.transform(y_test.reshape(-1, 1))

# Show the first five scaled training targets.
display(y_train[:5])

array([[ 0.99493775],
       [-1.0053885 ],
       [-1.1119151 ],
       [-1.01317691],
       [ 0.61399235]])

## Train the model

The model can be tuned by adjusting the `kernel` and `epsilon` hyperparameters of `SVR`.

In [10]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel and epsilon set to 0.01.
regressor = SVR(epsilon=0.01, kernel='linear')
# The scaled target is a column vector; flatten it to 1-D for fitting.
regressor.fit(X=X_train_encoded, y=y_train.ravel())

SVR(epsilon=0.01, kernel='linear')

## Evaluate model

In [11]:
# For a regressor, score() returns the coefficient of determination (R^2) of
# the predictions; that is the value reported below under the label "accuracy".
train_accuracy = regressor.score(X_train_encoded, y_train)
# Use fixed-precision formatting rather than `round(...) * 100`, which can
# print floating-point artefacts such as 57.49999999999999.
print(f'Train accuracy = {train_accuracy * 100:.1f}%')

test_accuracy = regressor.score(X_test_encoded, y_test)
print(f'Test accuracy = {test_accuracy * 100:.1f}%')

Train accuracy = 88.8%
Test accuracy = 99.9%


## Predict

In [12]:
# Hypothetical companies to score with the trained model.
# Column order matches the training features:
# R&D Spend, Administration, Marketing Spend, State.
data = [
    [200_000.0, 200_000.0, 200_000.0, 'New York'],
    [600_000.0, 0.0, 0.0, 'California'],
    [0.0, 600_000.0, 0.0, 'Florida'],
    [0.0, 0.0, 600_000.0, 'New York'],
]

In [13]:
# Apply the already-fitted feature preprocessor (no refitting) to the new rows.
data_scaled = X_preprocessor.transform(data)
# The model was trained on standardized targets, so its output is in scaled units.
y_pred_scaled = regressor.predict(data_scaled)
# Map the predictions back to the original target scale; the scaler expects a
# column vector, so a trailing axis is added first.
y_pred = y_preprocessor.inverse_transform(y_pred_scaled[:, np.newaxis])

display(y_pred)

array([[218607.01447353],
       [485148.88612489],
       [119753.97298291],
       [ 52604.59135514]])