In [50]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Import the dataset

data = pd.read_csv("/workspaces/individual_project_4/src/startup.csv")
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

In [51]:
X.shape  # this is the features of the dataset.

(50, 4)

In [52]:
y.shape  # this is the outcome of the dataset

(50,)

In [53]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [54]:
data.State.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [55]:
# One Hot Encoding categorical data ie the 3
# states are being made into numerical categorical data.

# Assuming X is your dataset
# Initialize LabelEncoder
labelencoder = LabelEncoder()

# Apply label encoding to the categorical column at index 3
X[:, 3] = labelencoder.fit_transform(X[:, 3])

# Initialize ColumnTransformer with OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[
        (
            "encoder",
            OneHotEncoder(),
            [3],
        )  # Apply OneHotEncoder to the column at index 3
    ],
    remainder="passthrough",  # Keep the remaining columns as they are
)

# Apply the transformations to the dataset
X = column_transformer.fit_transform(X)

In [56]:
X.shape

(50, 6)

In [57]:
# Splitting the dataset into the Training set and Test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [58]:
# Fitting Multiple Linear Regression to the Training set
regression = LinearRegression()
regression.fit(X_train, y_train)

In [59]:
X_test

array([[0.0, 1.0, 0.0, 66051.52, 182645.56, 118148.2],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [0.0, 1.0, 0.0, 27892.92, 84710.77, 164470.71],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 72107.6, 127864.55, 353183.81],
       [0.0, 0.0, 1.0, 20229.59, 65947.93, 185265.1],
       [0.0, 0.0, 1.0, 61136.38, 152701.92, 88218.23],
       [0.0, 1.0, 0.0, 73994.56, 122782.75, 303319.26],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [60]:
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [61]:
# Predicting the Test set results
y_pred = regression.predict(X_test)
y_pred

array([103015.20159795, 132582.27760816, 132447.73845175,  71976.09851258,
       178537.48221057, 116161.24230167,  67851.69209676,  98791.73374687,
       113969.43533014, 167921.06569552])

In [62]:
# creating a data frame of the y test

df = pd.DataFrame(data=y_test, columns=["y_test"])
df["y_pred"] = y_pred
df

Unnamed: 0,y_test,y_pred
0,103282.38,103015.201598
1,144259.4,132582.277608
2,146121.95,132447.738452
3,77798.83,71976.098513
4,191050.39,178537.482211
5,105008.31,116161.242302
6,81229.06,67851.692097
7,97483.56,98791.733747
8,110352.25,113969.43533
9,166187.94,167921.065696


In [63]:
# Predicting the sigle observation results.
# Here 1,0,0 represents that the state is New York

a = [1, 0, 0, 160349, 134321, 401400]
b = np.array(a)
b = b.reshape(1, -1)
y_pred_single_obs = regression.predict(b)
round(float(y_pred_single_obs), 2)

185691.21

In [64]:
# Model Evaluation

r2_score(y_test, y_pred)

0.9347068473282546


"""This calculates the coefficient of determination or the r^2 of the model. This can give a score between -1 and 1. 
Scores closer to -1 giving a negative impact on the model and scores closer to 1 give a positive impact to the model. 
In our case, we have 0.93 which is close to 1 which indicates that we have a pretty good model."""

In [65]:
# Saving the model


joblib.dump(regression, "multiple_regression_model.pkl")

['multiple_regression_model.pkl']


"""scikit-learn has their own model persistence method we will use: joblib. 
This is more efficient to use with scikit-learn models due to it being better at handling larger numpy arrays that may 
be stored in the models."""

In [66]:
NewYork = 1
California = 0
Florida = 0
RnD_Spend = 160349
Administration_Spend = 134321
Marketing_Spend = 401400
pred_args = [
    NewYork,
    California,
    Florida,
    RnD_Spend,
    Administration_Spend,
    Marketing_Spend,
]
pred_args_arr = np.array(pred_args)
pred_args_arr = pred_args_arr.reshape(1, -1)
mul_reg = open("multiple_regression_model.pkl", "rb")
ml_model = joblib.load(mul_reg)
model_prediction = ml_model.predict(pred_args_arr)

round(float(model_prediction), 2)

185691.21