In [7]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.read_csv('1000_Companies.csv')
X = df.iloc[:, :-1] #grabs all rows,except last column
y = df.iloc[:, 4] #grabs last col as label or what we're testing 
print(X.head())
print(y.head())

   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida
0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64


In [58]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,1000.0,1000.0,1000.0,1000.0
mean,81668.9272,122963.897612,226205.058419,119546.164656
std,46537.567891,12613.927535,91578.393542,42888.633848
min,0.0,51283.14,0.0,14681.4
25%,43084.5,116640.68485,150969.5846,85943.198543
50%,79936.0,122421.61215,224517.88735,117641.4663
75%,124565.5,129139.118,308189.808525,155577.107425
max,165349.2,321652.14,471784.1,476485.43


## Experiment 1

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
cols2trans = [3]
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), cols2trans)], remainder='passthrough')
X_counts = columnTransformer.fit_transform(X)
print(X)
print(X.shape)
print(X_counts.shape) #notice extra cols

     R&D Spend  Administration  Marketing Spend       State
0    165349.20     136897.8000     471784.10000    New York
1    162597.70     151377.5900     443898.53000  California
2    153441.51     101145.5500     407934.54000     Florida
3    144372.41     118671.8500     383199.62000    New York
4    142107.34      91391.7700     366168.42000     Florida
5    131876.90      99814.7100     362861.36000    New York
6    134615.46     147198.8700     127716.82000  California
7    130298.13     145530.0600     323876.68000     Florida
8    120542.52     148718.9500     311613.29000    New York
9    123334.88     108679.1700     304981.62000  California
10   101913.08     110594.1100     229160.95000     Florida
11   100671.96      91790.6100     249744.55000  California
12    93863.75     127320.3800     249839.44000     Florida
13    91992.39     135495.0700     252664.93000  California
14   119943.24     156547.4200     256512.92000     Florida
15   114523.61     122616.8400     26177

In [42]:
print(X_counts[:2,:])

[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05
  4.7178410e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05
  4.4389853e+05]]


In [43]:
#Score is good
from sklearn.linear_model import LinearRegression
Xn = X.iloc[:,[0,1,2]] 
reg = LinearRegression().fit(Xn, y)
print(reg.score(Xn, y))
print(reg.coef_)
print(reg.intercept_)

0.9498636456185335
[0.55393228 1.02663516 0.08057015]
-70157.32512037757


## Experiment 2

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

numeric_features = ['R&D Spend', 'Administration','Marketing Spend']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(2))])

categorical_features = ['State']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(2))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LinearRegression(fit_intercept=True))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model TRAIN score: %.3f" % clf.score(X_train, y_train))
print("model TEST score: %.3f" % clf.score(X_test, y_test))

model TRAIN score: 0.969
model TEST score: 0.985


In [50]:
#Visualization of the pipeline
from sklearn import set_config
set_config(display='diagram')
clf

In [51]:
#see list of param:
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__imputer', 'preprocessor__num__scaler', 'preprocessor__num__poly', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__imputer__strategy', 'preprocessor__num__imputer__verbose', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'preprocessor__num__poly__degree', 'preprocessor__num__poly__include_bias', 'preprocessor__num__poly__interaction_only', 'preprocessor__num__poly__order', 'preprocessor__cat__

In [52]:
#more fun: tune up a model if you want

param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
#     'classifier__C': [0.1, 1.0, 10, 100],
    'classifier__fit_intercept': [True, False],
    'preprocessor__num__poly__degree': [1,2,3,4],
}

grid_search = GridSearchCV(clf, param_grid, cv=2)
grid_search.fit(X_train, y_train)

print(("best TEST score from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

best TEST score from grid search: 0.959


## Saving the model

In [54]:
#First, save test data:
X_test.to_csv('X_test.csv',index=False)
y_test.to_csv('y_test.csv',index=False)

In [55]:
#then save model:
from joblib import dump, load
dump(clf, 'ins_linreg.joblib') 

['ins_linreg.joblib']

In [56]:
#Test: load and predict
clf2 = load('ins_linreg.joblib') 
print(clf2.score(X_test,y_test))

0.9847022409819033
