In [149]:
# Imports section
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from numpy import asarray
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import ShuffleSplit

# Part 1. Loading the dataset

In [150]:
# Using pandas load the dataset (load remotely, not locally)
df = pd.read_csv('https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv')


In [151]:
# Output the first 15 rows of the data
df.head(15)

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [152]:
# Display a summary of the table information (number of datapoints, etc.)
len(df)

1000

In [153]:
df.describe()

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


# Part 2. Splitting the dataset

In [154]:
# Take the pandas dataset and split it into our features (X) and label (y)

features = ["Mols KCL", "Size nm^3"]
label = ["Temperature °C"]

In [155]:
x = df[features].values
y = df[label].values

In [156]:
# x , y

In [157]:
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .1, random_state = 0)

In [158]:
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((900, 2), (100, 2), (900, 1), (100, 1))

# Part 3. Perform a Linear Regression

In [159]:
# Use sklearn to train a model on the training set
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [160]:
# Create a sample datapoint and predict the output of that sample with the trained model

y_pred = model.predict(x_test)
y_pred

array([[ 389.61986015],
       [ 575.21612064],
       [1114.44411221],
       [ 249.88117485],
       [ 272.60958484],
       [ 300.17801125],
       [ 203.48481326],
       [ 853.56557591],
       [ 232.20107385],
       [ 281.72863101],
       [ 476.74695771],
       [1065.18995783],
       [ 782.22069437],
       [ 383.68987485],
       [ 229.37542796],
       [1036.51433101],
       [ 258.79486236],
       [ 597.33782993],
       [ 472.34656507],
       [ 551.17261323],
       [1234.08067822],
       [ 410.31480796],
       [ 554.46613542],
       [1211.09824566],
       [ 392.92403359],
       [ 999.17844871],
       [ 215.42657808],
       [ 529.3687714 ],
       [ 529.88816997],
       [ 386.26900228],
       [ 409.44180246],
       [ 145.90250563],
       [1275.30404849],
       [ 603.72326741],
       [1206.32003184],
       [ 434.93207673],
       [ 710.60275556],
       [ 918.95506319],
       [1051.92538328],
       [ 464.51967653],
       [ 204.19578745],
       [ 242.043

In [161]:
model.predict([[647, 6.244743e+05]])

array([[459.43732195]])

In [162]:
# predict with r2 score
r2_score(y_test, y_pred)


0.7201592080149567

In [163]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means

# I got a r2 score of 72% 
This means that my training model is 72% accurate

In [164]:
# Extract the coefficents and intercept from the model and write an equation for your h(x) using LaTeX

coefficient_values = model.coef_

df_coefficients = pd.DataFrame(coefficient_values, columns=features).T

df_coefficients.columns = ['coefficient']

df_coefficients

Unnamed: 0,coefficient
Mols KCL,-0.766313
Size nm^3,0.000799


In [165]:
model.coef_ , model.intercept_

(array([[-0.76631279,  0.00079873]]), array([456.45436197]))

In [166]:
# equation for your h(x) using LaTeX

# Part 4. Use Cross Validation

In [167]:
# Use the cross_val_score function to repeat your experiment across many shuffles of the data

# model = svm.SVC(kernel='linear', C=1, random_state=0)

scores = cross_val_score(model, x_train, y_train, cv=2)
scores

# Report on their finding and their significance

array([0.68052641, 0.70016562])

In [168]:
n_samples = x.shape[0]
cv = ShuffleSplit(n_splits = 5, test_size = 0.3, random_state = 0)
cross_val_score(model, x, y, cv = cv)

array([0.71021351, 0.68958158, 0.70019154, 0.69820215, 0.67451604])

# Part 5. Using Polynomial Regression

In [171]:
# Using the PolynomialFeatures library perform another regression on an augmented dataset of degree 2

trans = PolynomialFeatures(degree = 2)
df = trans.fit_transform(df)
print(df)

# Report on the metrics and output the resultant equation as you did in Part 3.

[[1.00000000e+00 1.00000000e+00 4.69000000e+02 ... 1.63244155e+17
  1.57560700e+20 1.52075117e+23]
 [1.00000000e+00 1.00000000e+00 4.03000000e+02 ... 1.60885184e+17
  1.33984678e+20 1.11582021e+23]
 [1.00000000e+00 1.00000000e+00 3.02000000e+02 ... 3.65048694e+17
  2.32015482e+20 1.47463024e+23]
 ...
 [1.00000000e+00 1.00000000e+00 7.91000000e+02 ... 5.48661034e+15
  8.95770940e+18 1.46247961e+22]
 [1.00000000e+00 1.00000000e+00 7.69000000e+02 ... 2.30658416e+17
  3.62246081e+20 5.68902820e+23]
 [1.00000000e+00 1.00000000e+00 9.19000000e+02 ... 1.46791546e+17
  2.75280023e+20 5.16236072e+23]]
