In [None]:
# Import necessary libraries
import pickle

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the measurement workbook into a DataFrame
data = pd.read_excel(io="/content/TimetoVentingTimex.xlsx")

# Preview the last five rows
data.tail(n=5)

Unnamed: 0,TestVolume (litres),Final Result Code (1=Pass),Fill time (s) / 1000,Stabilisation Time(s)/1000,Measuring Time (s)/1000,Venting time(s)/1000
11071,0.1,1,1000,25000,2000,2000
11072,0.1,1,1000,25000,2000,2000
11073,0.1,1,1000,25000,2000,2000
11074,0.1,1,1000,25000,2000,2000
11075,0.1,1,1000,25000,2000,2000


In [None]:
# Check the shape of the dataset as a (rows, columns) tuple
data.shape

(11076, 6)

In [None]:

# Check the column names and the data type of each column
data.dtypes

TestVolume (litres)           float64
Final Result Code (1=Pass)      int64
Fill time (s) / 1000            int64
Stabilisation Time(s)/1000      int64
Measuring Time (s)/1000         int64
Venting time(s)/1000            int64
dtype: object

In [None]:
# Count the number of observations for each distinct venting time (the target classes)
data['Venting time(s)/1000'].value_counts()

2000     6191
12000    1530
20000    1305
10000    1292
40000     758
Name: Venting time(s)/1000, dtype: int64

In [None]:
# Split the data into features (X) and target (y)
# X keeps every column except the venting time; y is a single-column DataFrame
X = data.drop(columns=['Venting time(s)/1000'])
y = data.loc[:, ['Venting time(s)/1000']]

In [None]:
# Split the data for training and testing at a ratio of 80/20.
# A fixed random_state makes the shuffle deterministic, so the split and every
# metric computed from it are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training labels (plain-text rendering of the DataFrame)
print(y_train)

       Venting time(s)/1000
2434                  10000
3869                  12000
10824                  2000
1112                   2000
1389                  10000
...                     ...
10817                  2000
5343                  20000
5255                  20000
10628                  2000
10870                  2000

[8860 rows x 1 columns]


In [None]:
# Inspect the training features (plain-text rendering of the DataFrame)
print(X_train)

       TestVolume (litres)  ...  Measuring Time (s)/1000
2434                   0.2  ...                    20000
3869                   0.4  ...                    20000
10824                  0.1  ...                     2000
1112                   0.1  ...                     2000
1389                   0.2  ...                    20000
...                    ...  ...                      ...
10817                  0.1  ...                     2000
5343                   0.8  ...                    20000
5255                   0.8  ...                    20000
10628                  0.1  ...                     2000
10870                  0.1  ...                     2000

[8860 rows x 5 columns]


In [None]:
# Check each column for missing values.
# data.any() reduces every column to one boolean before np.isnan() sees it, so
# that form can never report a NaN; test the individual cells first, then reduce.
data.isna().any()

TestVolume (litres)           False
Final Result Code (1=Pass)    False
Fill time (s) / 1000          False
Stabilisation Time(s)/1000    False
Measuring Time (s)/1000       False
Venting time(s)/1000          False
dtype: bool

In [None]:
# Confirm that every value in each column is finite (neither NaN nor +/-inf).
# The finiteness test has to run element-wise before the all() reduction;
# np.isfinite(data.all()) only inspects the already-reduced booleans.
np.isfinite(data).all()

TestVolume (litres)           True
Final Result Code (1=Pass)    True
Fill time (s) / 1000          True
Stabilisation Time(s)/1000    True
Measuring Time (s)/1000       True
Venting time(s)/1000          True
dtype: bool

In [None]:
# Convert every training feature column to 64-bit floats
X_train = X_train.astype("float64")

In [None]:
# Preview the training features without any column that contains a NaN.
# The result is only displayed, not assigned, so X_train itself is unchanged.
X_train.drop(columns=X_train.columns[X_train.isna().any()])

Unnamed: 0,TestVolume (litres),Final Result Code (1=Pass),Fill time (s) / 1000,Stabilisation Time(s)/1000,Measuring Time (s)/1000
2434,0.2,1.0,10000.0,60000.0,20000.0
3869,0.4,1.0,12000.0,60000.0,20000.0
10824,0.1,1.0,1000.0,25000.0,2000.0
1112,0.1,1.0,2000.0,25000.0,2000.0
1389,0.2,1.0,10000.0,60000.0,20000.0
...,...,...,...,...,...
10817,0.1,1.0,1000.0,25000.0,2000.0
5343,0.8,1.0,18000.0,90000.0,20000.0
5255,0.8,1.0,18000.0,90000.0,20000.0
10628,0.1,1.0,1000.0,25000.0,2000.0


In [None]:
# Show the dtype of each training feature column
X_train.dtypes

TestVolume (litres)           float64
Final Result Code (1=Pass)    float64
Fill time (s) / 1000          float64
Stabilisation Time(s)/1000    float64
Measuring Time (s)/1000       float64
dtype: object

In [None]:
# Show the dtype of the training target column
y_train.dtypes

Venting time(s)/1000    int64
dtype: object

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features
X_train.describe()

Unnamed: 0,TestVolume (litres),Final Result Code (1=Pass),Fill time (s) / 1000,Stabilisation Time(s)/1000,Measuring Time (s)/1000
count,8860.0,8860.0,8860.0,8860.0,8860.0
mean,1.743363,2.680135,11353.085779,43340.897291,15027.268623
std,3.309437,3.75451,12642.878955,24157.281739,14971.746812
min,0.1,1.0,1000.0,1000.0,50.0
25%,0.1,1.0,2000.0,25000.0,2000.0
50%,0.2,1.0,9600.0,25000.0,9625.0
75%,0.8,1.0,18000.0,60000.0,20000.0
max,10.0,13.0,93000.0,129400.0,100000.0


In [None]:
# Locate (row, column) positions of any value at or above the largest finite float64
np.nonzero(X_train.to_numpy() >= np.finfo(np.float64).max)

(array([], dtype=int64), array([], dtype=int64))

In [None]:
# Replace any missing training feature with the mean of its column.
# The result is assigned explicitly rather than using inplace=True, which
# mutates the frame behind the reader's back and offers no performance benefit.
X_train = X_train.fillna(X_train.mean())

In [None]:
# Print the shapes of the four splits, one per line: X_train, y_train, X_test, y_test
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(8860, 5)
(8860, 1)
(2216, 5)
(2216, 1)


In [None]:
# Train a logistic regression classifier on the venting-time classes.
# Only max_iter differs from the scikit-learn defaults, so the remaining keyword
# arguments are omitted (including multi_class, which recent releases deprecate).
model = LogisticRegression(max_iter=5000)
# y_train is a single-column DataFrame; flatten it to the 1-D label array that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
model.fit(X_train, y_train.values.ravel())


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict the held-out test set and print the accuracy score
y_pred = model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.6944945848375451


In [None]:
# Run a test prediction on a single hand-written sample.
# The values follow the training column order; wrapping them in a DataFrame that
# reuses X_train's column labels keeps the input consistent with what the model
# was fitted on (a bare array carries no feature names).
model.predict(pd.DataFrame([[0.1, 1, 1000, 25000, 1000]], columns=X_train.columns))

array([2000])

In [None]:
# Save the model (serialize)
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving it open until garbage collection.
with open("TimetoVenting.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the model (deserialize)
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
# The context manager guarantees the file handle is closed after reading.
with open("TimetoVenting.pkl", "rb") as model_file:
    model_pk = pickle.load(model_file)

In [None]:
# Rerun a prediction with the reloaded model to confirm it survived the round trip.
# The sample is wrapped in a DataFrame that reuses X_train's column labels so the
# input carries the same feature names the model was fitted with.
model_pk.predict(pd.DataFrame([[0.8, 1, 5000, 1000, 1000]], columns=X_train.columns))

array([2000])