# Data Preprocessing

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

### Importing Dataset

In [2]:
# Load the ship fuel-consumption dataset; the path is relative, so the CSV must
# sit in the notebook's working directory.
df = pd.read_csv('ship_fuel_consumption_by_size.csv')

### Data understanding

In [3]:
# Schema check: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Ship_Size                         10000 non-null  object 
 1   Engine_Power_kW                   10000 non-null  float64
 2   Speed_knots                       10000 non-null  float64
 3   Fuel_Consumption_tonnes_per_hour  10000 non-null  float64
dtypes: float64(3), object(1)
memory usage: 312.6+ KB


In [4]:
# Summary statistics of the numeric columns, to see their ranges before scaling.
df.describe()

Unnamed: 0,Engine_Power_kW,Speed_knots,Fuel_Consumption_tonnes_per_hour
count,10000.0,10000.0,10000.0
mean,31517.728957,18.39373,169.911922
std,29039.084924,5.259232,239.595856
min,504.91,10.0,0.05
25%,5360.6075,13.9,3.8975
50%,20544.705,17.8,53.42
75%,49872.2075,22.525,180.105
max,99985.69,30.0,895.67


### Data Manipulation

#### Applying feature scaling to the numeric input columns

In [5]:
# Re-check dtypes before transforming: Ship_Size is the only non-numeric column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Ship_Size                         10000 non-null  object 
 1   Engine_Power_kW                   10000 non-null  float64
 2   Speed_knots                       10000 non-null  float64
 3   Fuel_Consumption_tonnes_per_hour  10000 non-null  float64
dtypes: float64(3), object(1)
memory usage: 312.6+ KB


In [6]:
# Continuous predictors to standardise. Their order here is the order the
# scaler is fitted with, so inference code must supply them the same way.
columns_to_scale = [
    "Engine_Power_kW",
    "Speed_knots",
]

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Kept as a notebook-level object because `predicting` reuses the fitted scaler
# to transform new inputs at inference time.
st_scaler = StandardScaler()

In [9]:
# Fit the scaler and overwrite the two numeric columns in place with their
# standardised values (the raw values are no longer available in `df` afterwards).
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook, so test-row statistics
# influence the training features. Fitting on the training rows only would
# avoid this leakage.
df[columns_to_scale] = st_scaler.fit_transform(df[columns_to_scale])

In [10]:
# Sanity check: the two scaled columns should now have mean ~0 and std ~1,
# while the target column is unchanged.
df.describe()

Unnamed: 0,Engine_Power_kW,Speed_knots,Fuel_Consumption_tonnes_per_hour
count,10000.0,10000.0,10000.0
mean,2.4158450000000003e-17,-2.671641e-16,169.911922
std,1.00005,1.00005,239.595856
min,-1.068022,-1.596079,0.05
25%,-0.9008007,-0.8544888,3.8975
50%,-0.3778898,-0.1128986,53.42
75%,0.6320928,0.7855666,180.105
max,2.357904,2.206948,895.67


#### Applying one-hot encoding to the categorical "Ship_Size" column

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
# sparse_output=False makes the encoder return a dense array, which is wrapped
# in a DataFrame further down.
# drop='first' omits one category per feature so the indicator columns are not
# perfectly collinear; the omitted category becomes the implicit baseline
# (all indicators 0) -- presumably 'Large' here, TODO confirm.
one_encoder = OneHotEncoder(sparse_output=False, drop='first')

In [13]:
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
encoded_ship_size = one_encoder.fit_transform(df[['Ship_Size']])

In [14]:
# Name the encoded columns after their source category (e.g. "Ship_Size_Medium").
encoded_columns = one_encoder.get_feature_names_out(['Ship_Size'])
# Reuse df's index: the column-wise concat in the next cell aligns rows by index
# label, so a fresh RangeIndex here would misalign rows (and introduce NaNs)
# as soon as `df` no longer has a default 0..n-1 index, e.g. after filtering.
encoded_df = pd.DataFrame(encoded_ship_size, columns=encoded_columns, index=df.index)

In [15]:
# Swap the raw categorical column for its one-hot indicator columns.
# Not idempotent: re-running this cell raises because 'Ship_Size' is already gone.
df = pd.concat([df.drop(columns=['Ship_Size']), encoded_df], axis=1)

In [16]:
# Confirm Ship_Size has been replaced by the float one-hot columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Engine_Power_kW                   10000 non-null  float64
 1   Speed_knots                       10000 non-null  float64
 2   Fuel_Consumption_tonnes_per_hour  10000 non-null  float64
 3   Ship_Size_Medium                  10000 non-null  float64
 4   Ship_Size_Small                   10000 non-null  float64
 5   Ship_Size_Very Large              10000 non-null  float64
dtypes: float64(6)
memory usage: 468.9 KB


In [17]:
# For the 0/1 indicator columns, the mean is the share of rows in that category.
df.describe()

Unnamed: 0,Engine_Power_kW,Speed_knots,Fuel_Consumption_tonnes_per_hour,Ship_Size_Medium,Ship_Size_Small,Ship_Size_Very Large
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.4158450000000003e-17,-2.671641e-16,169.911922,0.2531,0.2433,0.2491
std,1.00005,1.00005,239.595856,0.434809,0.429096,0.432513
min,-1.068022,-1.596079,0.05,0.0,0.0,0.0
25%,-0.9008007,-0.8544888,3.8975,0.0,0.0,0.0
50%,-0.3778898,-0.1128986,53.42,0.0,0.0,0.0
75%,0.6320928,0.7855666,180.105,1.0,0.0,0.0
max,2.357904,2.206948,895.67,1.0,1.0,1.0


### Splitting the dataset into training and test sets

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Model inputs: the two standardised numeric columns followed by the one-hot
# ship-size indicators. This order is the feature order the network is trained on.
training_col = [
    'Engine_Power_kW',
    'Speed_knots',
    'Ship_Size_Medium',
    'Ship_Size_Small',
    'Ship_Size_Very Large',
]
# Despite its name, `test_col` holds the regression target (y), not a test-set column.
test_col = ['Fuel_Consumption_tonnes_per_hour']

In [20]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Selecting the target with a list keeps y as a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    df[training_col],
    df[test_col],
    test_size=0.2,
    random_state=42,
)

In [21]:
# Report the (rows, columns) of each split to verify the 80/20 partition.
for label, split in (
    ("X_train.shape", X_train),
    ("X_test.shape", X_test),
    ("y_train.shape", y_train),
    ("y_test.shape", y_test),
):
    print(label, split.shape)

X_train.shape (8000, 5)
X_test.shape (2000, 5)
y_train.shape (8000, 1)
y_test.shape (2000, 1)


## Building an ANN model to predict fuel consumption

In [22]:
import tensorflow as tf

In [23]:
from tensorflow.keras.layers import Dense, InputLayer

In [24]:
from tensorflow.keras import Sequential, Input

In [25]:
# Start from an empty sequential container; the layers are added in the next cell.
model = Sequential()

In [26]:
# Network: 5 input features (one per entry of `training_col`) -> two hidden
# layers of 8 ReLU units -> a single output unit.
# Not idempotent: re-running this cell appends the layers to `model` again.
model.add(Input(shape=(5,)))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=8, activation='relu'))
# ReLU on the output keeps predictions non-negative.
# NOTE(review): a ReLU output unit can get stuck predicting 0 if its
# pre-activation goes negative; a linear or softplus output may be safer.
model.add(Dense(units=1, activation='relu'))

In [27]:
# Train on mean absolute error and report the same quantity as a metric.
# `metrics` is passed as a list: Keras documents it as a list of metrics, and
# Keras 3 rejects a bare string, whereas a one-element list works on both
# older and newer versions.
model.compile(
    optimizer='adam',
    loss="mean_absolute_error",
    metrics=['mean_absolute_error'],
)

In [28]:
# Show the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 48        
                                                                 
 dense_1 (Dense)             (None, 8)                 72        
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 129
Trainable params: 129
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train for 100 epochs with the default batch size. No validation data is
# passed, so the only held-out estimate comes from the test-set evaluation below.
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x15fd8ad2ec0>

In [30]:
# Predictions for the held-out rows, compared against y_test in the cells below.
y_pred = model.predict(X_test)



In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

In [32]:
# Evaluate the held-out predictions with MAE, MSE and RMSE, in that order.
abs_error, sqr_error, rms_error = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, root_mean_squared_error)
)

In [33]:
# Print each test-set error next to the expression that produced it.
for label, value in (
    ("mean_absolute_error(y_test, y_pred)", abs_error),
    ("mean_squared_error(y_test, y_pred)", sqr_error),
    ("root_mean_squared_error(y_test, y_pred)", rms_error),
):
    print(label, value)

mean_absolute_error(y_test, y_pred) 2.299992345542908
mean_squared_error(y_test, y_pred) 15.864129144354877
root_mean_squared_error(y_test, y_pred) 3.9829799327080315


In [34]:
def predicting(Ship_Size, Speed_knots, Engine_Power_kW):
    """Predict fuel consumption for a single ship from raw (unscaled) inputs.

    Applies the same preprocessing used for training -- the fitted
    ``st_scaler`` for the numeric features and the fitted ``one_encoder`` for
    the ship size -- then feeds the resulting feature row to ``model``.
    Intermediate values are printed for inspection.

    Parameters
    ----------
    Ship_Size : str
        Ship size category. ``one_encoder`` is created with the default
        unknown-category handling, so a category it was not fitted on is
        expected to raise in ``one_encoder.transform``.
    Speed_knots : float
        Speed in knots, on the original (unscaled) scale.
    Engine_Power_kW : float
        Engine power in kW, on the original (unscaled) scale.

    Returns
    -------
    numpy.ndarray
        ``model.predict(...)[0]``: the model output for the single input row
        (the network is trained on ``Fuel_Consumption_tonnes_per_hour``).
    """
    # Both transformers were fitted on DataFrames, so give them one-row
    # DataFrames with the same column names (and order). Passing bare nested
    # lists makes scikit-learn warn about missing feature names and removes
    # its check that the columns are supplied in the fitted order.
    numeric_row = pd.DataFrame(
        {"Engine_Power_kW": [Engine_Power_kW], "Speed_knots": [Speed_knots]}
    )
    scaled_features = st_scaler.transform(numeric_row)
    Engine_Power_kW_scaled, Speed_knots_scaled = scaled_features[0]
    print("Engine_Power_kW_scaled, Speed_knots_scaled : ", Engine_Power_kW_scaled, Speed_knots_scaled)

    # One-hot encode the category with the encoder fitted on the training data.
    category_row = pd.DataFrame({"Ship_Size": [Ship_Size]})
    encoded_ship_size = one_encoder.transform(category_row)
    print("encoded_ship_size : ", encoded_ship_size)

    # Assemble the features in the order the model was trained on:
    # scaled engine power, scaled speed, then the ship-size indicators.
    input_features = np.concatenate([[Engine_Power_kW_scaled, Speed_knots_scaled], encoded_ship_size[0]])
    print("input_features : ", input_features)

    # The model expects a 2-D batch, so add a leading axis: shape (1, num_features).
    input_features = np.expand_dims(input_features, axis=0)
    print("input_features reshaped: ", input_features)

    # Take the first (only) row of the batch of predictions.
    fuel = model.predict(input_features)[0]

    return fuel


In [35]:
# Example query on raw, unscaled values. Keyword arguments are used because
# `predicting` takes its parameters in a different order than the feature columns.
Engine_Power_kW, Speed_knots, Ship_Size = 8185, 13.5, 'Medium'
answer = predicting(
    Ship_Size=Ship_Size,
    Speed_knots=Speed_knots,
    Engine_Power_kW=Engine_Power_kW,
)
print("answer : ", answer)

Engine_Power_kW_scaled, Speed_knots_scaled :  -0.8035341245220446 -0.9305493696681827
encoded_ship_size :  [[1. 0. 0.]]
input_features :  [-0.80353412 -0.93054937  1.          0.          0.        ]
input_features reshaped:  [[-0.80353412 -0.93054937  1.          0.          0.        ]]
answer :  [4.4757633]


