In [1]:
import pandas as pd

df = pd.read_csv('measures_v2.csv')

df.rename(columns={'pm': 'motor_temperature'}, inplace=True)

# Fill missing values using linear interpolation
df.interpolate(method='linear', inplace=True)

# Fill any remaining missing values using backward filling
df.bfill(inplace=True)

# Calculate first and third quartiles
Q1 = df['motor_temperature'].quantile(0.25)
Q3 = df['motor_temperature'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows that is outside the bound range
df = df[(df['motor_temperature'] >= lower_bound) & (df['motor_temperature'] <= upper_bound)]


print("Data shape after cleaning:", df.shape)


Data shape after cleaning: (1330816, 13)


In [16]:
# Function to create lag features
def create_lag_features(dataframe, column, num_lags=3):
    for lag in range(1, num_lags + 1):
        dataframe[f'{column}_lag_{lag}'] = dataframe[column].shift(lag)
    return dataframe

# Add lag features for the 'motor_temperature' column
df = create_lag_features(df, 'motor_temperature', num_lags=3)

# Calculate rolling mean of 'torque' over a window of 3 rows
df['torque_roll_mean'] = df['torque'].rolling(window=3).mean()

# Remove rows with any NaN values (caused by lag or rolling operations)
df.dropna(inplace=True)

print("Data shape after cleaning", df.shape)

# Save the cleaned DataFrame to a CSV file
df.to_csv("processed_data.csv", index=False)
print("Cleaned data saved to 'processed_data.csv'")



Data shape after cleaning (1330810, 17)
Cleaned data saved to 'processed_data.csv'


In [3]:
from sklearn.preprocessing import StandardScaler

target_col = 'motor_temperature'
feature_cols = [col for col in df.columns if col != target_col]

X = df[feature_cols].values
y = df[target_col].values

scaler = StandardScaler()
scaler.fit(X)

X_scaled = scaler.transform(X)


In [4]:
import numpy as np

n = len(df)
train_end = int(n * 0.7) # Define the end index for training (70% of the data)
val_end = int(n * 0.85) # Define the end index for validation (85% of the data)

X_train = X_scaled[:train_end]
y_train = y[:train_end]

X_val = X_scaled[train_end:val_end]
y_val = y[train_end:val_end]

X_test = X_scaled[val_end:]
y_test = y[val_end:]

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)


Training set shape: (931569, 16)
Validation set shape: (199622, 16)
Test set shape: (199622, 16)


In [26]:
# Baseline Model: Linear Regression Model

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

lr_model = LinearRegression()

lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_val)

lr_mse = mean_squared_error(y_val, lr_preds)

print("Linear Regression Validation MSE:", lr_mse)



Linear Regression Validation MSE: 0.07835347393695728


In [28]:
# Baseline Model: Random Forest Model

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import time

# Set a sample size for training to speed up training
sample_size = 5000

# Randomly select a subset
if X_train.shape[0] > sample_size:
    indices = np.random.choice(X_train.shape[0], sample_size, replace=False)
    X_train_sample = X_train[indices]
    y_train_sample = y_train[indices]
else:
    X_train_sample = X_train
    y_train_sample = y_train

# Create a Random Forest model with specified parameters
rf_model = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=1, random_state=42)


start_time = time.time()
rf_model.fit(X_train_sample, y_train_sample)
end_time = time.time()
print("Training time: {:.2f} seconds".format(end_time - start_time))


rf_preds = rf_model.predict(X_val)

rf_mse = mean_squared_error(y_val, rf_preds)
print("Random Forest Validation MSE:", rf_mse)


Training time: 0.23 seconds
Random Forest Validation MSE: 0.09650606719444026


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import numpy as np
import time

# XGBoost Model

xgb_model = xgb.XGBRegressor(
    n_estimators=100,    
    max_depth=5,         
    learning_rate=0.1,   
    random_state=42,     
    n_jobs=1            
)

xgb_model.fit(X_train, y_train)


xgb_preds = xgb_model.predict(X_val)


xgb_mse = mean_squared_error(y_val, xgb_preds)
print("XGBoost Validation MSE:", xgb_mse)

# SVR Model

sample_size = 5000

if X_train.shape[0] > sample_size:
    indices = np.random.choice(X_train.shape[0], sample_size, replace=False)
    X_train_sub = X_train[indices]
    y_train_sub = y_train[indices]
else:
    X_train_sub = X_train
    y_train_sub = y_train


svr_model = SVR(
    kernel='rbf',
    C=1.0,       
    gamma=0.01  
)

svr_model.fit(X_train_sub, y_train_sub)


svr_preds = svr_model.predict(X_val)


svr_mse = mean_squared_error(y_val, svr_preds)
print("SVR Validation MSE:", svr_mse)


XGBoost Validation MSE: 0.08526683527803415
SVR Validation MSE: 0.7013052862571234
