In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the semicolon-separated export and parse the timestamp column to datetimes.
df = pd.read_csv("../backup/processed.csv", sep=";").assign(
    timestamps_UTC=lambda frame: pd.to_datetime(frame['timestamps_UTC'])
)

In [None]:
# Summary statistics of the loaded frame, displayed as the cell output.
df.describe()

- AirTemp shouldn't be over 65°C,
- WaterTemp shouldn't be over 100°C,
- OilTemp shouldn't be over 115°C.

These are the maximum accepted temperatures for each fluid; above them, the engine should automatically shut down.
=> This should allow us to verify whether a reading is wrong (no engine shutoff) or whether it is a genuine overheating event (engine shutoff).

In [None]:
# Independent copy of the loaded frame; later cells reassign `df_raw`, not `df`.
df_raw = df.copy()

In [None]:
AirLimit = 65
WaterLimit = 100
OilLimit = 115

# Relative tolerance applied on top of each limit (0.1 -> limit * 1.1).
precision = 0.1

# Keep only rows whose PC1 reading is strictly positive and strictly below the
# fluid's limit plus tolerance. Filters are applied one column after another.
temperature_limits = {
    'RS_E_InAirTemp_PC1': AirLimit,
    'RS_E_WatTemp_PC1': WaterLimit,
    'RS_T_OilTemp_PC1': OilLimit,
}
for column, limit in temperature_limits.items():
    in_range = (df_raw[column] > 0) & (df_raw[column] < limit * (1 + precision))
    df_raw = df_raw[in_range]

# Overlaid histograms of the three filtered PC1 temperatures (air, oil, water).
bins = 80
for column in ('RS_E_InAirTemp_PC1', 'RS_T_OilTemp_PC1', 'RS_E_WatTemp_PC1'):
    df_raw[column].hist(bins=bins)
plt.legend(['Air', 'Oil', 'Water'])
plt.show()

In [None]:
# Restrict the analysis to vehicle 102 and renumber its rows 0..n-1.
# `reset_index(drop=True)` returns a new frame (so nothing is modified in place
# on a slice of `df_raw`) and discards the old index instead of inserting it as
# an `index` / `level_0` column, which would otherwise end up among the model
# features taken from `data_train`.
data = df_raw[df_raw['mapped_veh_id'] == 102].reset_index(drop=True)

# Separate copy used to build the training features.
data_train = data.copy()

# Random Forest

In [None]:
### Thresholds given by a plain univariate z-score cut-off (mean +/- 3 std)
air_temp = data['RS_E_InAirTemp_PC1']
air_mean = air_temp.mean()
air_std = air_temp.std()
threshold_max = air_mean + 3 * air_std
threshold_min = air_mean - 3 * air_std

# Rows outside the band: low-side outliers first, then high-side ones
df_outliers_max = data[air_temp > threshold_max]
df_outliers_min = data[air_temp < threshold_min]
df_outliers_univar = pd.concat([df_outliers_min, df_outliers_max])

# Summary of the distribution
print(f"Mean: {air_mean:.2f}")
print(f"Standard Deviation: {air_std:.2f}")
print()

# The two cut-offs
print(f"Threshold for outliers (mean + 3 * std): {threshold_max:.2f}")
print(f"Threshold for outliers (mean - 3 * std): {threshold_min:.2f}")
print()

# How many rows fall outside the band
print(f"Number of datapoints flagged as outliers: {df_outliers_univar.shape[0]} ")

In [None]:
# Air temperature against the row index of `data`, with the univariate
# z-score outliers drawn on top in red.
fig, ax = plt.subplots(figsize=(10, 6))

# All points in blue
ax.scatter(data.index, data['RS_E_InAirTemp_PC1'], marker='o', color='b', alpha=0.5, label='Non-Outliers')

# Flagged points in red
ax.scatter(df_outliers_univar.index, df_outliers_univar['RS_E_InAirTemp_PC1'], marker='o', color='r', alpha=0.5, label='Outliers')

ax.set_xlabel('Time')
ax.set_ylabel('Air Temperature')
ax.set_title('Air Temperature with Outliers over Time')

# Legend distinguishes outliers from the rest
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Columns compared in the scatter plot
air_temp_pc1 = data['RS_E_InAirTemp_PC1']
air_temp_pc2 = data['RS_E_InAirTemp_PC2']

# PC1 on the x-axis, PC2 on the y-axis; alpha makes overlapping points visible
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(air_temp_pc1, air_temp_pc2, alpha=0.5)

ax.set_xlabel('Air Temperature PC1')
ax.set_ylabel('Air Temperature PC2')
ax.set_title('Scatter Plot Air Temperature: PC1 vs. PC2')

ax.grid(True)
plt.show()

In [None]:
# RPM columns compared in the scatter plot
rpm_pc1 = data['RS_E_RPM_PC1']
rpm_pc2 = data['RS_E_RPM_PC2']

# PC1 on the x-axis, PC2 on the y-axis; alpha makes overlapping points visible
plt.figure(figsize=(10, 6))
plt.scatter(rpm_pc1, rpm_pc2, alpha=0.5)

# Each axis is labelled with the column it actually shows
plt.xlabel('RS_E_RPM_PC1')
plt.ylabel('RS_E_RPM_PC2')
plt.title('Scatter Plot RPM: PC1 vs. PC2')

plt.grid(True)
plt.show()

In [None]:
# Scatter plot of RPM vs oil pressure, PC1 and PC2 overlaid
plt.figure(figsize=(10, 6))
plt.scatter(data['RS_E_RPM_PC1'], data['RS_E_OilPress_PC1'], alpha=1)
plt.scatter(data['RS_E_RPM_PC2'], data['RS_E_OilPress_PC2'], alpha=1)
plt.xlabel('RPM')
plt.ylabel('Oil Pressure')
plt.legend(['PC1', 'PC2'])
plt.title('Scatter Plot: RPM vs. Oil Pressure')
plt.grid(True)
plt.show()

# Number of rows reporting an oil pressure of exactly 0, per PC
print(len(data[data['RS_E_OilPress_PC1'] == 0]))
print(len(data[data['RS_E_OilPress_PC2'] == 0]))

# Rows where the RPM reading is exactly 0, selected once per PC and reused below
rpm_zero_pc1 = data[data['RS_E_RPM_PC1'] == 0]
rpm_zero_pc2 = data[data['RS_E_RPM_PC2'] == 0]

# Same scatter plot, limited to the rows where RPM = 0
plt.figure(figsize=(10, 6))
plt.scatter(rpm_zero_pc1['RS_E_RPM_PC1'], rpm_zero_pc1['RS_E_OilPress_PC1'], alpha=1)
plt.scatter(rpm_zero_pc2['RS_E_RPM_PC2'], rpm_zero_pc2['RS_E_OilPress_PC2'], alpha=1)
plt.xlabel('RPM')
plt.ylabel('Oil Pressure')
plt.legend(['PC1', 'PC2'])
# Distinct title so this figure is not confused with the full scatter above
plt.title('Scatter Plot: RPM vs. Oil Pressure when RPM = 0')
plt.grid(True)

# Bar plot of how often each oil pressure value occurs when RPM = 0
press_counts_pc2 = rpm_zero_pc2['RS_E_OilPress_PC2'].value_counts()
press_counts_pc1 = rpm_zero_pc1['RS_E_OilPress_PC1'].value_counts()

plt.figure(figsize=(10, 6))
# PC2 is drawn first, PC1 on top; the legend order below matches
plt.bar(press_counts_pc2.index, press_counts_pc2.values, alpha=1)
plt.bar(press_counts_pc1.index, press_counts_pc1.values, alpha=1)

plt.xlabel('Oil Pressure')
plt.ylabel('Frequency')
plt.legend(['PC2', 'PC1'])
plt.title('Bar Plot: Oil Pressure Frequency when RPM = 0')
plt.grid(True)
plt.show()

In [None]:
# Oil and water temperature columns for both PCs
oil_temp_pc1 = data['RS_T_OilTemp_PC1']
water_temp_pc1 = data['RS_E_WatTemp_PC1']
oil_temp_pc2 = data['RS_T_OilTemp_PC2']
water_temp_pc2 = data['RS_E_WatTemp_PC2']

# One figure, PC1 drawn first and PC2 second (the legend relies on this order)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(oil_temp_pc1, water_temp_pc1, alpha=0.5)
ax.scatter(oil_temp_pc2, water_temp_pc2, alpha=0.5)

ax.set_xlabel('Oil Temperature')
ax.set_ylabel('Water Temperature')
ax.set_title('Scatter Plot Correlation between Oil and Water Temperature')
ax.legend(['PC1', 'PC2'])
ax.grid(True)
plt.show()

In [None]:
# Air temperature PC1 vs PC2, highlighting the univariate z-score outliers.
air_temp_pc1 = data['RS_E_InAirTemp_PC1']
air_temp_pc2 = data['RS_E_InAirTemp_PC2']

# True for rows of `data` whose index label is among the flagged outliers
is_univar_outlier = data.index.isin(df_outliers_univar.index)

plt.figure(figsize=(10, 6))

# Points that were not flagged
plt.scatter(
    air_temp_pc1[~is_univar_outlier],
    air_temp_pc2[~is_univar_outlier],
    alpha=0.5,
    label='Not Outliers',
)

# Flagged points, in red
plt.scatter(
    air_temp_pc1[is_univar_outlier],
    air_temp_pc2[is_univar_outlier],
    alpha=0.5,
    color='red',
    label='Outliers',
)

# Labels name the columns that are actually plotted
plt.xlabel('Air Temperature PC1')
plt.ylabel('Air Temperature PC2')
plt.title('Scatter Plot Air Temperature: PC1 vs. PC2 (univariate z-score outliers)')

plt.legend()
plt.grid(True)
plt.show()

In [None]:
def split_X_y(df, target_label):
    """Separate a frame into features and target.

    Args:
        df: DataFrame containing both the feature columns and the target.
        target_label: Column label of the target.

    Returns:
        Tuple ``(X, y)``: ``X`` is ``df`` without the target column and ``y``
        is the target column. ``df`` itself is not modified.
    """
    target = df[target_label]
    features = df.drop(columns=target_label)
    return features, target

In [None]:
# Features: every column of `data_train` except the two time columns and the
# target; target: RS_E_InAirTemp_PC1.
model_frame = data_train.drop(columns=["timestamps_UTC", "time_difference"])
X, y = split_X_y(model_frame, 'RS_E_InAirTemp_PC1')

In [None]:
def train_random_forest_regressor(X, y, n_estimators=100, random_state=42):
    """Fit a random forest on ``(X, y)`` and return residuals and feature importances.

    Residuals are computed from out-of-bag (OOB) predictions: each row is
    predicted only by the trees that did not see it during fitting. Predicting
    the training rows with the full forest would give residuals that are
    artificially small, because the forest has already seen those rows.

    Args:
        X: Feature DataFrame (its ``columns`` label the importances).
        y: Target values, one per row of ``X``.
        n_estimators: Number of trees in the forest.
        random_state: Seed passed to the forest for reproducibility.

    Returns:
        Tuple ``(residuals, feature_importances)`` where ``residuals`` is
        ``y`` minus the OOB prediction and ``feature_importances`` is a
        DataFrame with columns ``Feature`` and ``Importance``, sorted by
        decreasing importance.
    """
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # oob_score=True makes the fitted model expose `oob_prediction_`
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        oob_score=True,
    )
    model.fit(X_scaled, y)

    # Out-of-sample prediction for every training row
    y_pred = model.oob_prediction_

    residuals = y - y_pred

    # One importance per feature column, most important first
    importances_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    })
    feature_importances = importances_df.sort_values(by='Importance', ascending=False)

    return residuals, feature_importances

In [None]:
def find_outliers_z_score(df, residuals, z_score_cut_off):
    """Select the rows of ``df`` whose residual is unusually high.

    A row is kept when its residual is strictly greater than
    ``mean(residuals) + z_score_cut_off * std(residuals)``. Only the upper
    side is tested; strongly negative residuals are not flagged.

    Args:
        df: DataFrame to select rows from.
        residuals: One residual per row of ``df``.
        z_score_cut_off: Number of standard deviations above the mean.

    Returns:
        The subset of ``df`` rows exceeding the threshold.
    """
    upper_bound = residuals.mean() + z_score_cut_off * residuals.std()
    is_outlier = residuals > upper_bound
    return df[is_outlier]

In [None]:
## Possible later optimization: drop the columns whose feature importance is zero.
residuals, feature_importance = train_random_forest_regressor(X, y)

# Displayed as the cell output
feature_importance

In [None]:
# Sanity check: there should be one residual per row of `data`
print(len(residuals))
print(len(data))

## Mean air temperature over all rows
mean_total = data['RS_E_InAirTemp_PC1'].mean()

## Rows whose residual lies more than 3 standard deviations above the mean residual
df_outliers = find_outliers_z_score(data, residuals, 3)
num_outliers = len(df_outliers)

## Mean air temperature of the flagged rows
mean_outliers = df_outliers['RS_E_InAirTemp_PC1'].mean()

# Re-indexed 0..k-1 copy of the outliers. It must NOT be used to drop rows from
# `data`: its labels are positions within the outlier set, not rows of `data`.
df_outliers_reset = df_outliers.reset_index(drop=True)

# Remove the outliers by their original index labels, so that exactly the
# flagged rows are excluded
df_no_outliers = data.drop(index=df_outliers.index)

## Mean air temperature of the remaining rows
mean_no_outliers = df_no_outliers['RS_E_InAirTemp_PC1'].mean()

print(f"Number of outliers detected: {num_outliers}")
print(f"Mean air temperature of entire dataset: {mean_total}")
print(f"Mean air temperature of outliers: {mean_outliers}")
print(f"Mean air temperature without outliers: {mean_no_outliers}")

In [None]:
# Air temperature per row of `data`, with the rows flagged from the model
# residuals (`df_outliers`) drawn on top in red.
plt.figure(figsize=(10, 6))

# All rows, in blue; x is the row position
plt.scatter(
    range(len(data)),
    data['RS_E_InAirTemp_PC1'],
    marker='o',
    color='blue',
    alpha=0.5,
    label='Non-Outliers'
)

# Flagged rows, in red; x is the row's index label in `data`
plt.scatter(
    df_outliers.index,
    df_outliers['RS_E_InAirTemp_PC1'],
    marker='o',
    color='red',
    alpha=0.5,
    label='Outliers'
)

# Labels name the quantity that is actually plotted
plt.xlabel('Data Points')
plt.ylabel('Air Temperature')
plt.title('Distribution of Air Temperature with Outliers (Red)')

plt.legend()
plt.show()

In [None]:
# Air temperature PC1 vs PC2, highlighting the rows flagged from the model
# residuals (`df_outliers`).
air_temp_pc1 = data['RS_E_InAirTemp_PC1']
air_temp_pc2 = data['RS_E_InAirTemp_PC2']

# True for rows of `data` whose index label is among the flagged outliers
is_model_outlier = data.index.isin(df_outliers.index)

plt.figure(figsize=(10, 6))

# Points that were not flagged
plt.scatter(
    air_temp_pc1[~is_model_outlier],
    air_temp_pc2[~is_model_outlier],
    alpha=0.5,
    label='Not Outliers',
)

# Flagged points, in red
plt.scatter(
    air_temp_pc1[is_model_outlier],
    air_temp_pc2[is_model_outlier],
    alpha=0.5,
    color='red',
    label='Outliers',
)

# Labels name the columns that are actually plotted
plt.xlabel('Air Temperature PC1')
plt.ylabel('Air Temperature PC2')
plt.title('Scatter Plot Air Temperature: PC1 vs. PC2 (random forest residual outliers)')

plt.legend()
plt.grid(True)
plt.show()