In [13]:
import pandas as pd

# Define the paths to your datasets
sensor_data_path = r'telemetry.csv'
failure_data_path = r'failures.csv'

# Load the sensor data
sensor_data = pd.read_csv(sensor_data_path)

# Load the failure data
failure_data = pd.read_csv(failure_data_path)

# Convert datetime columns to datetime type for both datasets
sensor_data['datetime'] = pd.to_datetime(sensor_data['datetime'])
failure_data['datetime'] = pd.to_datetime(failure_data['datetime'])

sensor_data.set_index('datetime', inplace=True)

# Sort by datetime to ensure rolling windows work correctly
sensor_data.sort_index(inplace=True)

# Choose a window size, e.g., 24 hours
window_size = '24h'

# Create rolling features for each sensor
for column in ['volt', 'rotate', 'pressure', 'vibration']:
    sensor_data[f'{column}_mean'] = sensor_data.groupby('machineID')[column].transform(lambda x: x.rolling(window_size).mean())
    sensor_data[f'{column}_std'] = sensor_data.groupby('machineID')[column].transform(lambda x: x.rolling(window_size).std())

# For each standard deviation column, fill NaN values with the average of available standard deviations for that sensor
for column in ['volt_std', 'rotate_std', 'pressure_std', 'vibration_std']:
    # Calculate the average standard deviation for the column, excluding NaN values
    avg_std = sensor_data[column].mean()
    
    # Fill NaN values in the standard deviation column with this average standard deviation
    sensor_data[column].fillna(avg_std, inplace=True)

# Initialize a column for labels
sensor_data['failure_within_48h'] = 0

# For each failure, mark the preceding 48 hours as positive examples
for index, row in failure_data.iterrows():
    start_time = row['datetime'] - pd.Timedelta(hours=48)
    end_time = row['datetime']
    machine_id = row['machineID']
    
    sensor_data.loc[(sensor_data.index > start_time) & (sensor_data.index <= end_time) & (sensor_data['machineID'] == machine_id), 'failure_within_48h'] = 1

# Split the data based on a date. For example, use the last 20% of the dates as the test set.
split_date = sensor_data.index.max() - pd.Timedelta(days=365 * 0.2)  # Adjust based on your dataset's date range

train_data = sensor_data[sensor_data.index < split_date]
test_data = sensor_data[sensor_data.index >= split_date]

import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Features and target variable
X_train = train_data.drop(['failure_within_48h', 'machineID'], axis=1)
y_train = train_data['failure_within_48h']
X_test = test_data.drop(['failure_within_48h', 'machineID'], axis=1)
y_test = test_data['failure_within_48h']

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Since we've applied SMOTE, convert the resampled datasets to DMatrix for XGBoost
dtrain_resampled = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'max_depth': 6,  # Depth of each tree
    'eta': 0.3,  # Learning rate
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'auc',  # Evaluation metric
    'nthread': 4  # Number of cores to use
}

num_boost_round = 100

# Train the model on the resampled (balanced) dataset
bst_resampled = xgb.train(params, dtrain_resampled, num_boost_round, evals=[(dtest, 'test')], early_stopping_rounds=10)

# Predict the probabilities of failure
y_pred_proba_resampled = bst_resampled.predict(dtest)

# Convert probabilities to percentages
y_pred_percentages = [f"{x * 100:.2f}%" for x in y_pred_proba_resampled]

# Print the first few predictions to check
print("First few prediction percentages:", y_pred_percentages[:5])

# Convert probabilities to binary predictions using a threshold, e.g., 0.5
y_pred_resampled = [1 if x > 0.5 else 0 for x in y_pred_proba_resampled]

# Calculate and print accuracy, precision, recall, and F1 score
print("Accuracy:", accuracy_score(y_test, y_pred_resampled))
print("Precision:", precision_score(y_test, y_pred_resampled))
print("Recall:", recall_score(y_test, y_pred_resampled))
print("F1 Score:", f1_score(y_test, y_pred_resampled))

# Generate and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_resampled)
print("Confusion Matrix:")
print(conf_matrix)

results_df = pd.DataFrame({
    'datetime': test_data.index,
    'machineID': test_data['machineID'],
    'prediction': y_pred_resampled,
    'confidence': y_pred_percentages  # Confidence scores as percentages
})

results_df.to_csv('prediction_results.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sensor_data[column].fillna(avg_std, inplace=True)


[0]	test-auc:0.90512
[1]	test-auc:0.90958
[2]	test-auc:0.91607
[3]	test-auc:0.92073
[4]	test-auc:0.92363
[5]	test-auc:0.92402
[6]	test-auc:0.92461
[7]	test-auc:0.92539
[8]	test-auc:0.92606
[9]	test-auc:0.92578
[10]	test-auc:0.92623
[11]	test-auc:0.92571
[12]	test-auc:0.92558
[13]	test-auc:0.92573
[14]	test-auc:0.92534
[15]	test-auc:0.92475
[16]	test-auc:0.92543
[17]	test-auc:0.92512
[18]	test-auc:0.92496
[19]	test-auc:0.92507
[20]	test-auc:0.92473
First few prediction percentages: ['2.72%', '15.04%', '77.43%', '7.42%', '7.57%']
Accuracy: 0.8736052481460354
Precision: 0.20445103857566765
Recall: 0.886031184696994
F1 Score: 0.33223832916428075
Confusion Matrix:
[[147631  21448]
 [   709   5512]]
