In [1]:
# Step 1: Upload the dataset to Colab
from google.colab import files
import pandas as pd
import pickle


In [18]:
# Upload the file
# `files` is the google.colab helper imported in the first cell. The result is
# kept in `uploaded`; the loading cell reads its first key as the file name
# to open, so exactly one CSV file is expected here.
uploaded = files.upload()

Saving cleaned_machine_failure_data.csv to cleaned_machine_failure_data.csv


In [19]:
# Load the dataset
# `uploaded` comes from the upload cell; its keys are the uploaded file names.
# Guard against a cancelled/empty upload so the failure is explained instead of
# surfacing as an opaque IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run the upload cell and select the CSV file."
    )
file_name = next(iter(uploaded))  # Name of the first uploaded file
data = pd.read_csv(file_name)

In [20]:
# Step 2: Explore the dataset
# Preview the top of the frame (head() returns the first five rows by default);
# heading and frame are emitted by one print call, separated by a newline.
print("First five rows of the dataset:", data.head(), sep="\n")

First five rows of the dataset:
   Min_Temp  Max_Temp  Leakage  Electricity  Fail_tomorrow
0      13.4      22.9      0.6     7.624853              0
1       7.4      25.1      0.0     7.624853              0
2      12.9      25.7      0.0     7.624853              0
3       9.2      28.0      0.0     7.624853              0
4      17.5      32.3      1.0     7.624853              0


In [21]:

# Per-column count of missing (NaN) entries
print("\nMissing values in each column:")
print(data.isna().sum())

# Class balance of the target, or a diagnostic when the target column is absent
if 'Fail_tomorrow' not in data.columns:
    print("\nTarget column 'Fail_tomorrow' not found. Please check the dataset columns.")
    print("Available columns:", data.columns)
else:
    print("\nTarget column distribution (Fail_tomorrow):")
    print(data['Fail_tomorrow'].value_counts())


Missing values in each column:
Min_Temp         637
Max_Temp         322
Leakage            0
Electricity        0
Fail_tomorrow      0
dtype: int64

Target column distribution (Fail_tomorrow):
Fail_tomorrow
0    110316
1     31877
Name: count, dtype: int64


In [23]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from google.colab import files

# Train, evaluate, save and download a decision-tree model.
# This only checks that the target column exists; `data` itself must already
# have been created by the "Load the dataset" cell (otherwise: NameError).
if 'Fail_tomorrow' in data.columns:
    # Features (X) are every column except the target; y is the target column.
    X = data.drop(columns=['Fail_tomorrow'])
    y = data['Fail_tomorrow']

    # If a 'Date' column is present, replace it with numeric year/month/day
    # parts. errors='coerce' turns unparseable dates into NaT, so the derived
    # parts are NaN for those rows.
    if 'Date' in X.columns:
        X['Date'] = pd.to_datetime(X['Date'], errors='coerce')  # Handle invalid dates
        X['Year'] = X['Date'].dt.year
        X['Month'] = X['Date'].dt.month
        X['Day'] = X['Date'].dt.day
        X = X.drop(columns=['Date'])  # Remove the original 'Date' column

    # Convert object-typed (categorical) columns to integer codes.
    # NOTE(review): a fresh encoder is fitted per column on the full data set
    # and is not saved with the model, so anything that later loads
    # trained_model.pkl has to reproduce this encoding itself.
    for col in X.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])  # Fit and transform on the column

    # Hold out 20% of the rows for evaluation; the fixed seed makes the split
    # repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Decision Tree Classifier.
    # NOTE(review): the missing-value check earlier in the notebook reports NaNs
    # in Min_Temp and Max_Temp and nothing imputes them before this point, so
    # this relies on the installed scikit-learn accepting NaN features in
    # decision trees -- confirm the version or add an imputation step.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows. F1 is computed for the class labelled 1.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="binary", pos_label=1)
    print("\nModel Training Completed!")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Save the trained model to the working directory.
    model_filename = "trained_model.pkl"
    with open(model_filename, "wb") as file:
        pickle.dump(model, file)

    print(f"\nTrained model saved as '{model_filename}'")

    # Download the trained model (only applicable in Google Colab).
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate, and show the real cause instead of hiding it.
    try:
        files.download(model_filename)
    except Exception as exc:
        print("Download is only available in Google Colab.")
        print(f"Download failed with {type(exc).__name__}: {exc}")
else:
    print("\nCannot train the model. Target column 'Fail_tomorrow' is missing.")



Model Training Completed!
Accuracy: 0.74
F1 Score: 0.41

Trained model saved as 'trained_model.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>