In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib

In [None]:
# Read the training and test sets from CSV files in the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Separate the seven target columns from the predictors; 'id' is dropped from
# the features as well so it is never fed to a model
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                  'Dirtiness', 'Bumps', 'Other_Faults']
y_train = train_data[target_columns]
X_train = train_data.drop(columns=['id'] + target_columns)

In [None]:
# Preview the first rows of the target frame to sanity-check the split
y_train.head()

In [None]:
# Keep the identifiers aside for the submission file; the remaining columns
# are what the fitted model receives at prediction time
test_features = test_data.drop(columns='id')
test_ids = test_data['id']

In [None]:
# Initialize classifiers
rf_classifier = RandomForestClassifier()
lgbm_classifier = LGBMClassifier()
xgb_classifier = XGBClassifier()
catboost_classifier = CatBoostClassifier()
extratrees_classifier = ExtraTreesClassifier()

In [None]:
# Wrap each base estimator in a MultiOutputClassifier so it can be trained on
# the multi-column target frame; n_jobs=-1 is forwarded unchanged to every wrapper
(rf_multi_classifier,
 lgbm_multi_classifier,
 xgb_multi_classifier,
 catboost_multi_classifier,
 extratrees_multi_classifier) = (
    MultiOutputClassifier(base_estimator, n_jobs=-1)
    for base_estimator in (rf_classifier,
                           lgbm_classifier,
                           xgb_classifier,
                           catboost_classifier,
                           extratrees_classifier)
)

In [None]:
# Score every wrapped model with the same 5-fold cross-validation setup.
# NOTE(review): with several binary target columns, scikit-learn's 'accuracy'
# is presumably the exact-match (subset) accuracy - confirm this is the metric wanted.
cv_options = {'cv': 5, 'scoring': 'accuracy'}

rf_cv_scores = cross_val_score(rf_multi_classifier, X_train, y_train, **cv_options)
lgbm_cv_scores = cross_val_score(lgbm_multi_classifier, X_train, y_train, **cv_options)
xgb_cv_scores = cross_val_score(xgb_multi_classifier, X_train, y_train, **cv_options)
catboost_cv_scores = cross_val_score(catboost_multi_classifier, X_train, y_train, **cv_options)
extratrees_cv_scores = cross_val_score(extratrees_multi_classifier, X_train, y_train, **cv_options)

In [None]:
# Report the mean fold accuracy of each candidate, one line per model
cv_report = {
    "Random Forest CV Accuracy:": rf_cv_scores,
    "LightGBM CV Accuracy:": lgbm_cv_scores,
    "XGBoost CV Accuracy:": xgb_cv_scores,
    "CatBoost CV Accuracy:": catboost_cv_scores,
    "ExtraTrees CV Accuracy:": extratrees_cv_scores,
}
for heading, fold_scores in cv_report.items():
    print(heading, fold_scores.mean())

In [None]:
# Pick the candidate with the highest mean CV accuracy.
# best_model is a (mean_score, model_name) tuple; comparing tuples means a tie
# on the score is broken by the name.
scores_by_name = {
    'Random Forest': rf_cv_scores,
    'LightGBM': lgbm_cv_scores,
    'XGBoost': xgb_cv_scores,
    'CatBoost': catboost_cv_scores,
    'ExtraTrees': extratrees_cv_scores,
}
best_model = max(
    (fold_scores.mean(), model_name)
    for model_name, fold_scores in scores_by_name.items()
)

print("Best Model:", best_model[1])

In [None]:
# Refit the winning model on all of the training data.
# The winner is looked up by the name stored in best_model[1]; an unknown name
# leaves best_classifier as None.
models_by_name = {
    'Random Forest': rf_multi_classifier,
    'LightGBM': lgbm_multi_classifier,
    'XGBoost': xgb_multi_classifier,
    'CatBoost': catboost_multi_classifier,
    'ExtraTrees': extratrees_multi_classifier,
}
selected_model = models_by_name.get(best_model[1])
best_classifier = (
    None if selected_model is None else selected_model.fit(X_train, y_train)
)

In [None]:
# Persist the fitted winner to disk.
# Note: `model` receives joblib.dump's return value, not the estimator itself.
model = joblib.dump(value=best_classifier, filename='best_model.joblib')

In [None]:
# Predict class probabilities for the test rows; the result is indexed per
# target column when the submission is built
best_test_probs = best_classifier.predict_proba(X=test_features)

In [None]:
# Inspect the raw predict_proba output before it is reshaped into a submission
best_test_probs

In [None]:
# Build the submission: the test ids plus, for every target column, the
# second column of its predict_proba array (probability of the positive class)
positive_probabilities = {
    target: best_test_probs[position][:, 1]
    for position, target in enumerate(y_train.columns)
}
submission_df = pd.DataFrame({'id': test_ids}).assign(**positive_probabilities)

submission_df.to_csv('submission.csv', index=False)

In [None]:
import time  # no longer needed by this cell; kept in case other cells rely on the name
# Generate submission file
# The per-target time.sleep(6) and the full-frame print inside the loop were
# debugging leftovers: they stalled the cell for 6 seconds per target and
# dumped the whole DataFrame once per target. The file written is unchanged.
submission_df = pd.DataFrame({'id': test_ids})
for i, target in enumerate(y_train.columns):
    # Column 1 of each per-target array is the probability of the positive class
    submission_df[target] = best_test_probs[i][:, 1]
submission_df.to_csv('submission.csv', index=False)

# Show only a short preview instead of the entire frame
submission_df.head()

In [None]:
# List the target column names; these are the columns added to the submission
y_train.columns

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier

def load_train_file():
    """Ask the user for the training file and put its path in ``train_entry``.

    Opens a file-selection dialog. If the dialog is dismissed without a
    selection an error box is shown and the entry is left untouched;
    otherwise the entry text is replaced by the chosen path and a
    confirmation box is shown.
    """
    chosen_path = filedialog.askopenfilename(title="Select Train File")
    if not chosen_path:
        messagebox.showerror("Error", "Please select a train file.")
        return

    # Replace whatever the entry currently holds with the new path
    train_entry.delete(0, tk.END)
    train_entry.insert(0, chosen_path)
    messagebox.showinfo("File Loaded", "Train file loaded successfully!")

def load_test_file():
    """Ask the user for the test file and put its path in ``test_entry``.

    Opens a file-selection dialog. If the dialog is dismissed without a
    selection an error box is shown and the entry is left untouched;
    otherwise the entry text is replaced by the chosen path and a
    confirmation box is shown.
    """
    chosen_path = filedialog.askopenfilename(title="Select Test File")
    if not chosen_path:
        messagebox.showerror("Error", "Please select a test file.")
        return

    # Replace whatever the entry currently holds with the new path
    test_entry.delete(0, tk.END)
    test_entry.insert(0, chosen_path)
    messagebox.showinfo("File Loaded", "Test file loaded successfully!")

def train_and_predict():
    """Train a multi-output CatBoost model and write ``submission.csv``.

    Reads the file paths currently held in ``train_entry`` and ``test_entry``,
    fits a ``MultiOutputClassifier`` wrapping ``CatBoostClassifier`` on the
    training file, predicts the positive-class probability of every target
    column for the test rows and saves them, together with the test ids, to
    ``submission.csv`` in the current working directory.

    All outcomes are reported through message boxes: an error box when a path
    is missing, when a required column is absent, or when reading, training,
    predicting or writing fails; an info box on success. Nothing is returned.
    """
    # Single source of truth for the target columns (spelling matches the data)
    target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                      'Dirtiness', 'Bumps', 'Other_Faults']

    train_file = train_entry.get()
    test_file = test_entry.get()

    if not train_file or not test_file:
        messagebox.showerror("Error", "Please select both train and test files.")
        return

    try:
        train_data = pd.read_csv(train_file)
        test_data = pd.read_csv(test_file)

        # Fail early with a readable message instead of a bare KeyError
        missing_in_train = [column for column in ['id'] + target_columns
                            if column not in train_data.columns]
        if missing_in_train:
            raise ValueError(
                "Train file is missing columns: " + ", ".join(missing_in_train))
        if 'id' not in test_data.columns:
            raise ValueError("Test file is missing the 'id' column.")

        X_train = train_data.drop(columns=['id'] + target_columns)
        y_train = train_data[target_columns]

        catboost_classifier = CatBoostClassifier()
        best_model = MultiOutputClassifier(catboost_classifier, n_jobs=-1).fit(X_train, y_train)

        test_ids = test_data['id']
        test_features = test_data.drop(columns='id')
        best_test_probs = best_model.predict_proba(test_features)

        submission_df = pd.DataFrame({'id': test_ids})
        for i, target in enumerate(target_columns):
            # Column 1 of each per-target array is the positive-class probability
            submission_df[target] = best_test_probs[i][:, 1]

        submission_df.to_csv('submission.csv', index=False)
    except Exception as exc:
        # This runs as a Tk button callback: an uncaught exception would only
        # be printed to stderr, leaving the GUI user with no feedback at all.
        messagebox.showerror("Error", f"Prediction failed: {exc}")
        return

    messagebox.showinfo("Info", "Prediction completed and submission file generated!")

def _build_file_row(parent, caption, browse_command):
    """Create one 'label / path entry / Browse button' row packed into ``parent``.

    Returns the ``(frame, label, entry, button)`` widgets in that order.
    """
    cell_padding = {'padx': 5, 'pady': 5}

    frame = tk.Frame(parent)
    frame.pack(pady=10)

    label = tk.Label(frame, text=caption)
    label.grid(row=0, column=0, **cell_padding)

    entry = tk.Entry(frame, width=40)
    entry.grid(row=0, column=1, **cell_padding)

    button = tk.Button(frame, text="Browse", command=browse_command)
    button.grid(row=0, column=2, **cell_padding)

    return frame, label, entry, button


# Main window
root = tk.Tk()
root.title("Fault Classification Model")

# One row per input file; the entries are read by the callbacks defined above
train_frame, train_label, train_entry, train_button = _build_file_row(
    root, "Train File:", load_train_file)
test_frame, test_label, test_entry, test_button = _build_file_row(
    root, "Test File:", load_test_file)

# Button that launches training and prediction
predict_button = tk.Button(root, text="Predict", command=train_and_predict)
predict_button.pack(pady=5)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()
