# In [2]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

# Custom transformer for random imputation
class RandomImputer(BaseEstimator, TransformerMixin):
    """Fill missing values by sampling from the values observed during ``fit``.

    For every configured column the non-missing values seen in ``fit`` are
    stored; ``transform`` replaces each missing entry with one of those
    values drawn uniformly at random.

    Parameters
    ----------
    columns : iterable of str
        Names of the DataFrame columns to impute.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed or generator used for sampling. ``None`` keeps the historical
        behaviour of drawing from NumPy's global random state (so
        ``np.random.seed`` still controls the result).
    """

    def __init__(self, columns, random_state=None):
        self.columns = columns
        self.random_state = random_state

    def fit(self, X, y=None):
        """Store the observed (non-missing) values of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing at least the configured columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.values = {col: X[col].dropna().to_numpy() for col in self.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing entries replaced by random donors.

        Raises
        ------
        ValueError
            If a column has missing entries to fill but no value was
            observed for it during ``fit``.
        """
        X = X.copy()
        rng = self._resolve_rng()
        for col in self.columns:
            missing = X[col].isna()
            n_missing = int(missing.sum())
            if n_missing == 0:
                continue

            donors = self.values[col]
            if len(donors) == 0:
                raise ValueError(
                    f"Cannot impute column {col!r}: it had no non-missing "
                    "values during fit."
                )

            # One vectorised draw per column instead of one call per cell.
            X.loc[missing, col] = rng.choice(donors, size=n_missing)
        return X

    def _resolve_rng(self):
        """Return the object whose ``choice`` method performs the sampling."""
        seed = self.random_state
        if seed is None:
            return np.random
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

# Custom transformer for outlier detection and capping using IQR
class OutlierCapper(BaseEstimator, TransformerMixin):
    """Cap outliers to IQR-based fences learned during ``fit``.

    For each configured column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; ``transform`` clips values to that interval.

    Parameters
    ----------
    columns : iterable of str
        Names of the DataFrame columns to cap.
    factor : float, default=1.5
        Multiplier applied to the interquartile range when computing the
        fences. The default reproduces the previously hard-coded 1.5.
    """

    def __init__(self, columns, factor=1.5):
        self.columns = columns
        self.factor = factor

    def fit(self, X, y=None):
        """Compute and store the (lower, upper) fence of every column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing at least the configured columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.bounds = {}
        for col in self.columns:
            # Both quartiles in a single pass over the column.
            q1, q3 = X[col].quantile([0.25, 0.75])
            margin = self.factor * (q3 - q1)
            self.bounds[col] = (q1 - margin, q3 + margin)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each column clipped to its fences."""
        X = X.copy()
        for col in self.columns:
            lower_bound, upper_bound = self.bounds[col]
            X[col] = X[col].clip(lower=lower_bound, upper=upper_bound)
        return X

class PCAComparisonApp:
    def __init__(self, root):
        """Build the main window.

        The left sidebar holds the title and the five action buttons; the
        right panel holds a result label and an embedded matplotlib figure.

        Args:
            root: The Tk root window that hosts the application.
        """
        sidebar_bg = "#1e272e"
        panel_bg = "#2f3640"
        accent = "#00cec9"
        button_font = ("Arial", 10, "bold")

        # Application state, filled in by the button callbacks.
        self.df = None
        self.preprocessed_data = None
        self.accuracy_without_pca = None
        self.accuracy_with_pca = None

        # Window
        self.root = root
        root.title("Data Preprocessing with/without PCA")
        root.geometry("1000x600")
        root.configure(bg=sidebar_bg)

        # Sidebar (left) and results panel (right)
        self.left_frame = tk.Frame(root, bg=sidebar_bg, width=300)
        self.left_frame.pack(side=tk.LEFT, fill=tk.Y, padx=10, pady=10)
        self.right_frame = tk.Frame(root, bg=panel_bg)
        self.right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True, padx=10, pady=10)

        heading = tk.Label(
            self.left_frame,
            text="PCA Comparison",
            font=("Arial", 16, "bold"),
            fg=accent,
            bg=sidebar_bg,
        )
        heading.pack(pady=10)

        def add_button(caption, callback, colour, **extra):
            # All sidebar buttons share size, font and padding.
            widget = tk.Button(
                self.left_frame,
                text=caption,
                command=callback,
                width=20,
                bg=colour,
                fg="white",
                font=button_font,
                **extra,
            )
            widget.pack(pady=10)
            return widget

        # Only "Load CSV" starts enabled; the others are unlocked by callbacks.
        self.load_button = add_button("Load CSV", self.load_csv, "#0984e3")
        self.preprocess_without_pca_button = add_button(
            "Preprocess Without PCA", self.preprocess_without_pca, "#d63031", state=tk.DISABLED
        )
        self.preprocess_with_pca_button = add_button(
            "Preprocess With PCA", self.preprocess_with_pca, "#fdcb6e", state=tk.DISABLED
        )
        self.compare_button = add_button(
            "Compare Accuracies", self.compare_accuracies, "#6c5ce7", state=tk.DISABLED
        )
        self.export_button = add_button(
            "Export Preprocessed Data", self.export_to_csv, "#00b894", state=tk.DISABLED
        )

        # Results panel: status text above an embedded chart.
        self.result_label = tk.Label(
            self.right_frame, text="", font=("Arial", 12), fg=accent, bg=panel_bg
        )
        self.result_label.pack(pady=10)

        self.figure = plt.Figure(figsize=(6, 4), dpi=100)
        self.ax = self.figure.add_subplot(111)
        self.canvas = FigureCanvasTkAgg(self.figure, self.right_frame)
        self.canvas.get_tk_widget().pack()

    def load_csv(self):
        """Ask the user for a CSV file and load it into ``self.df``.

        On success, every result derived from a previously loaded dataset
        (accuracies, exported matrix, chart, result text) is discarded so
        that stale numbers can never be compared with, or exported as,
        results of the new file. The two preprocessing buttons are enabled
        and the compare/export buttons are disabled again.

        Cancelling the dialog does nothing; a read failure is reported in an
        error dialog and leaves the current state untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
        if not file_path:
            return  # dialog cancelled

        try:
            loaded = pd.read_csv(file_path)
        except Exception as e:  # UI boundary: report any read/parse failure
            messagebox.showerror("Error", f"Failed to load CSV: {str(e)}")
            return

        self.df = loaded

        # Drop everything computed from the previous dataset.
        self.preprocessed_data = None
        self.accuracy_without_pca = None
        self.accuracy_with_pca = None
        self.result_label["text"] = ""
        self.ax.clear()
        self.canvas.draw()
        self.compare_button["state"] = tk.DISABLED
        self.export_button["state"] = tk.DISABLED

        # Enable buttons after loading CSV
        self.preprocess_without_pca_button["state"] = tk.NORMAL
        self.preprocess_with_pca_button["state"] = tk.NORMAL

        messagebox.showinfo("Success", "CSV file loaded successfully!")
        self.log(f"CSV loaded: {file_path}")

    def preprocess_without_pca(self):
        """Run the pipeline without PCA and display the resulting accuracy.

        Failures raised by ``preprocess_data`` (for example a CSV that lacks
        the expected columns) are shown in an error dialog instead of
        escaping the Tk callback, and leave the stored accuracy unchanged.
        """
        if self.df is None:
            messagebox.showerror("Error", "Please load a CSV file first.")
            return

        try:
            accuracy = self.preprocess_data(use_pca=False)
        except Exception as e:  # UI boundary: surface the failure to the user
            messagebox.showerror("Error", f"Preprocessing without PCA failed: {str(e)}")
            return

        self.accuracy_without_pca = accuracy
        self.result_label["text"] = f"Accuracy without PCA: {self.accuracy_without_pca:.2f}"

        # Enable compare and export buttons
        self.compare_button["state"] = tk.NORMAL
        self.export_button["state"] = tk.NORMAL

    def preprocess_with_pca(self):
        """Run the pipeline with PCA and display the resulting accuracy.

        Failures raised by ``preprocess_data`` (for example a CSV that lacks
        the expected columns) are shown in an error dialog instead of
        escaping the Tk callback, and leave the stored accuracy unchanged.
        """
        if self.df is None:
            messagebox.showerror("Error", "Please load a CSV file first.")
            return

        try:
            accuracy = self.preprocess_data(use_pca=True)
        except Exception as e:  # UI boundary: surface the failure to the user
            messagebox.showerror("Error", f"Preprocessing with PCA failed: {str(e)}")
            return

        self.accuracy_with_pca = accuracy
        self.result_label["text"] = f"Accuracy with PCA: {self.accuracy_with_pca:.2f}"

        # Enable compare and export buttons
        self.compare_button["state"] = tk.NORMAL
        self.export_button["state"] = tk.NORMAL

    def preprocess_data(self, use_pca):
        """Train a random-forest classifier on ``self.df`` and return its accuracy.

        The target is the ``Attrition`` column mapped ``Yes -> 1`` /
        ``No -> 0``. Rows whose target is missing or has any other value are
        excluded from training and evaluation, because they would otherwise
        become NaN labels. The data is split 80/20 with a fixed seed.

        As a side effect the fitted preprocessor (without PCA and without the
        classifier) is applied to all feature rows and the result is stored
        in ``self.preprocessed_data`` for later export.

        Args:
            use_pca: When true, a 2-component PCA step is inserted between
                preprocessing and the classifier.

        Returns:
            Accuracy of the fitted model on the held-out 20% split.

        Raises:
            KeyError: If the dataset has no ``Attrition`` column.
            ValueError: If no row carries a usable ``Yes``/``No`` target, or
                if scikit-learn rejects the data.
        """
        if 'Attrition' not in self.df.columns:
            raise KeyError("The dataset has no 'Attrition' column to use as the target.")

        # Columns excluded from the feature set; tolerate files that
        # do not contain the three auxiliary ones.
        features = self.df.drop(
            columns=['Attrition', 'Over18', 'EmployeeCount', 'StandardHours'],
            errors='ignore',
        )
        target = self.df['Attrition'].map({'Yes': 1, 'No': 0})

        labelled = target.notna()
        if not labelled.any():
            raise ValueError("No rows with an 'Attrition' value of 'Yes' or 'No' were found.")
        X = features[labelled]
        y = target[labelled].astype(int)

        model = self._build_model(X, use_pca)

        # Train model
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))

        # Store the transformed features (all rows, labelled or not) for export.
        self.preprocessed_data = model.named_steps['preprocessor'].transform(features)

        return accuracy

    def _build_model(self, X, use_pca):
        """Assemble the preprocessing, optional PCA and classifier pipeline for ``X``."""
        # 'number' also catches int32/float32 columns that a 64-bit-only filter misses.
        numerical_cols = X.select_dtypes(include='number').columns
        nominal_cols = ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']
        ordinal_cols = ['BusinessTravel']
        ordinal_categories = [['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']]

        numerical_pipeline = Pipeline([
            ('random_imputer', RandomImputer(columns=numerical_cols)),
            ('outlier_capper', OutlierCapper(columns=numerical_cols)),
            ('scaler', StandardScaler())
        ])

        nominal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('one_hot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Impute with an existing category: a 'Missing' placeholder is not in
        # ordinal_categories and would make OrdinalEncoder fail during fit.
        ordinal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder(categories=ordinal_categories))
        ])

        # sparse_threshold=0 forces a dense result, which PCA and the CSV
        # export both expect.
        preprocessor = ColumnTransformer(
            [
                ('num', numerical_pipeline, numerical_cols),
                ('nom', nominal_pipeline, nominal_cols),
                ('ord', ordinal_pipeline, ordinal_cols)
            ],
            sparse_threshold=0,
        )

        steps = [('preprocessor', preprocessor)]
        if use_pca:
            steps.append(('pca', PCA(n_components=2, random_state=42)))
        # Fixed seed so the with/without-PCA comparison is not dominated by
        # run-to-run randomness of the forest.
        steps.append(('classifier', RandomForestClassifier(random_state=42)))
        return Pipeline(steps)

    def compare_accuracies(self):
        if self.accuracy_without_pca is None or self.accuracy_with_pca is None:
            messagebox.showerror("Error", "Please preprocess the data before comparing accuracies.")
            return

        # Plot the comparison
        self.ax.clear()
        self.ax.bar(['Without PCA', 'With PCA'], [self.accuracy_without_pca, self.accuracy_with_pca], color=['#FF7F7F', '#7FBFFF'])
        self.ax.set_title("Comparison of Model Accuracy")
        self.ax.set_ylabel("Accuracy")
        self.canvas.draw()

    def export_to_csv(self):
        """Save ``self.preprocessed_data`` to a CSV file chosen by the user.

        Shows an error dialog when there is nothing to export or when the
        file cannot be written. Cancelling the save dialog is a normal user
        action and is not reported as an error.
        """
        if self.preprocessed_data is None:
            messagebox.showerror("Error", "No preprocessed data to save.")
            return

        file_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV files", "*.csv")])
        if not file_path:
            return  # dialog cancelled

        data = self.preprocessed_data
        if hasattr(data, "toarray"):
            # A scipy sparse matrix must be densified before building the frame.
            data = data.toarray()

        try:
            # Convert preprocessed data to DataFrame for exporting
            pd.DataFrame(data).to_csv(file_path, index=False)
        except OSError as e:
            messagebox.showerror("Error", f"Failed to save CSV: {str(e)}")
            return

        messagebox.showinfo("Success", f"Preprocessed data saved to {file_path}")

    def log(self, message: object) -> None:
        """Write a diagnostic message to standard output.

        Args:
            message: The object to print.
        """
        print(message)

# Initialize the Tkinter window and enter the event loop. Guarded so that the
# GUI starts when this file is run directly (script or notebook cell) but not
# when it is imported, where opening a window and blocking would be a surprise.
if __name__ == "__main__":
    root = tk.Tk()
    app = PCAComparisonApp(root)
    root.mainloop()


# Output: CSV loaded: C:/Users/PMLS/Desktop/5th semester/Cloud Computing/Data Processing/Project/HR-Employee-Attrition_with_missing_values.csv
