# In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

class FakeJobDetectorApp(tk.Tk):
    def __init__(self):
        """Create the main window, reset every piece of model state and build the UI."""
        super().__init__()

        # Window appearance.
        self.configure(bg='LightSkyBlue')
        self.geometry("1000x800")
        self.title("Fake Job Detection")

        # Path of the training CSV; filled in by upload().
        self.filename = None

        # Full feature matrix / labels and their train/test partitions;
        # filled in by generate_model().
        self.X = self.Y = None
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None

        # Classifier; created by run_random_forest().
        self.cls = None

        # TF-IDF vectorizer for the 'text' column: fitted in traintest(),
        # reused (transform only) in load_model_predict().
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=100)

        self.setup_ui()

    def setup_ui(self):
        """Build the banner, the log text area and the row of action buttons."""
        banner = tk.Label(
            self,
            text='Fake Job Detection Using Random Forest Classifier',
        )
        banner.config(
            bg='greenyellow',
            fg='dodger blue',
            font=('times', 16, 'bold'),
            height=2,
            width=80,
        )
        banner.pack(pady=10)

        # Log area: the handlers of this class append their status messages here.
        self.text = tk.Text(self, height=20, width=100)
        self.text.pack(pady=20)

        toolbar = tk.Frame(self)
        toolbar.pack(pady=10)

        # (caption, callback) pairs in left-to-right display order.
        actions = (
            ("Upload Dataset", self.upload),
            ("Generate Train & Test Model", self.generate_model),
            ("Run Random Forest", self.run_random_forest),
            ("Detect Fake Jobs", self.predict_fake_jobs),
            ("Exit", self.quit),
        )
        for caption, callback in actions:
            button = tk.Button(toolbar, text=caption, command=callback)
            button.pack(side=tk.LEFT, padx=5)

    def upload(self):
        """Ask the user for the training dataset and remember the chosen path.

        The selection (possibly empty when the dialog is cancelled) is stored
        in ``self.filename`` and the outcome is written to the log widget.
        """
        file_kinds = (("CSV Files", "*.csv"), ("All Files", "*.*"))
        self.filename = filedialog.askopenfilename(
            filetypes=file_kinds,
            title="Select Fake Job Dataset",
        )

        if not self.filename:
            self.text.insert(tk.END, "No file selected.\n")
            return

        self.text.insert(tk.END, f"{self.filename} loaded\n")

    def generate_model(self):
        """Load the uploaded CSV, build the features and report the split sizes.

        On success the feature matrix, labels and train/test partitions
        returned by ``traintest`` are stored on the instance. Nothing is
        returned and no exception escapes: progress and errors are written
        to the log widget.
        """
        report = self.text.insert

        if not self.filename:
            report(tk.END, "Please upload a dataset first.\n")
            return

        needed = {'telecommuting', 'ratio', 'character_count', 'fraudulent', 'text'}
        try:
            frame = pd.read_csv(self.filename)

            # Reject files that lack any of the columns the model is built from.
            if not needed.issubset(frame.columns):
                raise ValueError("Dataset does not contain all required columns.")

            (
                self.X,
                self.Y,
                self.X_train,
                self.X_test,
                self.y_train,
                self.y_test,
            ) = self.traintest(frame)

            report(tk.END, "Train & Test Model Generated\n")
            report(tk.END, f"Total Dataset Size: {len(self.X)}\n")
            report(tk.END, f"Training Size: {len(self.X_train)}\n")
            report(tk.END, f"Test Size: {len(self.X_test)}\n")
        except Exception as e:
            report(tk.END, f"Error generating model: {str(e)}\n")

    def traintest(self, train):
        """Turn a raw DataFrame into model features and a train/test split.

        Args:
            train: DataFrame holding the columns ``telecommuting``, ``ratio``,
                ``character_count``, ``fraudulent`` and ``text``.

        Returns:
            A 6-tuple ``(X, Y, X_train, X_test, y_train, y_test)`` where ``X``
            is the three numeric columns followed by the TF-IDF columns of
            ``text`` and ``Y`` is the ``fraudulent`` column.

        Raises:
            ValueError: if any of the required columns is absent.

        Side effect: fits ``self.vectorizer`` on the ``text`` column.
        """
        needed = ('telecommuting', 'ratio', 'character_count', 'fraudulent', 'text')
        if any(column not in train.columns for column in needed):
            raise ValueError("Dataset does not contain all required columns.")

        # Numeric part of the feature matrix.
        numeric_part = train[['telecommuting', 'ratio', 'character_count']]

        # Text part: missing descriptions become empty documents before TF-IDF.
        documents = train['text'].fillna('')
        tfidf_part = self.vectorizer.fit_transform(documents).toarray()

        # Numeric columns first, TF-IDF columns after them.
        features = np.concatenate([numeric_part, tfidf_part], axis=1)
        labels = train['fraudulent']

        # Fixed seed so the same file always produces the same partition.
        partitions = train_test_split(
            features, labels, test_size=0.33, random_state=53
        )
        return (features, labels, *partitions)

    def run_random_forest(self):
        """Train a random forest on the stored split and report its test scores.

        Requires the train/test partitions produced by ``generate_model``.
        The classifier is stored in ``self.cls``. Nothing is returned and no
        exception escapes: results and errors are written to the log widget.
        """
        split = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in split):
            self.text.insert(tk.END, "Please generate Train & Test Model first.\n")
            return

        try:
            # Fixed seed keeps repeated runs on the same split reproducible.
            self.cls = forest = RandomForestClassifier(random_state=53)
            forest.fit(self.X_train, self.y_train)
            predicted = forest.predict(self.X_test)

            # Accuracy is shown as a percentage; the remaining metrics follow.
            percent_correct = accuracy_score(self.y_test, predicted) * 100
            self.text.insert(tk.END, f"Random Forest Accuracy: {percent_correct:.2f}%\n")

            self.display_prediction_results(predicted)
        except Exception as e:
            self.text.insert(tk.END, f"Error running Random Forest: {str(e)}\n")

    def display_prediction_results(self, y_pred):
        """Write weighted F1, precision, recall and the confusion matrix to the log.

        Args:
            y_pred: predicted labels, compared against ``self.y_test``.
        """
        truth = self.y_test
        weighted = {'average': 'weighted'}

        # Compute every score before printing any of them.
        f1 = f1_score(truth, y_pred, **weighted)
        precision = precision_score(truth, y_pred, **weighted)
        recall = recall_score(truth, y_pred, **weighted)
        confusion = confusion_matrix(truth, y_pred)

        emit = self.text.insert
        emit(tk.END, f"F1-Score: {f1:.4f}\n")
        emit(tk.END, f"Precision: {precision:.4f}\n")
        emit(tk.END, f"Recall: {recall:.4f}\n")
        emit(tk.END, f"Confusion Matrix:\n{confusion}\n")


    def predict_fake_jobs(self):
        """Score a user-selected CSV of job postings with the trained model.

        The posting's free-text columns are joined (space separated) into a
        single ``text`` column, and the frame is then handed to
        ``load_model_predict``. Nothing is returned and no exception escapes:
        every outcome is written to the log widget.
        """
        # Fail fast: without a classifier there is nothing to predict with,
        # so do not make the user pick a file first.
        if self.cls is None:
            self.text.insert(tk.END, "Please run Random Forest first.\n")
            return

        # Free-text columns merged into 'text', in concatenation order.
        text_columns = [
            'title', 'description', 'requirements', 'company_profile',
            'location', 'department', 'salary_range',
        ]

        try:
            test_filename = filedialog.askopenfilename(
                title="Select Test Dataset",
                filetypes=(("CSV Files", "*.csv"), ("All Files", "*.*"))
            )
            if not test_filename:
                self.text.insert(tk.END, "No test file selected.\n")
                return

            test_data = pd.read_csv(test_filename)

            # Report absent columns by name instead of a bare KeyError.
            missing = [col for col in text_columns if col not in test_data.columns]
            if missing:
                raise ValueError(
                    f"Test dataset is missing column(s): {', '.join(missing)}"
                )

            # astype(str) keeps the concatenation from raising TypeError when
            # pandas parsed one of the columns as numeric.
            parts = test_data[text_columns].fillna('').astype(str)
            combined = parts[text_columns[0]]
            for column in text_columns[1:]:
                combined = combined + ' ' + parts[column]
            test_data['text'] = combined

            self.load_model_predict(test_data)
        except Exception as e:
            self.text.insert(tk.END, f"Error in prediction: {str(e)}\n")
    
    def load_model_predict(self, test_data):
        """Predict fraud labels for ``test_data`` and save them to CSV.

        Args:
            test_data: DataFrame with a ``text`` column plus the numeric
                ``telecommuting``, ``ratio`` and ``character_count`` columns.
                A ``fraudulent`` column is optional; when present, metrics
                are reported against it. The frame gains a
                ``fraud_prediction`` column in place.

        Nothing is returned and no exception escapes: results and errors are
        written to the log widget. Predictions are saved to
        ``predictionoutput/testsetprediction.csv`` (the folder is created
        when missing).
        """
        import os  # local import: only needed to create the output folder

        numeric_columns = ['telecommuting', 'ratio', 'character_count']
        output_path = "predictionoutput/testsetprediction.csv"

        try:
            if self.cls is None:
                raise ValueError("No trained model available; run Random Forest first.")
            if 'text' not in test_data.columns:
                raise ValueError("Test dataset must contain the 'text' column.")
            missing = [col for col in numeric_columns if col not in test_data.columns]
            if missing:
                raise ValueError(
                    f"Test dataset is missing column(s): {', '.join(missing)}"
                )

            # Reuse the vectorizer as fitted (transform only) so the text
            # columns line up with what the model was trained on.
            text_feature = test_data['text'].fillna('')  # Handle missing values
            X_text = self.vectorizer.transform(text_feature).toarray()

            # Same column order as in training: numeric first, then TF-IDF.
            X_numeric = test_data[numeric_columns]
            X_combined = np.concatenate([X_numeric, X_text], axis=1)

            y_pred = self.cls.predict(X_combined)

            if 'fraudulent' in test_data.columns:
                # display_prediction_results() scores against self.y_test, so
                # point it at this file's labels for the call, then restore
                # the training split's labels for later "Run Random Forest".
                previous_labels = self.y_test
                self.y_test = test_data['fraudulent']
                try:
                    self.display_prediction_results(y_pred)
                finally:
                    self.y_test = previous_labels
            else:
                # Unlabeled data can still be scored; only the metrics are skipped.
                self.text.insert(tk.END, "No 'fraudulent' column found; skipping metrics.\n")

            test_data['fraud_prediction'] = y_pred
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            test_data.to_csv(output_path, index=False)

            self.text.insert(tk.END, "Predictions saved to 'predictionoutput/testsetprediction.csv'\n")
        except Exception as e:
            self.text.insert(tk.END, f"Error in load_model_predict: {str(e)}\n")

# Script entry point: build the main window and enter Tk's event loop.
# The instance stays bound to the module-level name `app`.
if __name__ == "__main__":
    app = FakeJobDetectorApp()
    app.mainloop()


# In [None]:
    def predict_fake_jobs(self):
        """Score a user-selected CSV of job postings with the trained model.

        The posting's free-text columns are joined (space separated) into a
        single ``text`` column, and the frame is then handed to
        ``load_model_predict``. Nothing is returned and no exception escapes:
        every outcome is written to the log widget.
        """
        # Fail fast: without a classifier there is nothing to predict with,
        # so do not make the user pick a file first.
        if self.cls is None:
            self.text.insert(tk.END, "Please run Random Forest first.\n")
            return

        # Free-text columns merged into 'text', in concatenation order.
        text_columns = [
            'title', 'description', 'requirements', 'company_profile',
            'location', 'department', 'salary_range',
        ]

        try:
            test_filename = filedialog.askopenfilename(
                title="Select Test Dataset",
                filetypes=(("CSV Files", "*.csv"), ("All Files", "*.*"))
            )
            if not test_filename:
                self.text.insert(tk.END, "No test file selected.\n")
                return

            test_data = pd.read_csv(test_filename)

            # Report absent columns by name instead of a bare KeyError.
            missing = [col for col in text_columns if col not in test_data.columns]
            if missing:
                raise ValueError(
                    f"Test dataset is missing column(s): {', '.join(missing)}"
                )

            # astype(str) keeps the concatenation from raising TypeError when
            # pandas parsed one of the columns as numeric.
            parts = test_data[text_columns].fillna('').astype(str)
            combined = parts[text_columns[0]]
            for column in text_columns[1:]:
                combined = combined + ' ' + parts[column]
            test_data['text'] = combined

            self.load_model_predict(test_data)
        except Exception as e:
            # Log once, attributed to this method.
            self.text.insert(tk.END, f"Error in prediction: {str(e)}\n")
  
    def load_model_predict(self, test_data):
        try:
        # Define required numerical columns
            required_columns = ['telecommuting', 'ratio', 'character_count']

        # Check for missing numerical columns and provide default values
            missing_columns = [col for col in required_columns if col not in test_data.columns]
            for col in missing_columns:
                test_data[col] = 0  # Replace with a suitable default value

            text_feature = test_data['text'].fillna('')  # Handle missing values
            X_text = self.vectorizer.transform(text_feature).toarray()  # Convert to numerical form

            expected_text_features = self.vectorizer.get_feature_names_out().shape[0]

        # Ensure the numerical and text features count is correct
            X_numeric = test_data[required_columns]

            if X_text.shape[1] != expected_text_features:
                raise ValueError(f"Text features mismatch: Expected {expected_text_features}, got {X_text.shape[1]}")

            X_combined = np.concatenate([X_numeric, X_text], axis=1)

        # Check if the combined feature count matches the model's expected input size
            expected_total_features = self.cls.n_features_in_
            if X_combined.shape[1] != expected_total_features:
                raise ValueError(f"Feature count mismatch: Expected {expected_total_features}, got {X_combined.shape[1]}")

            self.y_test = test_data['fraudulent']  # Set ground truth labels

        # Predict using the Random Forest model
            y_pred = self.cls.predict(X_combined)

        # Display results and save to CSV
            self.display_prediction_results(y_pred)
            test_data['fraud_prediction'] = y_pred
            test_data.to_csv("predictionoutput/testsetprediction.csv", index=False)

            self.text.insert(tk.END, "Predictions saved to 'predictionoutput/testsetprediction.csv'\n")
        except Exception as e: