<a href="https://colab.research.google.com/github/Naman30903/Machine_learning/blob/main/Candidate_Elimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

def _covers(hypothesis, instance):
    """Return True if `hypothesis` matches `instance` attribute by attribute.

    A hypothesis attribute matches when it is the wildcard '?' or equals the
    corresponding value in `instance`. `instance` may be a training example
    or another hypothesis; in the latter case a True result means
    `hypothesis` is at least as general as `instance`.
    """
    return all(h == '?' or h == x for h, x in zip(hypothesis, instance))


def candidate_elimination(df):
    """Run the Candidate Elimination algorithm on a labelled DataFrame.

    Every column except the last is treated as an attribute; the last column
    is the class label. Rows labelled exactly 'Yes' are positive examples and
    every other label is treated as negative. Progress is printed after each
    training example.

    Args:
        df: DataFrame whose last column holds the class label.

    Returns:
        A tuple ``(specific_h, general_h)`` where ``specific_h`` is a dict
        mapping attribute name to value (or '?') for the specific boundary S,
        and ``general_h`` is a list of such dicts for the general boundary G.
        If no row is labelled 'Yes', returns
        ``("No positive examples found", [])`` instead.
    """
    # Force object dtype so the '?' wildcard can be stored in S even when
    # every attribute column is numeric.
    features = df.iloc[:, :-1].to_numpy(dtype=object)
    target = df.iloc[:, -1].to_numpy()
    feature_names = df.columns[:-1]
    n_attributes = features.shape[1]

    # Step 1: seed S with a copy of the first positive example.
    specific_h = next(
        (row.copy() for row, label in zip(features, target) if label == 'Yes'),
        None,
    )
    if specific_h is None:
        return "No positive examples found", []

    # G starts as the single most general hypothesis (all wildcards).
    general_h = [['?' for _ in range(n_attributes)]]

    print("Initial Specific Hypothesis (S):", specific_h)
    print("Initial General Hypothesis (G):", general_h)
    print("-" * 50)

    # Step 2: refine S and G with each training example in order.
    for i, (current_example, current_target) in enumerate(zip(features, target), start=1):
        print(f"Training Example {i}: {current_example}, Class: {current_target}")

        if current_target == 'Yes':
            # Positive example: generalize S wherever it disagrees.
            for j in range(n_attributes):
                if specific_h[j] != current_example[j]:
                    specific_h[j] = '?'

            # ...and drop every member of G that fails to cover it; such a
            # hypothesis would misclassify this positive example.
            general_h = [g for g in general_h if _covers(g, current_example)]
        else:
            # Negative example: every member of G must exclude it.
            general_h_new = []
            for g in general_h:
                if not _covers(g, current_example):
                    # Already excludes the negative example; keep as is.
                    candidates = [g]
                else:
                    # Minimal specializations: pin one attribute to S's value
                    # wherever S itself disagrees with the negative example.
                    candidates = []
                    for j in range(n_attributes):
                        if specific_h[j] != '?' and specific_h[j] != current_example[j]:
                            g_new = g.copy()
                            g_new[j] = specific_h[j]
                            candidates.append(g_new)

                for candidate in candidates:
                    if candidate not in general_h_new:
                        general_h_new.append(candidate)

            # Keep only maximally general members: discard any hypothesis
            # that another (distinct) member of G already subsumes.
            general_h = [
                g for g in general_h_new
                if not any(other is not g and _covers(other, g) for other in general_h_new)
            ]

        print("Current S:", specific_h)
        print("Current G:", general_h)
        print("-" * 50)

    # Plain lists/dicts keyed by attribute name are easier to read than arrays.
    specific_h = specific_h.tolist()
    specific_h_dict = dict(zip(feature_names, specific_h))
    general_h_dicts = [dict(zip(feature_names, g)) for g in general_h]

    return specific_h_dict, general_h_dicts

def demonstrate_with_sample_data():
    """
    Run Candidate Elimination on a built-in "Enjoy Sport" dataset.

    The sample rows are written to 'enjoy_sport.csv' in the current working
    directory, read back with pandas, passed to candidate_elimination, and
    the resulting S and G boundaries are printed with a short explanation.
    """
    rule = "="*60
    csv_path = 'enjoy_sport.csv'

    # "Enjoy Sport" training data: weather attributes plus the class label
    sample_data = """Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes"""

    # Persist the sample so it is loaded through the same CSV path as user data
    with open(csv_path, 'w') as csv_file:
        csv_file.write(sample_data)

    df = pd.read_csv(csv_path)
    print("DATASET:")
    print(df)
    print("\n" + rule + "\n")

    print("RUNNING CANDIDATE ELIMINATION ALGORITHM...")
    print(rule + "\n")

    s_boundary, g_boundary = candidate_elimination(df)

    # --- Final boundaries -------------------------------------------------
    print("\nFINAL RESULTS:")
    print(rule)

    print("\nFinal Specific Hypothesis (S):")
    print("This is the most specific hypothesis that fits all positive examples.")
    for attribute in s_boundary:
        print(f"  {attribute}: {s_boundary[attribute]}")

    print("\nFinal General Hypothesis Boundary (G):")
    print("These are the most general hypotheses that don't cover any negative examples.")
    number = 0
    for hypothesis in g_boundary:
        number += 1
        print(f"  Hypothesis {number}:")
        for attribute, setting in hypothesis.items():
            print(f"    {attribute}: {setting}")

    # --- How to read the output -------------------------------------------
    print("\nEXPLANATION:")
    print(rule)
    explanation = (
        "The Version Space is the set of all hypotheses that lie between S and G.",
        "Any hypothesis in this space will correctly classify the training examples.",
        "- If a feature has a specific value in S, it must have that value for a positive example.",
        "- The G boundary ensures we don't include hypotheses that cover negative examples.",
        "- A '?' represents a 'don't care' value that can match any attribute value.",
    )
    for sentence in explanation:
        print(sentence)

def demonstrate_with_user_data(file_path):
    """
    Demonstrates the Candidate Elimination algorithm with user-provided data.

    The CSV is loaded with pandas, passed to candidate_elimination, and the
    resulting S and G boundaries are printed with a short explanation. Any
    exception raised while loading or processing is reported on stdout
    rather than propagated.

    Args:
        file_path: Path to the CSV file containing training data
    """
    try:
        # Read and display the dataset
        df = pd.read_csv(file_path)
        print("DATASET:")
        print(df)
        print("\n" + "="*60 + "\n")

        print("RUNNING CANDIDATE ELIMINATION ALGORITHM...")
        print("="*60 + "\n")

        # Run the algorithm
        specific_h, general_h = candidate_elimination(df)

        # candidate_elimination reports "no positive examples" by returning a
        # message string in place of the S dict. Surface that message directly
        # instead of failing on `.items()` and printing a misleading error.
        if isinstance(specific_h, str):
            print(f"\nCould not learn a hypothesis: {specific_h}")
            return

        # Display final results
        print("\nFINAL RESULTS:")
        print("="*60)

        print("\nFinal Specific Hypothesis (S):")
        print("This is the most specific hypothesis that fits all positive examples.")
        for feature, value in specific_h.items():
            print(f"  {feature}: {value}")

        print("\nFinal General Hypothesis Boundary (G):")
        print("These are the most general hypotheses that don't cover any negative examples.")
        for i, g in enumerate(general_h, 1):
            print(f"  Hypothesis {i}:")
            for feature, value in g.items():
                print(f"    {feature}: {value}")

        # Explanation of results
        print("\nEXPLANATION:")
        print("="*60)
        print("The Version Space is the set of all hypotheses that lie between S and G.")
        print("Any hypothesis in this space will correctly classify the training examples.")
        print("- If a feature has a specific value in S, it must have that value for a positive example.")
        print("- The G boundary ensures we don't include hypotheses that cover negative examples.")
        print("- A '?' represents a 'don't care' value that can match any attribute value.")

    except Exception as e:
        # Demo entry point: deliberately best-effort, so report and continue.
        print(f"Error processing the file: {e}")

if __name__ == "__main__":
    # Script entry point: run the demo on the built-in sample dataset.
    demonstrate_with_sample_data()

    # To run on your own CSV instead, uncomment the call below and set the path:
    # demonstrate_with_user_data('your_data_file.csv')

DATASET:
     Sky AirTemp Humidity    Wind Water Forecast EnjoySport
0  Sunny    Warm   Normal  Strong  Warm     Same        Yes
1  Sunny    Warm     High  Strong  Warm     Same        Yes
2  Rainy    Cold     High  Strong  Warm   Change         No
3  Sunny    Warm     High  Strong  Cool   Change        Yes


RUNNING CANDIDATE ELIMINATION ALGORITHM...

Initial Specific Hypothesis (S): ['Sunny' 'Warm' 'Normal' 'Strong' 'Warm' 'Same']
Initial General Hypothesis (G): [['?', '?', '?', '?', '?', '?']]
--------------------------------------------------
Training Example 1: ['Sunny' 'Warm' 'Normal' 'Strong' 'Warm' 'Same'], Class: Yes
Current S: ['Sunny' 'Warm' 'Normal' 'Strong' 'Warm' 'Same']
Current G: [['?', '?', '?', '?', '?', '?']]
--------------------------------------------------
Training Example 2: ['Sunny' 'Warm' 'High' 'Strong' 'Warm' 'Same'], Class: Yes
Current S: ['Sunny' 'Warm' '?' 'Strong' 'Warm' 'Same']
Current G: [['?', '?', '?', '?', '?', '?']]
---------------------------------

In [None]:
import pandas as pd
import numpy as np
def candidate_elimination()