In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [10]:
# Load the UCI mushroom dataset
def load_data(url=None):
    """Load the mushroom dataset into a DataFrame with named columns.

    Parameters
    ----------
    url : str or path-like, optional
        Location of the comma-separated data file. Anything accepted by
        ``pandas.read_csv`` works (URL, local path, file object). When
        omitted, the UCI repository copy of ``agaricus-lepiota.data`` is
        downloaded, which is what the original zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        One row per record and 23 columns: the ``class`` label followed by
        the 22 attribute columns listed below.
    """
    if url is None:
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

    column_names = ["class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",
                    "gill-spacing",
                    "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",
                    "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type",
                    "veil-color",
                    "ring-number", "ring-type", "spore-print-color", "population", "habitat"]

    # The file is read as headerless: the first line is data, and the column
    # labels come from `column_names`. header=None states that explicitly.
    return pd.read_csv(url, names=column_names, header=None)

In [11]:
# Convert categorical columns to numeric labels
def encode_data(data):
    """Label-encode every column of ``data`` without modifying the input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all to be converted to integer codes.

    Returns
    -------
    tuple
        ``(encoded, label_encoders)`` where ``encoded`` is a new DataFrame
        with the same columns holding the integer codes, and
        ``label_encoders`` maps each column name to the fitted
        ``LabelEncoder`` used for it (so codes can be mapped back later).
    """
    # Work on a copy: assigning into the caller's frame would destroy the raw
    # categorical values and make re-running the step non-idempotent.
    encoded = data.copy()
    label_encoders = {}
    for col in encoded.columns:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        label_encoders[col] = le
    return encoded, label_encoders

In [12]:
# Determine and print the importance of each feature using RandomForest
def feature_importance_analysis(X, y, random_state=42):
    """Fit a random forest and print features ranked by importance.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used as the feature names.
    y : array-like
        Target labels passed to the classifier's ``fit``.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. A fixed default keeps
        the printed ranking stable between runs; pass ``None`` to get an
        unseeded forest as before.

    Returns
    -------
    None
        The ranking is written to stdout, highest importance first.
    """
    print("Feature Importance Analysis:")
    print("-----------------------------")
    # Seed the forest: without it the bootstrap samples and split candidates
    # differ on every run, so the importance values are not reproducible.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, y)
    ranked = sorted(zip(X.columns, clf.feature_importances_), key=lambda pair: pair[1], reverse=True)
    for feature, importance in ranked:
        print(f"{feature}: {importance}")

In [13]:
# Perform PCA and print the explained variance of each component
def pca_analysis(X, n_components=5):
    """Fit a PCA on ``X`` and print each component's explained variance.

    Parameters
    ----------
    X : array-like
        Data passed to ``PCA.fit``.
    n_components : int, optional
        Number of principal components to keep. Defaults to 5, the value
        that was previously hard-coded.

    Returns
    -------
    PCA
        The fitted estimator, for further inspection by the caller.
    """
    print("\nPCA Analysis:")
    print("-------------")
    pca = PCA(n_components=n_components)
    # Only the fitted model is needed here; the projected data was computed
    # and discarded before, so plain fit() avoids that wasted work.
    pca.fit(X)
    for number, ratio in enumerate(pca.explained_variance_ratio_, start=1):
        print(f"Principal Component {number}: {ratio * 100:.2f}%")
    return pca

In [14]:
# Print how much each feature contributes to the principal components
def pca_loadings_analysis(X, pca):
    """Print, per principal component, the absolute loading of every feature.

    ``X`` supplies the feature names (its column labels) and ``pca`` supplies
    the fitted ``components_`` matrix and ``n_components_`` count. For each
    component the features are listed from largest to smallest absolute
    loading. Nothing is returned.
    """
    print("\nPCA Loadings Analysis:")
    print("----------------------")

    component_count = pca.n_components_
    labels = [f'PC-{number}' for number in range(1, component_count + 1)]

    # Rows of components_ are components; transpose so features index the rows.
    loadings = pd.DataFrame(pca.components_, index=labels, columns=X.columns).transpose()

    for number, label in enumerate(labels, start=1):
        print(f"\nLoadings for Principal Component {number}:")
        magnitudes = loadings[label].abs()
        print(magnitudes.sort_values(ascending=False))

In [17]:
def main():
    """Run the analysis end to end: load, encode, rank features, run PCA."""
    # Fetch the table and encode it; the per-column encoders are not needed here.
    encoded, _ = encode_data(load_data())

    # Split the target column from the predictors.
    y = encoded['class']
    X = encoded.drop('class', axis=1)

    # Random-forest feature ranking.
    feature_importance_analysis(X, y)

    # Explained variance per component, then the per-feature loadings
    # of the same fitted PCA.
    fitted_pca = pca_analysis(X)
    pca_loadings_analysis(X, fitted_pca)


In [18]:
# Entry point: run the whole analysis pipeline when this code executes as the
# main module (and not when it is imported from elsewhere).
if __name__ == "__main__":
    main()


Feature Importance Analysis:
-----------------------------
gill-color: 0.13654163991473123
odor: 0.13458000457672026
spore-print-color: 0.1074852919448832
gill-size: 0.10460515056764882
ring-type: 0.06503611962747874
population: 0.061342797350018304
bruises: 0.05615214185654945
stalk-surface-above-ring: 0.055199069745673845
stalk-root: 0.048619540841300456
gill-spacing: 0.04416074930098593
stalk-surface-below-ring: 0.0427220176457448
habitat: 0.03264191195173173
stalk-shape: 0.027103901995400803
stalk-color-below-ring: 0.0240943035079537
stalk-color-above-ring: 0.019515134798310643
cap-color: 0.01208201200968431
ring-number: 0.010056330704467495
cap-surface: 0.008825119876590884
cap-shape: 0.005664845265453694
veil-color: 0.002499219372250277
gill-attachment: 0.0010726971464216
veil-type: 0.0

PCA Analysis:
-------------
Principal Component 1: 33.76%
Principal Component 2: 16.58%
Principal Component 3: 12.29%
Principal Component 4: 6.80%
Principal Component 5: 5.83%

PCA Loadings Analy