Decision Tree Model Development

In [1]:
import pandas as pd
import numpy as np

# Row count shared by all five synthetic CSV files.
num_rows = 100  # Adjust as needed

# Seed NumPy's global generator so every run regenerates the same files.
np.random.seed(0)

# Sampling interval (low, high) used for each base measurement.
_BOUNDS = {
    "radius": (5.0, 30.0),
    "texture": (10.0, 40.0),
    "perimeter": (30.0, 200.0),
    "area": (500.0, 5000.0),
    "smoothness": (0.0, 0.2),
    "compactness": (0.0, 0.5),
    "concavity": (0.0, 0.5),
    "concave points": (0.0, 0.2),
    "symmetry": (0.0, 0.5),
    "fractal dimension": (0.05, 0.2),
}


def _draw(feature):
    """Return ``num_rows`` uniform samples from ``feature``'s interval.

    Uses the global NumPy generator, so the order of calls determines the
    values that end up in each column.
    """
    low, high = _BOUNDS[feature]
    return np.random.uniform(low, high, num_rows)


def _make_table(plain=(), products=(), rescaled=()):
    """Assemble the columns of one synthetic dataset as a dict.

    Columns are created (and random numbers consumed) in this order:
    ``ID number``, ``Diagnosis``, the ``plain`` features, the ``products``
    and finally the ``rescaled`` columns.

    Parameters
    ----------
    plain : iterable of str
        Base feature names; each becomes one column of fresh uniform draws.
    products : iterable of (str, str, str)
        ``(column, first, second)`` triples; the column is the element-wise
        product of a fresh draw for ``first`` and a fresh draw for ``second``.
    rescaled : iterable of (str, str, float, float)
        ``(column, feature, offset, divisor)``; the column is
        ``(fresh draw for feature - offset) / divisor``.

    Returns
    -------
    dict
        Column name -> values, ready to pass to ``pd.DataFrame``.

    Notes
    -----
    Every derived column uses its own fresh draws; it is not computed from
    the same-named plain column of the table.
    """
    table = {
        "ID number": range(1, num_rows + 1),
        "Diagnosis": np.random.choice(['M', 'B'], num_rows),
    }
    for feature in plain:
        table[feature] = _draw(feature)
    for column, first, second in products:
        # Left factor is sampled before the right one.
        left = _draw(first)
        right = _draw(second)
        table[column] = left * right
    for column, feature, offset, divisor in rescaled:
        table[column] = (_draw(feature) - offset) / divisor
    return table


# File 1: Converted Features Dataset
data1 = _make_table(
    plain=[
        "radius", "texture", "perimeter", "area", "smoothness",
        "compactness", "concavity", "concave points", "symmetry",
        "fractal dimension",
    ],
)
df1 = pd.DataFrame(data1)
df1.to_csv('converted_features_breast_cancer.csv', index=False)

# File 2: Normalized Features Dataset
data2 = _make_table(plain=["radius", "area", "smoothness"])
df2 = pd.DataFrame(data2)
df2.to_csv('normalized_features_breast_cancer.csv', index=False)

# File 3: Interaction Features Dataset
data3 = _make_table(
    products=[
        ("radius_texture", "radius", "texture"),
        ("perimeter_area", "perimeter", "area"),
        ("compactness_smoothness", "compactness", "smoothness"),
        ("concavity_symmetry", "concavity", "symmetry"),
    ],
)
df3 = pd.DataFrame(data3)
df3.to_csv('interaction_features_breast_cancer.csv', index=False)

# File 4: Selected Feature Set Dataset
data4 = _make_table(
    plain=[
        "radius", "perimeter", "area", "smoothness",
        "compactness", "concavity", "concave points", "symmetry",
    ],
    products=[
        ("radius_texture", "radius", "texture"),
        ("perimeter_area", "perimeter", "area"),
    ],
)
df4 = pd.DataFrame(data4)
df4.to_csv('selected_feature_breast_cancer.csv', index=False)

# File 5: Converted and Normalized Features Dataset
data5 = _make_table(
    plain=["radius", "area", "smoothness"],
    rescaled=[
        ("radius_normalized", "radius", 17.5, 7.5),
        ("area_normalized", "area", 2750.0, 1500.0),
    ],
)
df5 = pd.DataFrame(data5)
df5.to_csv('converted_normalized_features_breast_cancer.csv', index=False)


In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load datasets (one CSV per feature-engineering variant, keyed by variant name)
datasets = {
    'converted_features_breast_cancer': pd.read_csv("converted_features_breast_cancer.csv"),
    'normalized_features_breast_cancer': pd.read_csv("normalized_features_breast_cancer.csv"),
    'interaction_features_breast_cancer': pd.read_csv("interaction_features_breast_cancer.csv"),
    'selected_feature_breast_cancer': pd.read_csv("selected_feature_breast_cancer.csv"),
    'converted_normalized_features_breast_cancer': pd.read_csv("converted_normalized_features_breast_cancer.csv")
}

# Feature columns to train on, keyed by dataset name. A lookup table (rather
# than an if/elif chain) guarantees that an unrecognised dataset name can never
# silently reuse the column list left over from the previous iteration.
FEATURE_COLUMNS = {
    'converted_features_breast_cancer': ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension'],
    'normalized_features_breast_cancer': ['radius', 'area', 'smoothness'],
    'interaction_features_breast_cancer': ['radius_texture', 'perimeter_area', 'compactness_smoothness', 'concavity_symmetry'],
    # NOTE(review): the CSV generated earlier in this notebook also contains
    # 'radius_texture' and 'perimeter_area' for this dataset; they are not used
    # as features here -- confirm whether that omission is intentional.
    'selected_feature_breast_cancer': ['radius', 'area', 'perimeter', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry'],
    'converted_normalized_features_breast_cancer': ['radius', 'area', 'smoothness', 'radius_normalized', 'area_normalized'],
}

# Name of the label column expected in every dataset.
TARGET_COLUMN = 'Diagnosis'

# One seed for both the train/test split and the tree itself, so a fresh
# "Restart & Run All" reproduces the same accuracies.
RANDOM_STATE = 1

# Maps dataset name -> test-set accuracy (fraction in [0, 1]).
accuracies = {}

# Iterate through datasets and calculate accuracy
for name, dataset in datasets.items():
    feature_cols = FEATURE_COLUMNS.get(name)
    if feature_cols is None:
        print(f"Dataset '{name}' has no feature columns configured; skipping")
        continue

    # Ensure all required columns (features and target) exist in the dataset
    required_cols = feature_cols + [TARGET_COLUMN]
    missing_cols = [col for col in required_cols if col not in dataset.columns]
    if missing_cols:
        print(f"Dataset '{name}' is missing columns: {missing_cols}")
        continue

    # Prepare data for training: 70% train / 30% test
    x = dataset[feature_cols]
    y = dataset[TARGET_COLUMN]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=RANDOM_STATE
    )

    # Create and train the decision tree classifier. random_state is set
    # because the tree permutes features at each split, so ties between
    # equally good splits would otherwise be broken differently across runs.
    clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf = clf.fit(x_train, y_train)

    # Make predictions and calculate accuracy on the held-out split
    y_pred = clf.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    # Store the accuracy in the dictionary
    accuracies[name] = accuracy

# Print out the results
for dataset_name, accuracy in accuracies.items():
    print(f"{dataset_name} Accuracy: {accuracy:.2%}")


converted_features_breast_cancer Accuracy: 56.67%
normalized_features_breast_cancer Accuracy: 46.67%
interaction_features_breast_cancer Accuracy: 63.33%
selected_feature_breast_cancer Accuracy: 70.00%
converted_normalized_features_breast_cancer Accuracy: 50.00%
