In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [38]:
'''
Question 6:   Write a Python program to:
● Load the Iris Dataset
● Train a Decision Tree Classifier using the Gini criterion
● Print the model’s accuracy and feature importances
'''

# importing the dataset
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable (species)
feature_names = iris.feature_names

# Split the dataset into training and testing sets
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create a Decision Tree Classifier with Gini criterion
dtree = DecisionTreeClassifier(criterion='gini', random_state=1)

# Train the model
dtree.fit(X_train, y_train)

# Make predictions
y_pred = dtree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print feature importances
importances = dtree.feature_importances_
for i, feature_name in enumerate(feature_names):
    print(f"Feature '{feature_name}': {importances[i]}")

Accuracy: 0.9666666666666667
Feature 'sepal length (cm)': 0.007520367662419047
Feature 'sepal width (cm)': 0.01880091915604763
Feature 'petal length (cm)': 0.07584565922951901
Feature 'petal width (cm)': 0.8978330539520143


In [37]:
'''
Question 7:  Write a Python program to:
● Load the Iris Dataset
● Train a Decision Tree Classifier with max_depth=3 and compare its accuracy to a fully-grown tree.
'''

# importing the dataset
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable (species)

# Split the dataset into training and testing sets
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create a Decision Tree Classifier with Gini criterion
dtree = DecisionTreeClassifier(criterion='entropy') # Full Grown tree
dtree_purn = DecisionTreeClassifier(criterion='entropy', max_depth= 3,) # Max Depth of 3

# Train the model
dtree.fit(X_train, y_train)
dtree_purn.fit(X_train, y_train)

# Make predictions
y_pred = dtree.predict(X_test)
y_pred_purn = dtree.predict(X_test)

# Evaluate the model
accuracy_purn = accuracy_score(y_test, y_pred_purn)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f"Accuracy for complete tree: {accuracy}")
print(f"Accuracy for complete tree wuth max depth 3 : {accuracy_purn}")

Accuracy for complete tree: 0.9666666666666667
Accuracy for complete tree wuth max depth 3 : 0.9666666666666667


In [43]:
'''
Question 8: Write a Python program to:
● Load the Boston Housing Dataset
● Train a Decision Tree Regressor
● Print the Mean Squared Error (MSE) and feature importances
'''

# importing the dataset
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset
housing = fetch_california_housing()
X = housing.data  # Features
y = housing.target  # Target variable (house value)
feature_names = housing.feature_names

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create a Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=1)

# Train the model
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print feature importances
importances = regressor.feature_importances_
for i, feature_name in enumerate(feature_names):
    print(f"Feature '{feature_name}': {importances[i]}")

Mean Squared Error: 0.495235205629094
Feature 'MedInc': 0.5285090936963706
Feature 'HouseAge': 0.05188353710616045
Feature 'AveRooms': 0.05297496833123543
Feature 'AveBedrms': 0.02866045788296106
Feature 'Population': 0.030515676373806224
Feature 'AveOccup': 0.13083767753210346
Feature 'Latitude': 0.09371656401749287
Feature 'Longitude': 0.08290202505986989


In [52]:
'''
Question 9: Write a Python program to:
● Load the Iris Dataset
● Tune the Decision Tree’s max_depth and min_samples_split using GridSearchCV
● Print the best parameters and the resulting model accuracy
'''

# importing the dataset
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable (species)


# Split the dataset into training and testing sets 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


# Define the parameter grid
param_grid = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10, 15, 20]
}

# Create a Decision Tree classifier
dtree = DecisionTreeClassifier(random_state=1)

# Set up GridSearchCV
grid_search = GridSearchCV(dtree, param_grid, cv=5, scoring='accuracy', refit=True)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
accuracy = best_model.score(X_test, y_test)

print("Best Parameters:", best_params)
print("Accuracy on Test Set:", accuracy)


Best Parameters: {'max_depth': 4, 'min_samples_split': 2}
Accuracy on Test Set: 0.9666666666666667


In [6]:
'''
Question 10: Imagine you’re working as a data scientist for a healthcare company that wants to predict whether a patient has a certain disease. You have a large dataset with mixed data types and some missing values. Explain the step-by-step process you would follow to:
● Handle the missing values
● Encode the categorical features
● Train a Decision Tree model
● Tune its hyperparameters
● Evaluate its performance And describe what business value this model could provide in the real-world setting.
'''

# importing the dataset
df = sns.load_dataset('titanic')

#Handle the missing values

#Droping embark town as there are only 2 cases so droping 2 rows
df_subset = df.dropna(subset=['embark_town']).copy()

#Imputing deck with mode value
mode_value = df_subset[df_subset['deck'].notna()]['deck'].mode()[0]
#df_subset = df_subset['deck'].fillna(mode_value)


#Imputing age with mean value
df_subset.loc[:, 'age'] = df_subset['age'].fillna(df_subset['age'].mean())
df_subset.loc[:, 'deck'] = df_subset['deck'].fillna(df_subset['deck'].mode()[0])
#df_notna.shape
#print(df_subset.notna().sum())


#Encode the categorical features
df_subset = pd.get_dummies(df_subset, columns=['sex'], drop_first=True)
df_subset = pd.get_dummies(df_subset, columns=['embarked'], drop_first=True)
df_subset = pd.get_dummies(df_subset, columns=['deck'], drop_first=True)


# Train a Decision Tree model
x = df_subset.drop('survived', axis=1)
y = df_subset.iloc[:,0]

# Drop non-numeric columns before splitting
x = x.select_dtypes(include=np.number)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)


params = {
    'criterion': ["squared_error", "absolute_error", "friedman_mse", 'poisson'],
    'splitter': ["best", "random"],
    'max_depth': [1, 2, 3, 4, 10],
    'max_features':['sqrt', 'log2', None] # Removed 'auto'
}
regressor = DecisionTreeRegressor()
model = GridSearchCV(regressor, param_grid=params, cv=5, scoring='neg_mean_squared_error', verbose=3)
model.fit(x_train, y_train)

#Check the best model
model.best_params_

final_model = DecisionTreeRegressor(criterion = 'squared_error', max_depth = 4, max_features = 'log2', splitter = 'random')
final_model.fit(x_train, y_train)

y_pred = final_model.predict(x_test)
r2_score(y_pred, y_test)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best;, score=-0.228 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best;, score=-0.200 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best;, score=-0.202 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best;, score=-0.225 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=best;, score=-0.225 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=random;, score=-0.212 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=random;, score=-0.200 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=1, max_features=sqrt, splitter=random;, score=-0.233 total 

-3.5547291537662877