# DECISION TREE CLASSIFICATION MODEL
## CODTECH Internship Task - 1

**Objective:** Build and visualize a Decision Tree model using scikit-learn to classify outcomes

**Dataset:** Iris Dataset (Classic Classification Problem)

---

## 1. IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score
)

import warnings

# NOTE: this silences every warning for the rest of the session, including
# deprecation notices from the plotting / ML libraries. Narrow the filter if
# those messages matter.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn style sheets with a
# "seaborn-v0_8-" prefix. Try the new name first, then the pre-3.6 name; if
# neither exists the default style is kept instead of raising an error.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print("✓ All libraries imported successfully!")

## 2. LOAD AND EXPLORE DATASET

In [None]:
# Load the Iris data and assemble one DataFrame: the measurement columns,
# the integer class label ('target') and its readable name ('species').
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].map(lambda label: iris.target_names[label])

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nClass Distribution:")
print(df['species'].value_counts())

## 3. STATISTICAL SUMMARY

In [None]:
# Descriptive statistics for the numeric columns of the dataset.
summary_stats = df.describe()
print("Statistical Summary:\n")
print(summary_stats)

# Number of null entries found in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

## 4. EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
features = iris.feature_names

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Feature Distributions by Species', fontsize=16, fontweight='bold')

# One panel per feature, filled row by row; every panel overlays one
# semi-transparent histogram per species so the classes can be compared.
for ax, feature in zip(axes.flat, features):
    for species in iris.target_names:
        species_rows = df['species'] == species
        ax.hist(df.loc[species_rows, feature], alpha=0.6, label=species, bins=15)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# sns.pairplot creates its own figure, so no plt.figure() call is made here
# (it would only leave an empty extra figure in the output).
# `vars` limits the grid to the measurement columns; without it the numeric
# 'target' label column would be plotted as if it were a feature.
sns.pairplot(
    df,
    vars=iris.feature_names,
    hue='species',
    diag_kind='kde',
    markers=['o', 's', 'D'],
)
# The pairplot figure is the current figure at this point, so the title
# lands on it; y > 1 places the title just above the top row of panels.
plt.suptitle('Pairwise Feature Relationships', y=1.02, fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Pairwise correlation between the measurement columns only.
correlation_matrix = df[iris.feature_names].corr()

heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Same numbers as the heatmap annotations, in text form.
print("Correlation Matrix:")
print(correlation_matrix)

In [None]:
# Count the rows per species once and reuse the result for both charts.
species_counts = df['species'].value_counts()
class_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: absolute counts per class.
species_counts.plot.bar(ax=bar_ax, color=class_colors)
bar_ax.set_title('Class Distribution', fontsize=12, fontweight='bold')
bar_ax.set_xlabel('Species', fontsize=10)
bar_ax.set_ylabel('Count', fontsize=10)
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: the same counts as percentage shares.
species_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', colors=class_colors)
pie_ax.set_title('Class Distribution (Percentage)', fontsize=12, fontweight='bold')
pie_ax.set_ylabel('')  # drop the automatic y-axis label

plt.tight_layout()
plt.show()

## 5. DATA PREPARATION

In [None]:
# Feature matrix (measurement columns) and integer class labels.
X = df[iris.feature_names]
y = df['target']

# 80/20 split; stratifying on y keeps the class proportions the same in both
# parts, and the fixed random_state makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(f"\nTraining set class distribution:\n{y_train.value_counts().sort_index()}")
print(f"\nTesting set class distribution:\n{y_test.value_counts().sort_index()}")

## 6. BUILD DECISION TREE MODEL

In [None]:
# Hyper-parameters of the tree, collected in one place.
tree_params = {
    'criterion': 'gini',
    'max_depth': 4,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
}

# fit() returns the estimator itself, so construction and training chain.
dt_classifier = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

print("✓ Decision Tree Model trained successfully!")
print(f"\nTree Depth: {dt_classifier.get_depth()}")
print(f"Number of Leaves: {dt_classifier.get_n_leaves()}")
print(f"Number of Features: {dt_classifier.n_features_in_}")

## 7. MODEL PREDICTIONS

In [None]:
# Predict on both splits with the fitted tree.
y_train_pred, y_test_pred = (
    dt_classifier.predict(split) for split in (X_train, X_test)
)

# Fraction of correctly classified samples on each split.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")
# The absolute train/test gap is reported as a quick overfitting indicator.
print(f"\nOverfitting Check: {abs(train_accuracy - test_accuracy) * 100:.2f}% difference")

## 8. MODEL EVALUATION - DETAILED METRICS

In [None]:
# Per-class precision / recall / F1 on the test split, framed by a banner.
banner = "=" * 60
print(banner)
print("CLASSIFICATION REPORT")
print(banner)
print(classification_report(y_test, y_test_pred, target_names=iris.target_names))

# Single-number summaries: per-class scores averaged with class-support weights.
averaging = {'average': 'weighted'}
precision = precision_score(y_test, y_test_pred, **averaging)
recall = recall_score(y_test, y_test_pred, **averaging)
f1 = f1_score(y_test, y_test_pred, **averaging)

print(f"\nWeighted Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

## 9. CONFUSION MATRIX VISUALIZATION

In [None]:
# Rows are true classes, columns are predicted classes (test split).
cm = confusion_matrix(y_test, y_test_pred)
class_labels = iris.target_names

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'},
    linewidths=2,
    linecolor='white',
)
plt.title('Confusion Matrix - Decision Tree Classifier', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.tight_layout()
plt.show()

# Raw matrix, for reference alongside the plot.
print("Confusion Matrix:")
print(cm)

## 10. DECISION TREE VISUALIZATION

In [None]:
# Draw the fitted tree. Nodes are coloured by majority class (filled=True)
# and, because proportion=True, show class shares instead of sample counts.
plt.figure(figsize=(20, 12))
plot_tree(
    dt_classifier,
    feature_names=list(iris.feature_names),
    # iris.target_names is a NumPy array; some scikit-learn releases validate
    # this argument strictly and only accept a list, so convert it explicitly.
    class_names=list(iris.target_names),
    filled=True,
    rounded=True,
    fontsize=10,
    proportion=True
)
plt.title('Decision Tree Structure - Full Visualization', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## 11. FEATURE IMPORTANCE ANALYSIS

In [None]:
# Pair each feature name with the importance score of the fitted tree and
# rank the features from most to least important.
importance_table = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': dt_classifier.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("Feature Importance Scores:")
print(feature_importance)

bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color=bar_colors,
)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title('Feature Importance in Decision Tree', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 12. CROSS-VALIDATION ANALYSIS

In [None]:
# 5-fold cross-validated accuracy over the full dataset.
cv_scores = cross_val_score(dt_classifier, X, y, cv=5, scoring='accuracy')
fold_numbers = range(1, len(cv_scores) + 1)  # 1-based fold labels

print("Cross-Validation Scores:")
for fold, score in zip(fold_numbers, cv_scores):
    print(f"Fold {fold}: {score * 100:.2f}%")

print(f"\nMean CV Accuracy: {cv_scores.mean() * 100:.2f}%")
print(f"Standard Deviation: {cv_scores.std() * 100:.2f}%")

# Per-fold accuracy as a line, with the mean drawn as a dashed reference.
plt.figure(figsize=(10, 6))
plt.plot(
    fold_numbers,
    cv_scores * 100,
    marker='o',
    linewidth=2,
    markersize=10,
    color='#4ECDC4',
)
plt.axhline(
    y=cv_scores.mean() * 100,
    color='red',
    linestyle='--',
    label=f'Mean: {cv_scores.mean() * 100:.2f}%',
)
plt.xlabel('Fold Number', fontsize=12)
plt.ylabel('Accuracy (%)', fontsize=12)
plt.title('Cross-Validation Performance', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(fold_numbers)
plt.tight_layout()
plt.show()

## 13. MODEL COMPARISON - DIFFERENT DEPTHS

In [None]:
# Train one tree per depth limit (1..10) on the training split, then score
# each of them on both splits to show how depth affects fit.
depths = range(1, 11)
depth_models = [
    DecisionTreeClassifier(max_depth=depth_limit, random_state=42).fit(X_train, y_train)
    for depth_limit in depths
]
train_scores = [model.score(X_train, y_train) for model in depth_models]
test_scores = [model.score(X_test, y_test) for model in depth_models]

curves = (
    (train_scores, 'o', 'Training Accuracy'),
    (test_scores, 's', 'Testing Accuracy'),
)

plt.figure(figsize=(12, 6))
for scores, marker, label in curves:
    plt.plot(depths, scores, marker=marker, label=label, linewidth=2)
plt.xlabel('Tree Depth', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Model Performance vs Tree Depth', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(depths)
plt.tight_layout()
plt.show()

# Same numbers as the plot, as a table of percentages.
print("Depth Analysis:")
for d, train, test in zip(depths, train_scores, test_scores):
    print(f"Depth {d}: Train={train*100:.2f}%, Test={test*100:.2f}%")

## 14. PREDICTION EXAMPLES

In [None]:
print("Sample Predictions on Test Set:")
print("="*80)

# Seeded generator: the sampled rows are the same on every run, in line with
# the fixed random_state used elsewhere in this notebook.
rng = np.random.default_rng(42)
# Never ask for more rows than the test set holds (sampling is without
# replacement, so a larger request would raise).
n_samples = min(10, len(X_test))
sample_indices = rng.choice(len(X_test), size=n_samples, replace=False)

# Positional lookup of the true and predicted integer labels for the sample.
actual_labels = y_test.iloc[sample_indices].to_numpy()
predicted_labels = y_test_pred[sample_indices]

results_df = pd.DataFrame({
    'Actual': iris.target_names[actual_labels],
    'Predicted': iris.target_names[predicted_labels],
    'Match': np.where(actual_labels == predicted_labels, '✓', '✗'),
})

print(results_df.to_string(index=False))
print("="*80)

## 15. SAVE MODEL (OPTIONAL)

In [None]:
import pickle

model_path = 'decision_tree_model.pkl'

# Serialise the fitted classifier to disk.
with open(model_path, 'wb') as model_file:
    pickle.dump(dt_classifier, model_file)

print("✓ Model saved as 'decision_tree_model.pkl'")

# Read the file back and re-score it to confirm the round trip worked.
# Only unpickle files you created yourself: loading a pickle runs code.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("✓ Model loaded successfully for verification")
print(f"Loaded model accuracy: {loaded_model.score(X_test, y_test) * 100:.2f}%")

## 16. KEY FINDINGS & CONCLUSIONS

### Model Performance:
- The Decision Tree classifier achieved excellent performance on the Iris dataset
- Similarly high accuracy on the training and testing sets (a small train–test gap) indicates good generalization
- Cross-validation confirms model stability across different data splits

### Feature Importance:
- Petal measurements (length and width) are the most important features
- These features provide the best separation between species

### Model Strengths:
- Easy to interpret and visualize
- No feature scaling required
- Handles non-linear relationships well
- Clear decision rules

### Considerations:
- Tree depth impacts complexity and generalization
- Overfitting can occur with very deep trees
- Pruning or limiting depth helps maintain model simplicity

---

**CODTECH Internship Task Completed Successfully! ✓**