In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load and prepare data
def load_data(filename):
    """
    Read a CSV source into a pandas DataFrame.

    ``filename`` is handed straight to ``pd.read_csv`` and the parsed frame
    is returned as-is; no cleaning or transformation happens here.
    """
    return pd.read_csv(filename)

def preprocess_data(df, target_column):
    """
    Impute missing values and one-hot encode categorical feature columns.

    The work is done on a copy, so the DataFrame passed in by the caller is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data.
    target_column : str
        Name of the label column; it is excluded from dummy encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame in which float64/int64 columns have missing values
        replaced by the column mean, object columns have missing values
        replaced by the column mode, and every object column other than
        ``target_column`` is expanded into dummy columns.
    """
    # Copy first: the column assignments below would otherwise overwrite the
    # caller's data in place.
    df = df.copy()

    # Handle missing values in numeric columns (column mean)
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

    # Handle missing values in categorical columns (most frequent value)
    categorical_columns = df.select_dtypes(include=['object']).columns
    if len(categorical_columns) > 0:
        modes = df[categorical_columns].mode()
        # mode() yields no rows when there is nothing to count (e.g. a frame
        # without rows), so only index into it when a first row exists.
        if not modes.empty:
            df[categorical_columns] = df[categorical_columns].fillna(modes.iloc[0])

    # Convert categorical feature columns to dummy variables; the target is
    # kept as a single column.
    feature_categoricals = [col for col in categorical_columns if col != target_column]
    df = pd.get_dummies(df, columns=feature_categoricals)

    return df

def split_data(df, target_column, test_size=0.2, random_state=42):
    """
    Separate features from the target and make a train/test split.

    Every column except ``target_column`` is treated as a feature. The
    split itself is delegated to ``train_test_split`` with the given
    ``test_size`` and ``random_state``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(target_column, axis=1)
    labels = df[target_column]

    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

# 2. Feature scaling
def scale_features(X_train, X_test):
    """
    Standardize both feature sets with a scaler fitted on the training set.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    both ``X_train`` and ``X_test``.

    Returns ``(X_train_scaled, X_test_scaled, scaler)``.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test, scaler

# 3. Model training and evaluation
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """
    Fit a ``RandomForestClassifier`` on the training data and return it.

    ``n_estimators`` and ``random_state`` are forwarded to the classifier's
    constructor unchanged.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    forest.fit(X_train, y_train)
    return forest

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model against held-out data.

    Predicts on ``X_test`` and compares the predictions with ``y_test``.

    Returns ``(accuracy, report, cm, y_pred)``: the accuracy score, the
    text classification report, the confusion matrix, and the predictions.
    """
    predictions = model.predict(X_test)

    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        confusion_matrix(y_test, predictions),
        predictions,
    )

# 4. Visualization
def plot_confusion_matrix(cm, classes):
    """
    Draw the confusion matrix ``cm`` as an annotated seaborn heatmap.

    ``classes`` supplies the tick labels for both axes. The figure is shown
    with ``plt.show()``; nothing is returned.
    """
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, **heatmap_options)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

def plot_feature_importance(model, feature_names):
    """
    Show a bar chart of the model's feature importances, largest first.

    Reads ``model.feature_importances_`` and labels each bar with the
    matching entry of ``feature_names``. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    importances = model.feature_importances_
    # argsort is ascending, so reverse it to rank from most to least important
    order = np.argsort(importances)[::-1]
    positions = range(len(order))

    plt.figure(figsize=(12, 8))
    plt.title("Feature Importances")
    plt.bar(positions, importances[order])
    tick_labels = [feature_names[idx] for idx in order]
    plt.xticks(positions, tick_labels, rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# 5. Main execution
def main():
    """
    Run the whole pipeline end to end on the iris dataset.

    Loads iris, preprocesses it, splits it, scales the features, trains a
    random forest, prints the accuracy and classification report, and shows
    the confusion-matrix and feature-importance plots.
    """
    # Example usage with iris dataset
    from sklearn.datasets import load_iris
    iris = load_iris()

    # Build the feature frame first and attach the target separately.
    # Stacking both with np.c_ forces a single float dtype, which turns the
    # integer class labels into 0.0 / 1.0 / 2.0 in the report.
    df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
    df['target'] = iris['target']

    # Preprocess data
    df_processed = preprocess_data(df, 'target')

    # Split data
    X_train, X_test, y_train, y_test = split_data(df_processed, 'target')

    # Scale features
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    # Train model
    model = train_model(X_train_scaled, y_train)

    # Evaluate model
    accuracy, report, cm, y_pred = evaluate_model(model, X_test_scaled, y_test)

    # Print results
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(report)

    # Plot results
    plot_confusion_matrix(cm, classes=iris['target_names'])
    # Scaling returns plain arrays, so take the column names from the
    # unscaled training frame.
    plot_feature_importance(model, X_train.columns)



ModuleNotFoundError: No module named 'seaborn'