# Machine Learning Classification Project

*Generated by Classify AI - Multi-Agent System*

## Project Information


**Project Description**: Test classification workflow
**Target Variable**: target
**Dataset Shape**: 1000 rows × 5 columns
**Generated on**: 2025-10-14 01:20:41
**Session ID**: test_session_123


## Required Libraries and Imports

In [None]:

# Core data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                           classification_report, confusion_matrix, roc_auc_score, roc_curve)
from sklearn.feature_selection import SelectKBest, f_classif

# Advanced ML libraries
import xgboost as xgb
import lightgbm as lgb

# Data visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Model persistence
import joblib
import pickle

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency

print("All required libraries imported successfully!")


## 1. Dataset Loading and Initial Exploration

In [None]:

# Load the dataset from the CSV file expected next to this notebook.
df = pd.read_csv('dataset.csv')

# Basic dataset information
print("Dataset Information:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("Target column: 'target'")
print("\nFirst few rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report directly and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
df.info()

print("\nBasic Statistics:")
print(df.describe())


## 2. Comprehensive Data Cleaning

### 2.1 Missing Value Analysis

In [None]:

# Comprehensive missing value analysis
def analyze_missing_values(df, target_column=None):
    """Summarise and plot the missing values of each column.

    Args:
        df: DataFrame to inspect.
        target_column: Accepted for interface compatibility; it is not used
            by the analysis.

    Returns:
        DataFrame with the columns ``Column``, ``Missing_Count`` and
        ``Missing_Percentage``, sorted by descending percentage.
    """
    missing_stats = df.isnull().sum()
    missing_percent = (missing_stats / len(df)) * 100

    missing_df = pd.DataFrame({
        'Column': missing_stats.index,
        'Missing_Count': missing_stats.values,
        'Missing_Percentage': missing_percent.values
    }).sort_values('Missing_Percentage', ascending=False)

    columns_with_gaps = missing_df[missing_df['Missing_Count'] > 0]

    print("Missing Value Analysis:")
    print(columns_with_gaps)

    # Visualize missing values
    if missing_stats.sum() > 0:
        # Draw on an explicit Axes: DataFrame.plot() opens its own figure when
        # no `ax` is given, which would leave an empty 12x6 figure behind and
        # ignore the requested size.
        _, ax = plt.subplots(figsize=(12, 6))
        columns_with_gaps.plot(x='Column', y='Missing_Percentage', kind='bar', ax=ax)
        ax.set_title('Missing Values by Column')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return missing_df

# The label column of this dataset is named 'target'.
missing_analysis = analyze_missing_values(df, 'target')


### 2.2 Data Type Validation and Conversion

In [None]:

# Data type validation and conversion
def validate_and_convert_types(df):
    """Return a copy of ``df`` with object columns converted where possible.

    Each object-dtype column is first coerced to numeric; the conversion is
    kept when at least one value parses (values that do not parse become NaN).
    Columns that are still object-typed afterwards are converted to datetime
    only when every value parses as a date.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        The converted copy of ``df``.
    """
    df_cleaned = df.copy()

    for col in df_cleaned.columns:
        # Try to convert to numeric
        if df_cleaned[col].dtype == 'object':
            numeric_converted = pd.to_numeric(df_cleaned[col], errors='coerce')
            # Keep the conversion as soon as anything parsed; unparseable
            # entries are turned into NaN by errors='coerce'.
            if not numeric_converted.isna().all():
                df_cleaned[col] = numeric_converted
                print(f"Converted {col} to numeric")

        # Check for datetime columns
        if df_cleaned[col].dtype == 'object':
            try:
                # errors='raise' makes this all-or-nothing: a single
                # unparseable value leaves the column untouched.
                datetime_converted = pd.to_datetime(df_cleaned[col], errors='raise')
            except (ValueError, TypeError, OverflowError):
                # Not a datetime column; keep it as text.
                continue
            df_cleaned[col] = datetime_converted
            print(f"Converted {col} to datetime")

    print("\nData types after conversion:")
    print(df_cleaned.dtypes)
    return df_cleaned

# Continue with a type-corrected copy; validate_and_convert_types copies its
# input, so the raw `df` stays as loaded.
df_typed = validate_and_convert_types(df)


### 2.3 Outlier Detection

In [None]:

# Comprehensive outlier detection
def detect_outliers_iqr(df, columns=None):
    """Flag outliers per column using the 1.5 * IQR fences.

    Args:
        df: DataFrame to inspect.
        columns: Columns to analyse; defaults to all numeric columns of ``df``.

    Returns:
        Dict mapping each column to its outlier ``count``, ``percentage`` of
        all rows, and the ``lower_bound`` / ``upper_bound`` fences.
    """
    selected = df.select_dtypes(include=[np.number]).columns if columns is None else columns

    def _summarise(name):
        # Fences sit 1.5 interquartile ranges outside the quartiles.
        first_quartile = df[name].quantile(0.25)
        third_quartile = df[name].quantile(0.75)
        spread = third_quartile - first_quartile
        floor = first_quartile - 1.5 * spread
        ceiling = third_quartile + 1.5 * spread

        flagged = df[(df[name] < floor) | (df[name] > ceiling)]
        return {
            'count': len(flagged),
            'percentage': (len(flagged) / len(df)) * 100,
            'lower_bound': floor,
            'upper_bound': ceiling
        }

    return {name: _summarise(name) for name in selected}

def detect_outliers_zscore(df, columns=None, threshold=3):
    """Count outliers per column using the absolute z-score.

    Missing values are ignored when computing the z-scores. Outliers are
    counted directly from the scores instead of masking ``df``: a mask built
    from the NaN-dropped column no longer lines up with the full frame.

    Args:
        df: DataFrame to inspect.
        columns: Columns to analyse; defaults to all numeric columns of ``df``.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        Dict mapping each column to its outlier ``count`` and ``percentage``
        (relative to the total number of rows in ``df``).
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    total_rows = len(df)
    outlier_info = {}

    for col in columns:
        values = df[col].dropna().to_numpy(dtype=float)

        # The z-score is undefined for an empty or constant column (zero
        # standard deviation); such a column has no outliers.
        if values.size == 0 or np.ptp(values) == 0:
            outlier_count = 0
        else:
            z_scores = np.abs(stats.zscore(values))
            outlier_count = int(np.count_nonzero(z_scores > threshold))

        outlier_info[col] = {
            'count': outlier_count,
            'percentage': (outlier_count / total_rows) * 100 if total_rows else 0.0
        }

    return outlier_info

# Detect outliers with both methods on every numeric column.
numeric_columns = df_typed.select_dtypes(include=[np.number]).columns
iqr_outliers = detect_outliers_iqr(df_typed, numeric_columns)
zscore_outliers = detect_outliers_zscore(df_typed, numeric_columns)

# Report only the columns where the IQR method flagged at least one row.
print("Outlier Detection Results:")
for col in numeric_columns:
    iqr_summary = iqr_outliers[col]
    if iqr_summary['count'] > 0:
        print(f"{col}: {iqr_summary['count']} outliers ({iqr_summary['percentage']:.2f}%)")


### 2.4 Missing Value Imputation

In [None]:

# Advanced missing value imputation
def impute_missing_values(df, target_column=None):
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are filled with their median (robust to outliers); all
    other columns are filled with their most frequent value, or the string
    ``'Unknown'`` when the column has no non-null value at all.

    Args:
        df: Input DataFrame. It is not modified.
        target_column: Accepted for interface compatibility; it is not used,
            so the target column is imputed like any other column.

    Returns:
        The imputed copy of ``df``.
    """
    df_imputed = df.copy()

    for col in df_imputed.columns:
        column = df_imputed[col]
        if not column.isnull().any():
            continue

        # Dtype-family check instead of an int64/float64 whitelist, so that
        # float32/int32 and nullable numeric columns also get the median.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on `df_imputed[col]`: the chained in-place form
        # operates on a temporary and has no effect under Copy-on-Write.
        if is_numeric:
            median_value = column.median()
            df_imputed[col] = column.fillna(median_value)
            print(f"Imputed {col} with median: {median_value}")
        else:
            modes = column.mode()
            mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
            df_imputed[col] = column.fillna(mode_value)
            print(f"Imputed {col} with mode: {mode_value}")

    return df_imputed

# The label column of this dataset is named 'target'.
df_imputed = impute_missing_values(df_typed, 'target')
print(f"\nMissing values after imputation: {df_imputed.isnull().sum().sum()}")


### 2.5 Data Quality Assessment

In [None]:

# Data quality assessment
def assess_data_quality(df, target_column):
    """Score the dataset on completeness, consistency and validity.

    Args:
        df: DataFrame to assess.
        target_column: Column excluded from the validity check.

    Returns:
        Dict with the percentage scores ``completeness`` (non-null cells),
        ``consistency`` (non-duplicated rows), ``validity`` (reduced by the
        share of infinite values in numeric feature columns) and ``overall``
        (mean of the three).
    """
    row_count, column_count = df.shape[0], df.shape[1]

    # Completeness: share of cells that hold a value.
    missing_cells = df.isnull().sum().sum()
    completeness_pct = (1 - missing_cells / (row_count * column_count)) * 100

    # Consistency: share of rows that are not exact duplicates.
    duplicate_rows = df.duplicated().sum()
    consistency_pct = (1 - duplicate_rows / len(df)) * 100

    # Validity: start from a perfect score and deduct for infinite values.
    validity_pct = 100
    feature_columns = [
        name for name in df.select_dtypes(include=[np.number]).columns
        if name != target_column
    ]
    for name in feature_columns:
        infinite_values = np.isinf(df[name]).sum()
        if infinite_values > 0:
            validity_pct -= (infinite_values / len(df)) * 100

    quality_metrics = {
        'completeness': completeness_pct,
        'consistency': consistency_pct,
        'validity': max(0, validity_pct),
    }
    # The overall score is the plain average of the three metrics above.
    quality_metrics['overall'] = np.mean(list(quality_metrics.values()))

    return quality_metrics

# The label column of this dataset is named 'target'.
quality_metrics = assess_data_quality(df_imputed, 'target')
print("Data Quality Metrics:")
for metric, score in quality_metrics.items():
    print(f"{metric.title()}: {score:.2f}%")


## 3. Feature Engineering and Selection

In [None]:

# Feature engineering and selection
def engineer_features(df, target_column):
    """Return a model-ready copy of ``df``.

    Categorical feature columns are one-hot encoded (first level dropped) and
    numeric feature columns are standardised. The target column is left as is.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before the train/test split lets test-set statistics influence the
    features. The fitted scaler is also local to this function and is not
    returned, so it cannot be reused on new data.

    Args:
        df: Input DataFrame. It is not modified.
        target_column: Name of the label column to exclude from encoding and
            scaling.

    Returns:
        The transformed copy of ``df``.
    """
    df_features = df.copy()

    # Handle categorical variables ('category' dtype included so that such
    # columns are encoded as well instead of reaching the models as raw
    # labels).
    categorical_columns = [
        col
        for col in df_features.select_dtypes(include=['object', 'category']).columns
        if col != target_column
    ]

    if categorical_columns:
        # One-hot encoding for categorical variables
        df_features = pd.get_dummies(df_features, columns=categorical_columns, drop_first=True)
        print(f"One-hot encoded {len(categorical_columns)} categorical columns")

    # Feature scaling
    numeric_columns = [
        col
        for col in df_features.select_dtypes(include=[np.number]).columns
        if col != target_column
    ]

    if numeric_columns:
        scaler = StandardScaler()
        df_features[numeric_columns] = scaler.fit_transform(df_features[numeric_columns])
        print(f"Scaled {len(numeric_columns)} numeric columns")

    return df_features

# The label column of this dataset is named 'target'.
df_features = engineer_features(df_imputed, 'target')
print(f"\nFeature matrix shape: {df_features.shape}")


## 4. Model Training and Selection

In [None]:

# Prepare data for training
X = df_features.drop(columns=['target'])
y = df_features['target']

# Hold out a stratified 20% test set for the final performance estimate.
# NOTE(review): df_features was already scaled on the full dataset by
# engineer_features, so this split does not by itself prevent leakage of
# test-set statistics into the features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target distribution in training set: {y_train.value_counts().to_dict()}")

# Define multiple models for comparison
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1),
    'SVM': SVC(random_state=42, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train and evaluate models
model_results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Cross-validation score (computed on the training split only)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    model_results[name] = {
        'model': model,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred_test': y_pred_test
    }

    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"CV Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Find best model. Selection uses the cross-validation score so that the
# held-out test set is not used to pick the model and stays an unbiased
# estimate of its performance.
best_model_name = max(model_results, key=lambda name: model_results[name]['cv_mean'])
best_model = model_results[best_model_name]['model']
print(f"\nBest Model: {best_model_name}")
print(f"Best CV Accuracy: {model_results[best_model_name]['cv_mean']:.4f}")
print(f"Test Accuracy: {model_results[best_model_name]['test_accuracy']:.4f}")


### 4.1 Hyperparameter Tuning

In [None]:

# Hyperparameter tuning for the best model
def tune_hyperparameters(model, X_train, y_train, model_name):
    """Grid-search the hyperparameters of ``model`` with 5-fold CV.

    A search space is defined for model names containing 'Random Forest',
    'Logistic Regression' or 'XGBoost' (checked in that order). Any other
    model is returned unchanged.

    Args:
        model: Estimator to tune.
        X_train: Training features.
        y_train: Training labels.
        model_name: Display name used to choose the search space.

    Returns:
        The best estimator found by the search, or ``model`` itself when no
        search space applies.
    """
    search_spaces = (
        ('Random Forest', {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }),
        ('Logistic Regression', {
            'C': [0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }),
        ('XGBoost', {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 0.9, 1.0]
        }),
    )

    # First matching label wins; an empty grid means "nothing to tune".
    param_grid = next(
        (grid for label, grid in search_spaces if label in model_name),
        {},
    )
    if not param_grid:
        return model

    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {model_name}:")
    print(grid_search.best_params_)
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

# Tune hyperparameters for the best model. tune_hyperparameters returns the
# model unchanged when it defines no search grid for `best_model_name`.
tuned_model = tune_hyperparameters(best_model, X_train, y_train, best_model_name)


## 5. Comprehensive Model Evaluation

In [None]:

# Comprehensive model evaluation
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a fitted classifier on the test set and plot the results.

    Prints accuracy, weighted precision/recall/F1, the classification report
    and the confusion matrix, and shows a confusion-matrix heatmap. A ROC
    curve is drawn only for binary problems, because it needs a single
    positive-class score per sample.

    Args:
        model: Fitted estimator exposing ``predict`` and optionally
            ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.
        model_name: Display name used in the printed headings and plot titles.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``confusion_matrix``, ``y_pred`` and ``y_pred_proba``.
        ``y_pred_proba`` is the positive-class probability vector for binary
        problems, the full probability matrix for multiclass problems, and
        ``None`` when the model has no ``predict_proba``.
    """
    # Make predictions
    y_pred = model.predict(X_test)

    y_pred_proba = None
    is_binary = False
    if hasattr(model, 'predict_proba'):
        class_probabilities = model.predict_proba(X_test)
        # Only a two-column output has a well-defined "positive class" column.
        is_binary = class_probabilities.shape[1] == 2
        y_pred_proba = class_probabilities[:, 1] if is_binary else class_probabilities

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Classification report
    print(f"\n=== {model_name} Evaluation Results ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # ROC Curve (binary problems with probabilities only)
    if is_binary:
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend(loc="lower right")
        plt.show()
    elif y_pred_proba is not None:
        print("\nROC curve skipped: it is only drawn for binary classification.")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

# Evaluate the tuned model on the held-out test set.
evaluation_results = evaluate_model(tuned_model, X_test, y_test, f"Tuned {best_model_name}")


### 5.1 Feature Importance Analysis

In [None]:

# Feature importance analysis
def analyze_feature_importance(model, X_train, feature_names):
    """Print and plot the most influential features of a fitted model.

    Uses ``feature_importances_`` when the model exposes it, otherwise
    ``coef_``. For a two-dimensional ``coef_`` only the first row is used.

    Args:
        model: Fitted estimator.
        X_train: Accepted for interface compatibility; it is not used.
        feature_names: Names matching the model's input features, in order.

    Returns:
        DataFrame of features sorted by importance (columns ``feature`` and
        ``importance``) or by absolute coefficient (columns ``feature`` and
        ``coefficient``); ``None`` when the model exposes neither attribute.
    """
    if hasattr(model, 'feature_importances_'):
        # Tree-based models
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("Top 10 Most Important Features:")
        print(importance_df.head(10))

        # Plot feature importance
        plt.figure(figsize=(10, 8))
        top_features = importance_df.head(15)
        plt.barh(range(len(top_features)), top_features['importance'])
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Feature Importance')
        plt.title('Top 15 Most Important Features')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        return importance_df

    if hasattr(model, 'coef_'):
        # Linear models
        coefficients = model.coef_[0] if np.ndim(model.coef_) > 1 else model.coef_
        coef_df = pd.DataFrame({
            'feature': feature_names,
            'coefficient': coefficients
        }).sort_values('coefficient', key=abs, ascending=False)

        print("Top 10 Most Important Features (by coefficient magnitude):")
        print(coef_df.head(10))

        # Plot coefficients; colour encodes the sign of each coefficient.
        plt.figure(figsize=(10, 8))
        top_features = coef_df.head(15)
        colors = ['red' if x < 0 else 'blue' for x in top_features['coefficient']]
        plt.barh(range(len(top_features)), top_features['coefficient'], color=colors)
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Coefficient Value')
        plt.title('Top 15 Most Important Features (Coefficients)')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        return coef_df

    print("Feature importance not available for this model type")
    return None

# Analyze feature importance of the tuned model. The result is a DataFrame,
# or None when the model exposes neither feature_importances_ nor coef_.
feature_importance = analyze_feature_importance(tuned_model, X_train, X.columns)


## 6. Model Persistence and Deployment

In [None]:

# Save the trained model and preprocessing objects
import joblib
from datetime import datetime

# One timestamp for both artifacts so the two files of a run share a suffix.
saved_at = datetime.now()
timestamp = saved_at.strftime('%Y%m%d_%H%M%S')

# A scaler is stored only if one is bound at module level; otherwise None.
fitted_scaler = globals().get('scaler')
feature_names = list(X.columns)

# Create a model package
model_package = {
    'model': tuned_model,
    'scaler': fitted_scaler,
    'feature_names': feature_names,
    'target_column': 'target',
    'model_name': best_model_name,
    'training_date': saved_at.isoformat(),
    'performance_metrics': evaluation_results
}

# Save model
model_filename = f"trained_model_{timestamp}.joblib"
joblib.dump(model_package, model_filename)
print(f"Model saved as: {model_filename}")

# Save preprocessing pipeline
pipeline_filename = f"preprocessing_pipeline_{timestamp}.joblib"
preprocessing_pipeline = {
    'imputer': None,  # Add imputer if used
    'scaler': fitted_scaler,
    'encoder': None,  # Add encoder if used
    'feature_names': feature_names
}
joblib.dump(preprocessing_pipeline, pipeline_filename)
print(f"Preprocessing pipeline saved as: {pipeline_filename}")

# Example of how to load a saved model package and use it
def load_and_predict(model_path, new_data):
    """Load a saved model package and predict on ``new_data``.

    Args:
        model_path: Path to a joblib file holding a dict with a ``'model'``
            entry.
        new_data: Features to predict on. They are passed to the model as
            given; no preprocessing is applied here.

    Returns:
        The model's predictions for ``new_data``.
    """
    estimator = joblib.load(model_path)['model']

    # Preprocess new data (same steps as training)
    # ... preprocessing code ...

    return estimator.predict(new_data)

print("\nModel persistence completed successfully!")


## 7. Usage Instructions and Next Steps


# Usage Instructions

## How to Use This Model

### 1. Loading the Model
```python
import joblib
model_package = joblib.load('trained_model_YYYYMMDD_HHMMSS.joblib')
model = model_package['model']
```

### 2. Making Predictions
```python
import pandas as pd

# Load new data
new_data = pd.read_csv('new_data.csv')

# Apply the same preprocessing steps that were used for training
# ... (use the preprocessing pipeline from this notebook)

# Make predictions
predictions = model.predict(new_data)
probabilities = model.predict_proba(new_data)  # only if the model implements predict_proba
```

### 3. Model Monitoring
- Monitor model performance over time
- Retrain when performance degrades
- Track prediction distributions

## Next Steps

1. **Deploy to Production**: Set up model serving infrastructure
2. **Monitor Performance**: Implement monitoring and alerting
3. **Continuous Learning**: Set up retraining pipeline
4. **A/B Testing**: Compare with baseline models
5. **Feature Engineering**: Explore additional features

## Model Performance Summary

These values are produced when the notebook is run; read them from the variables shown below.

- **Best Model**: `best_model_name`
- **Test Accuracy**: `evaluation_results['accuracy']`
- **F1-Score**: `evaluation_results['f1']`
- **Precision**: `evaluation_results['precision']`
- **Recall**: `evaluation_results['recall']`

## Important Notes

- Always apply the same preprocessing steps to new data
- Monitor for data drift and concept drift
- Retrain the model periodically with new data
- Document any changes to the preprocessing pipeline
