# Machine Learning Classification Project

## Dataset Loading

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Read the raw CSV from the working directory into the frame that every
# later cell starts from.
raw_path = 'dataset.csv'
df = pd.read_csv(raw_path)

# Quick sanity output: (rows, columns) of the raw data and the label column
# that the later cells treat as the prediction target.
print(f"Dataset shape: {df.shape}")
print("Target column: 'species'")


## Data Cleaning

In [None]:

# Data cleaning steps
# Work on a copy so the raw `df` stays untouched and this cell can be re-run
# any number of times with the same result.
cleaned_df = df.copy()

# Handle missing values: impute each numeric column with its own mean.
# numeric_only=True is required here: on pandas >= 2.0, DataFrame.mean()
# raises TypeError when the frame contains non-numeric columns (presumably the
# 'species' label is a string column -- confirm against the data). Non-numeric
# columns are left as they are, so any missing values in them remain.
numeric_means = cleaned_df.mean(numeric_only=True)
cleaned_df = cleaned_df.fillna(numeric_means)

# Remove duplicates (exact duplicate rows; the first occurrence is kept).
cleaned_df = cleaned_df.drop_duplicates()

# Data quality score = percentage of cells that are non-missing after
# cleaning. This is measured rather than hard-coded, so it only reports 100.00
# when no missing values remain. An empty frame has no missing cells and is
# reported as 100.00 as well (avoids a division by zero).
total_cells = cleaned_df.size
missing_cells = int(cleaned_df.isna().sum().sum())
quality_score = 100.0 * (1 - missing_cells / total_cells) if total_cells else 100.0

print(f"Cleaned dataset shape: {cleaned_df.shape}")
print(f"Data quality score: {quality_score:.2f}")


## Model Training

In [None]:

# Prepare features and target: 'species' is the label, every other column is
# an input feature.
X = cleaned_df.drop(columns=['species'])
y = cleaned_df['species']

# Handle categorical variables: one-hot encode non-numeric feature columns.
# drop_first=True drops one level per encoded column so the dummy columns are
# not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# Split data: hold out 20% of the rows for testing; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model.
# The original call `random_forest()` referenced a name that is never defined
# or imported, so this cell failed with NameError on a fresh kernel. The
# estimator imported at the top of the notebook is RandomForestClassifier.
# It is seeded with the same value as the split so that Restart & Run All
# reproduces the same model and the same accuracy.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows and report the fraction classified
# correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")


## Model Evaluation

In [None]:

# Per-class metrics for the held-out predictions, printed under a heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Feature importance: only estimators that expose `feature_importances_`
# get a ranking; any other model skips this section silently.
importances = getattr(model, 'feature_importances_', None)
if importances is not None:
    # Pair each encoded feature column with its importance, highest first.
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': importances}
    ).sort_values(by='importance', ascending=False)

    # Show only the ten most important features.
    print("\nFeature Importance:")
    print(feature_importance.head(10))
