<a href="https://colab.research.google.com/github/Nachi2006/MLREPO/blob/main/Adult_Salary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CODECHEF-VIT RECRUITMENTS 2025

In [None]:
import pandas as pd
# UCI Adult ("Census Income") data file. Column names are supplied explicitly
# via `names=` below (presumably because the raw file has no header row).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]
# `skipinitialspace=True` strips the blank that follows each comma.
# Unknown values are written as "?" in this dataset; mapping them to NaN lets
# the missing-value report and the imputation step further down see them
# instead of treating "?" as a regular category.
data = pd.read_csv(url, names=columns, skipinitialspace=True, na_values="?")

In [None]:
!pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.25.1 xgboost-2.1.4


In [None]:
class DataAnalyzer:
    """Data-quality report, cleaning and exploratory plots for a DataFrame.

    The analyzer operates directly on the frame it is given: ``clean_data``
    (and ``_analyze_time_trends``, when a ``date`` column exists) modify
    ``self.data`` in place, so the caller's object changes as well.

    Plots are written as PNG files below ``visualizations/``; that directory
    is created by ``perform_eda``.
    """

    def __init__(self, data):
        """Store the DataFrame to analyse.

        Args:
            data: pandas DataFrame; it is kept by reference, not copied.
        """
        self.data = data
        # Column indexes filled in by perform_eda().
        self.numerical_features = None
        self.categorical_features = None
        # Not used by any method of this class.
        self.target = None

    def analyze_data_quality(self):
        """Print shape, dtypes, missing-value counts, duplicates and memory use."""
        print("=== Data Quality Analysis ===")
        print("\nDataset Shape:", self.data.shape)
        print("\nData Types:\n", self.data.dtypes)
        print("\nMissing Values:\n", self.data.isnull().sum())
        print("\nDuplicate Rows:", self.data.duplicated().sum())

        # deep=True also counts the Python string objects; convert bytes -> MiB.
        memory_usage = self.data.memory_usage(deep=True).sum() / 1024**2
        print(f"\nMemory Usage: {memory_usage:.2f} MB")

    def clean_data(self):
        """Drop duplicate rows and fill missing values, in place.

        Numeric columns are filled with their median, all other columns with
        their most frequent value. A column that contains no observed value
        at all is left untouched because there is nothing to impute from.
        """
        print("\n=== Data Cleaning ===")

        # Remove duplicates (in place so that callers holding the same frame
        # see the cleaned data).
        initial_rows = len(self.data)
        self.data.drop_duplicates(inplace=True)
        print(f"Removed {initial_rows - len(self.data)} duplicate rows")

        # Handle missing values
        for column in self.data.columns:
            series = self.data[column]
            if not series.isnull().any():
                continue

            is_number = (pd.api.types.is_numeric_dtype(series)
                         and not pd.api.types.is_bool_dtype(series))
            if is_number:
                fill_value = series.median()
            else:
                modes = series.mode()
                if modes.empty:
                    # Every value is missing: no mode exists.
                    continue
                fill_value = modes.iloc[0]

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selected column: the chained form
            # is deprecated and does not update the frame under pandas
            # copy-on-write.
            self.data[column] = series.fillna(fill_value)

        print("Missing values handled")

    def perform_eda(self):
        """Print summary statistics and write all exploratory plots to disk.

        Sets ``numerical_features`` and ``categorical_features`` as a side
        effect; the private plotting helpers rely on them.
        """
        print("\n=== Exploratory Data Analysis ===")

        # Separate numerical and categorical columns
        self.numerical_features = self.data.select_dtypes(include='number').columns
        self.categorical_features = self.data.select_dtypes(include=['object']).columns

        # Descriptive statistics
        print("\nNumerical Features Summary:")
        print(self.data[self.numerical_features].describe())

        print("\nCategorical Features Summary:")
        for cat_col in self.categorical_features:
            print(f"\n{cat_col} value counts:")
            print(self.data[cat_col].value_counts().head())

        # Create visualizations directory
        import os
        os.makedirs('visualizations', exist_ok=True)

        # Distribution plots for numerical features
        self._plot_distributions()

        # Correlation analysis
        self._plot_correlation()

        # Categorical feature analysis
        self._analyze_categorical_features()

        if 'date' in self.data.columns:
            self._analyze_time_trends()

    def _plot_distributions(self):
        """Save one histogram (with KDE overlay) per numerical feature."""
        for col in self.numerical_features:
            plt.figure(figsize=(10, 6))
            sns.histplot(self.data[col], kde=True)
            plt.title(f'Distribution of {col}')
            plt.savefig(f'visualizations/{col}_distribution.png')
            plt.close()

    def _plot_correlation(self):
        """Save a correlation heatmap of the numerical features."""
        if len(self.numerical_features) == 0:
            # Nothing to correlate; an empty matrix cannot be drawn.
            return

        plt.figure(figsize=(12, 8))
        sns.heatmap(self.data[self.numerical_features].corr(),
                    annot=True, cmap='coolwarm', center=0)
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        plt.savefig('visualizations/correlation_heatmap.png')
        plt.close()

    def _analyze_categorical_features(self):
        """Save one bar chart of value counts per categorical feature."""
        for col in self.categorical_features:
            plt.figure(figsize=(10, 6))
            self.data[col].value_counts().plot(kind='bar')
            plt.title(f'Distribution of {col}')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f'visualizations/{col}_distribution.png')
            plt.close()

    def _analyze_time_trends(self):
        """Plot the number of rows per month when a ``date`` column exists.

        Side effects on ``self.data``: ``date`` is converted to datetime and
        the derived columns ``month`` and ``day_of_week`` are added.
        """
        if 'date' not in self.data.columns:
            return

        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data['month'] = self.data['date'].dt.month
        self.data['day_of_week'] = self.data['date'].dt.dayofweek

        # Monthly trends
        plt.figure(figsize=(12, 6))
        monthly_data = self.data.groupby('month').size()
        monthly_data.plot(kind='line', marker='o')
        plt.title('Monthly Trends')
        plt.savefig('visualizations/monthly_trends.png')
        plt.close()

class IncomePredictor:
    """Train and compare several binary classifiers on a tabular dataset.

    Typical use: ``prepare_data()`` followed by ``train_models()``.
    Plots are written as PNG files below ``visualizations/``.
    """

    def __init__(self, data, target_column='income', positive_label='>50K'):
        """Store the data and initialise all state set by later steps.

        Args:
            data: pandas DataFrame holding the features and the target.
            target_column: name of the target column in ``data``.
            positive_label: value of ``target_column`` that is encoded as
                class 1; every other value becomes class 0.
        """
        self.data = data
        self.target_column = target_column
        self.positive_label = positive_label
        self.models = {}
        # Per-model metrics, filled by train_models().
        self.results = {}
        # Name (dictionary key in self.models) of the model with the best F1.
        self.best_model = None
        # Fitted StandardScaler and final column names, set by prepare_data().
        self.scaler = None
        self.feature_names = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    @staticmethod
    def _ensure_output_dir():
        """Create the plot directory so savefig() works on a fresh checkout."""
        import os
        os.makedirs('visualizations', exist_ok=True)

    def prepare_data(self):
        """Encode, split and scale the data for modelling.

        Steps: binarise the target, one-hot encode object columns, split
        80/20 with a fixed seed, then standardise the numeric columns using
        statistics from the training part only.
        """
        print("\n=== Preparing Data for Modeling ===")

        # Separate features and target
        X = self.data.drop(self.target_column, axis=1)
        y = (self.data[self.target_column] == self.positive_label).astype(int)

        # Identify numerical and categorical columns
        numerical_features = list(X.select_dtypes(include='number').columns)
        categorical_features = list(X.select_dtypes(include=['object']).columns)

        # One-hot encode before splitting so both parts share the same
        # columns; untouched columns come first, dummies follow in order.
        if categorical_features:
            X_processed = pd.get_dummies(
                X, columns=categorical_features, drop_first=True
            )
        else:
            X_processed = X.copy()

        # Split the data
        X_train, X_test, self.y_train, self.y_test = train_test_split(
            X_processed, y, test_size=0.2, random_state=42
        )
        # Work on independent copies so the assignments below neither warn
        # nor write through to X_processed.
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Fit the scaler on the training rows only. Fitting it on the full
        # dataset would leak test-set mean/variance into training.
        if numerical_features:
            self.scaler = StandardScaler()
            X_train[numerical_features] = self.scaler.fit_transform(
                X_train[numerical_features]
            )
            X_test[numerical_features] = self.scaler.transform(
                X_test[numerical_features]
            )

        self.X_train, self.X_test = X_train, X_test

        print("Data preparation completed")
        print(f"Training set shape: {self.X_train.shape}")
        print(f"Testing set shape: {self.X_test.shape}")

        # Store feature names for later use
        self.feature_names = X_processed.columns.tolist()

    def train_models(self):
        """Fit every model, print its test metrics and pick the best by F1.

        Metrics are kept in ``self.results``; ``self.best_model`` receives
        the *name* of the winning model (a key of ``self.models``).

        Raises:
            RuntimeError: if ``prepare_data()`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("Call prepare_data() before train_models()")

        print("\n=== Training Models ===")

        # Define models
        self.models = {
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': xgb.XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                random_state=42,
                eval_metric='logloss'
            ),
            # probability=True is required for predict_proba in the ROC plot.
            'SVM': SVC(probability=True, random_state=42)
        }

        # Train and evaluate each model
        results = {}
        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            model.fit(self.X_train, self.y_train)

            # Make predictions
            y_pred = model.predict(self.X_test)

            # Calculate metrics; zero_division=0 reports 0 instead of
            # warning when a model predicts no positive sample.
            accuracy = accuracy_score(self.y_test, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(
                self.y_test, y_pred, average='binary', zero_division=0
            )

            results[name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }

            print(f"{name} Results:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1-score: {f1:.4f}")

            # Add feature importance for tree-based models
            if name in ['Decision Tree', 'Random Forest', 'XGBoost']:
                self._plot_feature_importance(model, name)

        self.results = results

        # Plot ROC curves
        self._plot_roc_curves()

        # Select best model
        self.best_model = max(results.items(), key=lambda x: x[1]['f1'])[0]
        print(f"\nBest performing model: {self.best_model}")

    def _plot_feature_importance(self, model, name):
        """Save a bar chart of ``model.feature_importances_``, highest first.

        Does nothing for models without a ``feature_importances_`` attribute.
        """
        if not hasattr(model, 'feature_importances_'):
            return

        importance = model.feature_importances_
        indices = np.argsort(importance)[::-1]  # descending importance

        self._ensure_output_dir()
        plt.figure(figsize=(12, 6))
        plt.title(f'Feature Importance ({name})')
        plt.bar(range(len(importance)), importance[indices])
        plt.xticks(range(len(importance)),
                   [self.feature_names[i] for i in indices],
                   rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f'visualizations/feature_importance_{name.lower().replace(" ", "_")}.png')
        plt.close()

    def _plot_roc_curves(self):
        """Save one figure with the test-set ROC curve of every model.

        Models that do not expose ``predict_proba`` are skipped.
        """
        self._ensure_output_dir()
        plt.figure(figsize=(10, 8))

        for name, model in self.models.items():
            if not hasattr(model, 'predict_proba'):
                continue
            # Column 1 holds the probability of the positive class.
            y_pred_proba = model.predict_proba(self.X_test)[:, 1]
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            roc_auc = auc(fpr, tpr)

            plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

        # Diagonal = performance of a random classifier.
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for Different Models')
        plt.legend(loc="lower right")
        plt.savefig('visualizations/roc_curves.png')
        plt.close()

def main(df=None):
    """Run the full pipeline: quality report, cleaning, EDA and modelling.

    Args:
        df: DataFrame to analyse. When omitted, the notebook-level ``data``
            frame is used, which keeps a bare ``main()`` call working.
    """
    if df is None:
        df = data

    # Seed NumPy's global random generator.
    np.random.seed(42)

    # Initialize analyzers
    data_analyzer = DataAnalyzer(df)

    # Perform data quality analysis and cleaning
    data_analyzer.analyze_data_quality()
    data_analyzer.clean_data()

    # Perform EDA
    data_analyzer.perform_eda()

    # Prepare for income prediction. Hand over the analyzer's frame so the
    # model is trained on the cleaned data regardless of whether cleaning
    # mutated the input in place.
    income_predictor = IncomePredictor(data_analyzer.data)
    income_predictor.prepare_data()
    income_predictor.train_models()

if __name__ == "__main__":
    main()


=== Data Quality Analysis ===

Dataset Shape: (32561, 15)

Data Types:
 age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

Missing Values:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

Duplicate Rows: 24

Memory Usage: 19.88 MB

=== Data Cleaning ===
Removed 24 duplicate rows
Missing values handled

=== Exploratory Data Analysis ===

Numerical Features Summary:
                age        f

# Step 4: Results and Inferences

**Guidelines**: List out your inferences here

In [None]:
"""
Observation 1: The average age is around 38.
Observation 2: The most frequent education level is High School Graduate.
Observation 3: The dataset is imbalanced, which could lead to misleading results.
Observation 4: All models reach around 85% accuracy, which is pretty good.
Observation 5: XGBoost is the best-performing model.

Note: To address the class imbalance, we could oversample the minority class using SMOTE.
"""