In [None]:
import subprocess
import sys

def install_eda_libraries():
    """
    Installs the most common libraries for performing Exploratory Data Analysis (EDA),
    excluding less frequently used ones.

    Each entry pairs the pip distribution name with the module name used to
    detect an existing installation, because the two differ for some packages
    (for example 'ydata-profiling' is imported as 'ydata_profiling'). A package
    is installed with pip only when its module cannot be located.

    Raises:
    - subprocess.CalledProcessError: If a pip installation command fails.
    """
    # Local import keeps this function independent of extra file-level imports.
    import importlib
    import importlib.util

    # (pip distribution name, importable module name)
    libraries = [
        # Data manipulation and processing
        ('pandas', 'pandas'),            # Tabular data manipulation
        ('numpy', 'numpy'),              # Numerical operations
        ('scipy', 'scipy'),              # Mathematical and statistical calculations

        # Visualization
        ('matplotlib', 'matplotlib'),    # Basic plots
        ('seaborn', 'seaborn'),          # Advanced visualization
        ('plotly', 'plotly'),            # Interactive plots
        ('missingno', 'missingno'),      # Visualization of missing data

        # Statistical Analysis
        ('statsmodels', 'statsmodels'),  # Statistical models

        # Data Profiling
        # 'pandas-profiling' was renamed; the current distribution is 'ydata-profiling'.
        ('ydata-profiling', 'ydata_profiling'),  # Automatic EDA reports

        # Encoding and manipulating categorical variables
        ('category_encoders', 'category_encoders'),  # Tools for encoding categorical variables

        # Correlation Analysis
        ('phik', 'phik'),  # Advanced correlation metrics

        # Multivariate Analysis
        ('yellowbrick', 'yellowbrick'),  # Visualization of ML metrics and data analysis

        # Missing Data Management
        ('sklearn-pandas', 'sklearn_pandas'),  # Preprocessing pipelines with sklearn integration
    ]

    for package, module in libraries:
        # find_spec locates the module without executing it, so the check is
        # cheap and works with the import name rather than the pip name.
        if importlib.util.find_spec(module) is not None:
            print(f"✅ The library '{package}' is already installed.")
            continue

        print(f"⚠️ Installing '{package}'...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Make the freshly installed package visible to this running interpreter.
        importlib.invalidate_caches()
    print("\n🚀 All necessary libraries for EDA are installed and ready to use.")

# Run the installer only when this code is executed directly (its __name__ is
# "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    install_eda_libraries()


# Usage (when saved as a script): python install_eda.py

In [None]:
def import_eda_libraries():
    """
    Imports the most common libraries for Exploratory Data Analysis (EDA)
    and configures them for optimal use.

    The imports are bound as module-level (global) names so they remain
    available after the function returns: pd, np, sp, plt, sns, go, missingno,
    statsmodels, sm, profile, ce, phik and yellowbrick.

    If any import fails, an error message is printed instead of raising;
    names imported before the failure stay bound, and the display
    configuration at the end is skipped.
    """
    # Every name bound by an import below must be listed here; otherwise the
    # import would only create a local that disappears when the function returns.
    global pd, np, plt, sns, missingno, statsmodels, sm, sp, go, profile, ce, phik, yellowbrick

    try:
        # 📊 Data Manipulation and Processing
        import pandas as pd       # Tabular data manipulation
        import numpy as np        # Numerical operations
        import scipy as sp        # Mathematical and statistical calculations

        # 📈 Visualization
        import matplotlib.pyplot as plt  # Basic plots
        import seaborn as sns     # Advanced statistical plots
        import plotly.graph_objects as go  # Interactive plots
        import missingno          # Visualization of missing data patterns

        # 📊 Statistical Analysis
        import statsmodels            # Binds the declared global 'statsmodels'
        import statsmodels.api as sm  # Statistical models

        # 📑 Data Profiling
        from ydata_profiling import ProfileReport as profile  # Automatic data profiling reports

        # 🏷️ Categorical Variable Encoding
        import category_encoders as ce  # Advanced encoding methods for categorical variables

        # 🔗 Correlation Analysis
        import phik  # Correlation analysis between categorical and numerical variables

        # 📊 Multivariate Analysis
        import yellowbrick  # Visualization of ML metrics and data patterns

        # ⚙️ Basic Configurations
        sns.set(style="whitegrid")
        plt.style.use('ggplot')
        pd.options.display.float_format = '{:.2f}'.format  # Display floats with two decimals

        print("✅ All libraries for EDA have been successfully imported.")

    except ImportError as e:
        print(f"❌ Error importing libraries: {e}")
        print("🔄 Make sure you have executed the library installation function first.")

# Run the imports only when this code is executed directly (its __name__ is
# "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    import_eda_libraries()


# Usage (when saved as a script): python import_eda.py

In [None]:
import subprocess
import sys

def install_ml_libraries():
    """
    Installs the most common libraries for Machine Learning, excluding less frequently used ones.

    Each entry pairs the pip distribution name with the module name used to
    detect an existing installation, because the two differ for some packages
    (for example 'scikit-learn' is imported as 'sklearn'). A package is
    installed with pip only when its module cannot be located.

    Raises:
    - subprocess.CalledProcessError: If a pip installation command fails.
    """
    # Local import keeps this function independent of extra file-level imports.
    import importlib
    import importlib.util

    # (pip distribution name, importable module name)
    libraries = [
        # 📊 Data Manipulation and Processing
        ('numpy', 'numpy'),                # Numerical operations
        ('pandas', 'pandas'),              # Tabular data manipulation
        ('scipy', 'scipy'),                # Mathematical and statistical calculations

        # 📈 Visualization
        ('matplotlib', 'matplotlib'),      # Static plots
        ('seaborn', 'seaborn'),            # Advanced visualization
        ('plotly', 'plotly'),              # Interactive plots

        # 🤖 General Machine Learning
        ('scikit-learn', 'sklearn'),       # Classic Machine Learning algorithms
        ('xgboost', 'xgboost'),            # Gradient Boosting for tabular data
        ('lightgbm', 'lightgbm'),          # Efficient Gradient Boosting
        ('catboost', 'catboost'),          # Optimized Gradient Boosting

        # 🧠 Deep Learning
        ('tensorflow', 'tensorflow'),      # Neural networks and Deep Learning
        ('keras', 'keras'),                # High-level API for TensorFlow
        ('torch', 'torch'),                # Deep Learning framework (PyTorch)
        ('transformers', 'transformers'),  # Advanced NLP models (BERT, GPT, etc.)

        # 🔍 Model Optimization
        ('optuna', 'optuna'),              # Hyperparameter optimization
        ('hyperopt', 'hyperopt'),          # Alternative for hyperparameter optimization

        # 📦 Model Tracking and Management
        ('mlflow', 'mlflow'),              # ML experiment tracking
        ('dvc', 'dvc'),                    # Data version control

        # 📊 Statistical Analysis
        ('statsmodels', 'statsmodels'),    # Statistical models

        # ⚙️ Text and NLP Processing
        ('nltk', 'nltk'),                  # Natural Language Processing
        ('spacy', 'spacy'),                # Efficient NLP processing

        # 🏗️ Data Processing
        # The maintained distribution is 'imbalanced-learn'; it is imported as 'imblearn'.
        ('imbalanced-learn', 'imblearn'),  # Handling imbalanced data
        ('joblib', 'joblib'),              # Model serialization

        # 📊 Visualization and Analysis
        ('yellowbrick', 'yellowbrick'),    # ML metrics visualization

        # 🛡️ Model Validation
        ('shap', 'shap'),                  # Model interpretability
        ('lime', 'lime'),                  # Local interpretation of predictions

        # 🔗 Neural Network Graphs
        ('networkx', 'networkx'),          # Graph modeling and analysis

        # ⚡ Parallel Computing
        ('dask', 'dask'),                  # Parallel data processing
        ('ray', 'ray'),                    # Distributed ML tasks and models

        # 📑 Feature Engineering
        ('feature-engine', 'feature_engine'),  # Advanced feature transformations
    ]

    for package, module in libraries:
        # find_spec locates the module without executing it, which avoids
        # loading heavy frameworks (tensorflow, torch, ...) just to test presence.
        if importlib.util.find_spec(module) is not None:
            print(f"✅ The library '{package}' is already installed.")
            continue

        print(f"⚠️ Installing '{package}'...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Make the freshly installed package visible to this running interpreter.
        importlib.invalidate_caches()

    print("\n🚀 All necessary libraries for Machine Learning are installed and ready to use.")

# Run the installer only when this code is executed directly (its __name__ is
# "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    install_ml_libraries()

# Usage (when saved as a script): python install_ml.py

In [None]:
def import_ml_libraries():
    """
    Imports the most common libraries for Machine Learning and sets up basic configurations.

    The imports are bound as module-level (global) names so they remain
    available after the function returns, including the sklearn helpers
    (train_test_split, GridSearchCV, RandomizedSearchCV, accuracy_score,
    confusion_matrix, classification_report) and the transformers `pipeline`
    factory.

    If any import fails, an error message is printed instead of raising;
    names imported before the failure stay bound, and the display
    configuration at the end is skipped.
    """
    # Every name bound by an import below must be listed here; otherwise the
    # import would only create a local that disappears when the function returns.
    global pd, np, plt, sns, go
    global sklearn, tf, keras, xgb, lgb, cb, sm, sp, torch, transformers, pipeline
    global train_test_split, GridSearchCV, RandomizedSearchCV
    global accuracy_score, confusion_matrix, classification_report
    global optuna, shap, lime, dask, ray, imblearn
    global feature_engine

    try:
        # 📊 Data Manipulation
        import pandas as pd       # Tabular data manipulation
        import numpy as np        # Numerical operations
        import scipy as sp        # Mathematical and statistical calculations

        # 📈 Visualization
        import matplotlib.pyplot as plt  # Basic plots
        import seaborn as sns     # Advanced statistical plots
        import plotly.graph_objects as go  # Interactive plots

        # 🤖 Machine Learning
        import sklearn
        from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
        import xgboost as xgb     # Gradient Boosting
        import lightgbm as lgb    # Efficient Gradient Boosting
        import catboost as cb     # Optimized Gradient Boosting

        # 🧠 Deep Learning
        import tensorflow as tf   # Neural networks
        from tensorflow import keras
        import torch              # Deep Learning framework
        import transformers       # Binds the declared global 'transformers'
        from transformers import pipeline  # NLP models (GPT, BERT)

        # 🔍 Model Optimization
        import optuna             # Hyperparameter optimization

        # 🛡️ Model Interpretability
        import shap              # Global interpretability
        import lime              # Local interpretability

        # ⚡ Parallel Computing
        import dask              # Parallel data processing
        import ray               # Distributed ML models and tasks

        # 🏗️ Data Processing
        import imblearn          # Handling imbalanced data (SMOTE, ADASYN)
        import feature_engine    # Advanced feature transformations

        # 📊 Statistical Analysis
        import statsmodels.api as sm  # Statistical models

        # ⚙️ Basic Configurations
        sns.set(style="whitegrid")
        plt.style.use('ggplot')
        pd.options.display.float_format = '{:.2f}'.format  # Display floats with two decimal places

        print("✅ All libraries for Machine Learning have been successfully imported.")

    except ImportError as e:
        print(f"❌ Error importing libraries: {e}")
        print("🔄 Make sure you have executed the library installation function first.")

# Run the imports only when this code is executed directly (its __name__ is
# "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    import_ml_libraries()

# Usage (when saved as a script): python import_ml.py

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_categorical_variables(df, method='auto', one_hot_threshold=10, preferences=None):
    """
    Encodes every object/category column of a DataFrame as numbers, using
    either One-Hot Encoding or Label Encoding, and reports each step on stdout.

    Parameters:
    - df (pd.DataFrame): Input data; it is copied, never modified in place.
    - method (str): Default strategy for columns without an explicit preference:
      'onehot', 'label', or 'auto' (choose per column by category count).
    - one_hot_threshold (int): In 'auto' mode, columns with at most this many
      distinct values are one-hot encoded; the rest are label encoded.
    - preferences (dict): Optional per-column override of the strategy.
      Example: {'column1': 'onehot', 'column2': 'label'}

    Returns:
    - pd.DataFrame: Copy of `df` with the categorical columns transformed.
      One-hot columns are appended at the end (first level dropped); a column
      whose strategy is not recognized is left unchanged.
    """
    encoder = LabelEncoder()
    result = df.copy()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    for name in categorical_columns:
        n_categories = df[name].nunique()

        # Resolve the strategy: explicit preference > fixed method > auto rule.
        if preferences and name in preferences:
            chosen = preferences[name]
        elif method != 'auto':
            chosen = method
        elif n_categories <= one_hot_threshold:
            chosen = 'onehot'
        else:
            chosen = 'label'

        if chosen == 'label':
            print(f"🔸 Applying Label Encoding to '{name}' (Categories: {n_categories})")
            result[name] = encoder.fit_transform(df[name])
            continue

        if chosen != 'onehot':
            print(f"⚠️ Unrecognized method for '{name}': {chosen}")
            continue

        print(f"🔹 Applying One-Hot Encoding to '{name}' (Categories: {n_categories})")
        # Replace the source column with its indicator columns.
        indicators = pd.get_dummies(df[name], prefix=name, drop_first=True)
        remaining = result.drop(name, axis=1)
        result = pd.concat([remaining, indicators], axis=1)

    print("\n✅ Transformation completed.")
    return result


# Build a small sample DataFrame with three categorical columns
df = pd.DataFrame(
    {
        'Color': ['Red', 'Blue', 'Green', 'Red'],
        'Size': ['Large', 'Small', 'Medium', 'Large'],
        'Category': ['A', 'B', 'C', 'D'],
    }
)

# Automatic mode: with a threshold of 2, every column here has more distinct
# values than the threshold, so all of them are label encoded
print("🔄 Automatic Mode:")
df_auto = encode_categorical_variables(df, one_hot_threshold=2)

# Manual preferences mode: explicit per-column choices; columns without a
# preference fall back to the automatic rule with the default threshold
print("\n🔄 Manual Preferences Mode:")
preferences = dict(Color='onehot', Size='label')
df_manual = encode_categorical_variables(df, preferences=preferences)

# Show both results, each under its own heading
print("\n📊 Automatic Mode Result:", df_auto, sep="\n")

print("\n📊 Manual Preferences Mode Result:", df_manual, sep="\n")
