In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart/Walmart.csv


In [2]:
# Read the Walmart CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/walmart/Walmart.csv")

In [None]:
def modeling():
    """Fit a fixed set of regressors on one train/test split and compare them.

    The function takes no arguments; it reads four names from the enclosing
    notebook namespace, none of which is defined in the visible part of this
    notebook, so they must exist before the call:

    * ``X``, ``y`` - feature table and target passed to ``train_test_split``.
    * ``nominal_features`` - column selection that is one-hot encoded.
    * ``ordinal_features`` - column selection that is ordinal encoded.

    NOTE(review): the ordinal category order below is hard-coded; confirm it
    matches the values actually present in ``ordinal_features``.

    Steps: 80/20 split (seed 42) -> column encoding -> standard scaling ->
    fit/predict each model -> collect MAE, MSE, R² and wall-clock timings ->
    display the result table and an R² bar chart.

    Returns
    -------
    None
        The result table and the chart are displayed, not returned.
    """
    import time

    import matplotlib.pyplot as plt
    import seaborn as sns
    from IPython.display import display
    from sklearn.compose import ColumnTransformer
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

    # ML models
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from xgboost import XGBRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.svm import SVR
    from sklearn.neural_network import MLPRegressor

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    size_order = [['Budget', 'Mid-Range', 'Upper Mid-Range', 'Luxury', 'Ultra Luxury']]

    # Create column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            # 'ignore' encodes categories that only occur in the test split as
            # all-zero columns instead of raising during transform().
            ('onehot', OneHotEncoder(handle_unknown='ignore'), nominal_features),
            ('ordinal', OrdinalEncoder(categories=size_order), ordinal_features),
        ],
        remainder='passthrough',
        # Always return a dense array: the one-hot block is sparse by default
        # and StandardScaler (which centres the data) rejects sparse input.
        sparse_threshold=0,
    )

    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)

    # The scaler is fitted on the training split only and reused for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_encoded)
    X_test_scaled = scaler.transform(X_test_encoded)

    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(max_depth=10, min_samples_split=4, random_state=42),
        "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=5, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42),
        "KNN": KNeighborsRegressor(n_neighbors=10, weights="distance", metric="minkowski"),
        "SVR": SVR(kernel='rbf', C=100, epsilon=0.1, gamma='scale'),
        "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42),
    }

    results = []

    for name, model in models.items():
        print(f"Training {name}...")

        # Training (perf_counter is the monotonic clock meant for measuring durations)
        fit_started = time.perf_counter()
        model.fit(X_train_scaled, y_train)
        train_time = time.perf_counter() - fit_started

        # Prediction
        predict_started = time.perf_counter()
        y_pred = model.predict(X_test_scaled)
        predict_time = time.perf_counter() - predict_started

        # Performance metrics on the held-out split
        results.append({
            "Model": name,
            "MAE": mean_absolute_error(y_test, y_pred),
            "MSE": mean_squared_error(y_test, y_pred),
            "R² Score": r2_score(y_test, y_pred),
            "Training Time (sec)": train_time,
            "Prediction Time (sec)": predict_time,
        })

    results_df = pd.DataFrame(
        results,
        columns=["Model", "MAE", "MSE", "R² Score", "Training Time (sec)", "Prediction Time (sec)"],
    )
    display(results_df)

    plt.figure(figsize=(12, 6))
    sns.barplot(data=results_df, x="Model", y="R² Score", palette="viridis")
    plt.xticks(rotation=45)
    plt.title("Optimized Model Comparison - R² Score")
    plt.show()

    

In [None]:
def early_stage():
    """Display a first-look summary of the module-level DataFrame ``df``.

    Shows, in order: the frame itself, the ``info()`` report, the column
    dtypes, the number of missing values per column and the number of rows
    that are exact duplicates of an earlier row.

    Returns
    -------
    None
    """
    from IPython.display import display

    display(df)
    # info() prints its report itself and returns None, so wrapping it in
    # display() would only add a stray "None" to the output.
    df.info()
    display(df.dtypes)
    display(df.isnull().sum())
    # DataFrame has no isduplicate(); duplicated() flags repeated rows and the
    # sum gives their count, mirroring the null summary above.
    display(df.duplicated().sum())

def EDA(num_cols, cat_cols, df):
    """Draw exploratory plots for the given numeric and categorical columns.

    Parameters
    ----------
    num_cols : iterable of str or None
        Names of the numeric columns to plot. ``None`` is treated as empty.
    cat_cols : iterable of str or None
        Names of the categorical columns to plot. ``None`` is treated as empty.
    df : DataFrame
        Table holding the columns; it is indexed by column name and passed to
        seaborn as ``data=``.

    Plots produced, in order
    ------------------------
    1. Count plot per categorical column, then a hue-split count plot for
       every pair of categorical columns.
    2. Histogram (with KDE) and box plot per numeric column, then a scatter
       plot for every pair of numeric columns.
    3. For every (categorical, numeric) combination: per-category KDE of the
       numeric column and a bar plot of its mean with standard-deviation bars.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``. When both column lists are
        empty the function returns immediately without plotting.
    """
    import itertools
    import math

    # Work on plain lists so None, tuples and pandas Index objects all have an
    # unambiguous truth value below.
    num_cols = [] if num_cols is None else list(num_cols)
    cat_cols = [] if cat_cols is None else list(cat_cols)
    if not (num_cols or cat_cols):
        return

    # Plotting libraries are imported only once there is something to draw.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Subplots per grid row. Deliberately NOT called num_cols: reusing that
    # name would overwrite the argument and break the numeric sections.
    per_row = 2

    def _open_grid(n_plots, cell_width, cell_height):
        """Start a figure sized for ``n_plots`` grid cells; return the row count."""
        n_rows = math.ceil(n_plots / per_row)
        plt.figure(figsize=(per_row * cell_width, n_rows * cell_height))
        return n_rows

    # ---- Categorical columns -------------------------------------------
    if cat_cols:
        n_rows = _open_grid(len(cat_cols), 5, 4)
        for position, col in enumerate(cat_cols, start=1):
            plt.subplot(n_rows, per_row, position)
            sns.countplot(data=df, x=col, order=df[col].value_counts().index)
            plt.title(col)
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        cat_pairs = list(itertools.combinations(cat_cols, 2))
        # With a single column there are no pairs; skipping avoids creating a
        # zero-height figure, which matplotlib rejects.
        if cat_pairs:
            n_rows = _open_grid(len(cat_pairs), 6, 5)
            for position, (col1, col2) in enumerate(cat_pairs, start=1):
                plt.subplot(n_rows, per_row, position)
                sns.countplot(data=df, x=col1, hue=col2)
                plt.title(f"{col1} vs {col2}")
                plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

    # ---- Numeric columns -----------------------------------------------
    if num_cols:
        for col in num_cols:
            fig, axs = plt.subplots(1, 2, figsize=(12, 4))

            # Title for the whole row
            fig.suptitle(col, fontsize=14, fontweight='bold', y=1.05)

            # Histogram
            sns.histplot(df[col], kde=True, bins=30, ax=axs[0])
            axs[0].set_title('Histogram')
            axs[0].set_xlabel(col)
            axs[0].set_ylabel('Frequency')

            # Boxplot
            sns.boxplot(y=df[col], ax=axs[1])
            axs[1].set_title('Boxplot')
            axs[1].set_ylabel(col)

            plt.tight_layout()
            plt.show()

        num_pairs = list(itertools.combinations(num_cols, 2))
        if num_pairs:
            n_rows = _open_grid(len(num_pairs), 6, 5)
            for position, (col1, col2) in enumerate(num_pairs, start=1):
                plt.subplot(n_rows, per_row, position)
                sns.scatterplot(data=df, x=col1, y=col2)
                plt.title(f"{col1} vs {col2}")
            plt.tight_layout()
            plt.show()

    # ---- Categorical vs numeric ----------------------------------------
    # product() is empty when either list is empty, so no extra guard is needed.
    for cat_col, num_col in itertools.product(cat_cols, num_cols):
        plt.figure(figsize=(12, 5))

        # Title for this pair
        plt.suptitle(f'{num_col} vs {cat_col}', fontsize=14, fontweight='bold')

        # --- Left: one density curve per category ---
        plt.subplot(1, 2, 1)
        sns.kdeplot(x=df[num_col], hue=df[cat_col], common_norm=False)
        plt.title(f'Distribution of {num_col} by {cat_col}')

        # --- Right: mean of num_col per category, bars show the std dev ---
        plt.subplot(1, 2, 2)
        # errorbar='sd' is the current spelling of the deprecated ci='sd'.
        sns.barplot(data=df, x=cat_col, y=num_col, estimator='mean', errorbar='sd')
        plt.title(f'Mean {num_col} by {cat_col}')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

In [None]:
param_grids = {

    "LinearRegression": {
        "fit_intercept": [True, False],
        "positive": [True, False]
    },

    "Ridge": {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
        "solver": ["auto", "svd", "cholesky", "lsqr"]
    },

    "Lasso": {
        "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
        "max_iter": [1000, 5000, 10000]
    },

    "ElasticNet": {
        "alpha": [0.001, 0.01, 0.1, 1, 10],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
        "max_iter": [1000, 5000]
    },

    "DecisionTreeRegressor": {
        "criterion": ["squared_error", "friedman_mse", "absolute_error"],
        "max_depth": [None, 5, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
        "max_features": [None, "sqrt", "log2"]
    },

    "RandomForestRegressor": {
        "n_estimators": [100, 300, 500],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2"],
        "bootstrap": [True, False]
    },

    "GradientBoostingRegressor": {
        "n_estimators": [100, 300, 500],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7],
        "subsample": [0.6, 0.8, 1.0],
        "min_samples_split": [2, 5, 10]
    },

    "XGBRegressor": {
        "n_estimators": [200, 500, 800],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7, 10],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "reg_alpha": [0, 0.1, 1],     # L1 regularization
        "reg_lambda": [1, 5, 10]     # L2 regularization
    },

    "KNeighborsRegressor": {
        "n_neighbors": [3, 5, 7, 9, 15],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"]
    },

    "SVR": {
        "kernel": ["linear", "rbf", "poly"],
        "C": [0.1, 1, 10, 100],
        "epsilon": [0.01, 0.1, 0.2],
        "gamma": ["scale", "auto"]
    },

    "MLPRegressor": {
        "hidden_layer_sizes": [(50,), (100,), (50,50), (100,50)],
        "activation": ["relu", "tanh"],
        "solver": ["adam", "lbfgs"],
        "alpha": [0.0001, 0.001, 0.01],   # L2 regularization
        "learning_rate": ["constant", "adaptive"],
        "max_iter": [500, 1000]
    }
}
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


# Registry of untuned estimators, keyed by class name.
# Estimators that accept a seed are pinned to random_state=42.
MODELS = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    ElasticNet=ElasticNet(),
    DecisionTreeRegressor=DecisionTreeRegressor(random_state=42),
    RandomForestRegressor=RandomForestRegressor(random_state=42),
    GradientBoostingRegressor=GradientBoostingRegressor(random_state=42),
    XGBRegressor=XGBRegressor(objective="reg:squarederror", eval_metric="rmse", random_state=42),
    KNeighborsRegressor=KNeighborsRegressor(),
    SVR=SVR(),
    MLPRegressor=MLPRegressor(random_state=42),
)

from sklearn.model_selection import GridSearchCV

def run_grid_search(
    model_name,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1
):
    """Tune one registered regressor with an exhaustive cross-validated grid search.

    The estimator and its search space are looked up by name in the
    module-level ``MODELS`` and ``param_grids`` dictionaries. The function
    previously carried its own private copies of both; they are no longer
    duplicated here, so there is a single place to edit them.

    Parameters
    ----------
    model_name : str
        Key into ``MODELS`` / ``param_grids`` (e.g. ``"Ridge"``).
    X_train, y_train
        Training features and target, passed unchanged to ``GridSearchCV.fit``.
    cv : int, default 5
        Forwarded to ``GridSearchCV(cv=...)``.
    scoring : str, default "neg_mean_squared_error"
        Forwarded to ``GridSearchCV(scoring=...)``.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV(n_jobs=...)``.
    verbose : int, default 1
        Forwarded to ``GridSearchCV(verbose=...)``.

    Returns
    -------
    dict
        ``{"best_model": ..., "best_params": ..., "best_score": ...}`` taken
        from the fitted search's ``best_estimator_``, ``best_params_`` and
        ``best_score_``.

    Raises
    ------
    ValueError
        If ``model_name`` has no estimator or no parameter grid registered.
    """
    # Local import keeps the function usable even if the import line that
    # precedes it in the notebook cell has not been executed.
    from sklearn.model_selection import GridSearchCV

    # Validate against both registries so a name present in only one of them
    # fails with the same clear error instead of a bare KeyError.
    if model_name not in MODELS or model_name not in param_grids:
        raise ValueError(f"Model '{model_name}' not supported")

    # GridSearchCV fits clones of the estimator it is given, so the shared
    # instance stored in MODELS stays unfitted across calls.
    search = GridSearchCV(
        estimator=MODELS[model_name],
        param_grid=param_grids[model_name],
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=verbose
    )
    search.fit(X_train, y_train)

    return {
        "best_model": search.best_estimator_,
        "best_params": search.best_params_,
        "best_score": search.best_score_
    }
