# Walmart Sales Vorhersage

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import plotly.graph_objects as go

## 1. Einlesen & erste Sichtung

In [2]:
# Load the raw weekly sales table; the path is relative to the notebook's working directory.
walmart_sales = pd.read_csv("data/Walmart_Sales.csv")
# Bare last expression so Jupyter renders the frame as the cell output.
walmart_sales

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [5]:
# Report the table dimensions as a (rows, columns) tuple.
frame_shape = walmart_sales.shape
print("Shape (Zeilen, Spalten):", frame_shape)

Shape (Zeilen, Spalten): (6435, 8)


In [7]:
# List the dtype of every column under a heading (blank line first, then one dtype per row).
column_dtypes = walmart_sales.dtypes
print("\nDatentypen:", column_dtypes, sep="\n")



Datentypen:
Store             int64
Date             object
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object


In [8]:
# Count the missing entries in each column and print them under a heading.
missing_per_column = walmart_sales.isna().sum()
print("\nFehlende Werte pro Spalte:", missing_per_column, sep="\n")


Fehlende Werte pro Spalte:
Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64


In [3]:
# Target series first, then every remaining column as the feature frame.
# Both names are assigned again further down, right before train_test_split.
y = walmart_sales["Weekly_Sales"]

X = walmart_sales.drop(columns=["Weekly_Sales"])

## 2. Zeitspalte parsen

In [12]:
# Parse the day-first date strings (e.g. "05-02-2010" is 5 February 2010).
# An explicit format is strict: a value that does not match dd-mm-YYYY raises
# instead of being interpreted by format inference. Re-running the cell on the
# already converted datetime column leaves it unchanged.
walmart_sales["Date"] = pd.to_datetime(walmart_sales["Date"], format="%d-%m-%Y")

# Calendar features derived from the parsed date.
walmart_sales["Year"] = walmart_sales["Date"].dt.year
walmart_sales["Month"] = walmart_sales["Date"].dt.month
# isocalendar() returns pandas' nullable UInt32 extension dtype; cast to a plain
# NumPy integer so this column matches the other calendar features and converts
# cleanly when the frame is handed to NumPy-based estimators.
walmart_sales["WeekOfYear"] = walmart_sales["Date"].dt.isocalendar().week.astype("int32")
walmart_sales["Quarter"] = walmart_sales["Date"].dt.quarter

# Text preview of the first rows, including the new columns.
print(walmart_sales.head())

   Store       Date  Weekly_Sales  ...  Month  WeekOfYear  Quarter
0      1 2010-02-05    1643690.90  ...      2           5        1
1      1 2010-02-12    1641957.44  ...      2           6        1
2      1 2010-02-19    1611968.17  ...      2           7        1
3      1 2010-02-26    1409727.59  ...      2           8        1
4      1 2010-03-05    1554806.68  ...      3           9        1

[5 rows x 12 columns]


## 3. Deskriptive Statistik & Visuals

In [15]:
# Numeric columns that the summary statistics and the plots below iterate over.
num_cols = ["Weekly_Sales", "Temperature", "Fuel_Price", "CPI", "Unemployment"]

# describe() transposed, so each variable becomes one row of the summary.
summary_stats = walmart_sales[num_cols].describe().transpose()
print("\nDeskriptive Statistik:")
print(summary_stats)



Deskriptive Statistik:
               count          mean  ...           75%           max
Weekly_Sales  6435.0  1.046965e+06  ...  1.420159e+06  3.818686e+06
Temperature   6435.0  6.066378e+01  ...  7.494000e+01  1.001400e+02
Fuel_Price    6435.0  3.358607e+00  ...  3.735000e+00  4.468000e+00
CPI           6435.0  1.715784e+02  ...  2.127433e+02  2.272328e+02
Unemployment  6435.0  7.999151e+00  ...  8.622000e+00  1.431300e+01

[5 rows x 8 columns]


In [19]:
# Histograms: one separate figure per numeric variable.
for col_name in num_cols:
    histogram_trace = go.Histogram(
        x=walmart_sales[col_name],
        nbinsx=20,
        marker_color='royalblue',
    )
    histogram_layout = dict(
        title=f"Histogramm - {col_name}",
        xaxis_title=col_name,
        yaxis_title="Häufigkeit",
        bargap=0.05,
    )
    fig = go.Figure(data=histogram_trace)
    fig.update_layout(**histogram_layout)
    fig.show()

# Boxplots: again one figure per variable, shown after all histograms.
for col_name in num_cols:
    box_trace = go.Box(
        y=walmart_sales[col_name],
        name=col_name,
        boxmean='sd',  # also mark mean and standard deviation in the box
        marker_color='indianred',
    )
    box_layout = dict(
        title=f"Boxplot - {col_name}",
        yaxis_title=col_name,
    )
    fig1 = go.Figure(data=box_trace)
    fig1.update_layout(**box_layout)
    fig1.show()

## 4. Korrelationen & erste Hypothesen



In [21]:
# Pairwise Pearson correlations between the numeric columns.
numeric_frame = walmart_sales[num_cols]
corr_matrix = numeric_frame.corr(method="pearson")

In [22]:
# Heatmap of the correlation matrix; the colour range is pinned to [-1, 1].
axis_labels = corr_matrix.columns
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=axis_labels,
    y=axis_labels,
    colorscale="RdBu",
    zmin=-1,
    zmax=1,
    colorbar=dict(title="Korrelation"),
)

fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title="Korrelationsmatrix (Pearson)",
    xaxis_nticks=len(num_cols),  # request as many x ticks as there are variables
)
fig.show()

In [23]:
# Correlations with Weekly_Sales in descending order; the column's correlation
# with itself is part of the series as well.
sales_column = corr_matrix["Weekly_Sales"]
corr_with_sales = sales_column.sort_values(ascending=False)
print("\nKorrelationen mit Weekly_Sales:", corr_with_sales, sep="\n")


Korrelationen mit Weekly_Sales:
Weekly_Sales    1.000000
Fuel_Price      0.009464
Temperature    -0.063810
CPI            -0.072634
Unemployment   -0.106176
Name: Weekly_Sales, dtype: float64


## Aufgabe 2 – Baseline-Modelle

### Train/Test-Split

In [24]:
# Target series and feature frame (every column except the target).
# NOTE(review): X still contains the raw "Date" column here, and the rows are
# split at random although the data is weekly and time-ordered - confirm that
# both are intended for the models that follow.
y = walmart_sales["Weekly_Sales"]
X = walmart_sales.drop(columns=["Weekly_Sales"])

# Hold out 20 % of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Baseline mit DummyRegressor