<a href="https://colab.research.google.com/github/Hoodup/tt/blob/main/wallmart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries used below.
# Confirm that a blanket filter is really intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the Walmart sales CSV into the working DataFrame.
# NOTE(review): the path is Colab-specific ('/content/...'); adjust it when
# running the notebook anywhere else.
DATA_PATH = '/content/Walmart.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Data preprocessing
# Parse the day-month-year strings once, then derive the calendar features
# from the parsed values.
sale_dates = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['Date'] = sale_dates
df['Year'] = sale_dates.dt.year
df['Month'] = sale_dates.dt.month

In [None]:
# Drop the raw 'Date' column; Year and Month were derived from it earlier.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after 'Date' is already gone no
# longer raises a KeyError.
df = df.drop(columns=['Date'], errors='ignore')

In [None]:
# Preview the first five rows after preprocessing.
df.head(n=5)

In [None]:
!pip install ydata_profiling

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build a profiling report titled 'Walmart' from the DataFrame and render it
# inline in the notebook.
Profile = ProfileReport(df, title = 'Walmart')
Profile.to_notebook_iframe()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Box plot of weekly sales to eyeball extreme values.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df['Weekly_Sales'], ax=ax)
ax.set_title('Outlier Visualization: Weekly Sales')
plt.show()

In [None]:
# Outlier handling using IQR
# The fences sit 1.5 * IQR below the first quartile and above the third one.
weekly_sales = df['Weekly_Sales']
Q1, Q3 = weekly_sales.quantile(0.25), weekly_sales.quantile(0.75)
IQR = Q3 - Q1
fence = 1.5 * IQR
lower_bound, upper_bound = Q1 - fence, Q3 + fence

In [None]:
# Removing outliers
# Keep only the rows whose weekly sales lie within the fences (both ends inclusive).
within_fences = df['Weekly_Sales'].between(lower_bound, upper_bound)
df = df[within_fences]

In [None]:
# Correlation heatmap
# Pairwise correlations of all columns, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Visualization: Weekly Sales by Year
# One box per calendar year to compare the sales distributions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Year', y='Weekly_Sales', ax=ax)
ax.set_title('Weekly Sales Distribution by Year')
plt.show()

In [None]:
# SALES DISTRIBUTION ACROSS STORES

# Total sales per store, sorted ascending so the chart reads from the
# lowest-selling to the highest-selling store.
store_sales = df.groupby('Store')['Weekly_Sales'].sum().sort_values()

# Plot the bar chart.
# seaborn arranges numeric categories by their value, so without an explicit
# `order` the bars come out in Store-ID order and the sort above is lost.
plt.figure(figsize=(14, 6))
sns.barplot(
    x=store_sales.index,
    y=store_sales.values,
    order=store_sales.index.tolist(),
    palette='viridis',
)

# Labels and title
plt.xlabel('Store ID')
plt.ylabel('Total Weekly Sales ($)')
plt.title('Total Weekly Sales per Store')
plt.xticks(rotation=90)  # Rotate store labels for readability
plt.show()

In [None]:
# Visualization: Impact of Holiday on Sales
# Side-by-side boxes, one per value of Holiday_Flag.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Holiday_Flag', y='Weekly_Sales', ax=ax)
ax.set_title('Weekly Sales on Holiday vs. Non-Holiday Weeks')
plt.show()

In [None]:
# Time-Series Sales Trend

# Sum the sales for every (Year, Month) pair and draw them as one line.
monthly_sales = df.groupby(['Year', 'Month'])['Weekly_Sales'].sum()
ax = monthly_sales.plot(figsize=(12, 5))
ax.set_xlabel('Year-Month')
ax.set_ylabel('Total Weekly Sales')
ax.set_title('Walmart Sales Trend Over Time')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
# Predictor columns are listed explicitly; the target is the weekly sales figure.
feature_cols = [
    'Store',
    'Holiday_Flag',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Year',
    'Month',
]
X = df[feature_cols]
y = df['Weekly_Sales']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Model training
# Fit a seeded 100-tree random forest on the training split.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

In [None]:
# Predictions
# Run the fitted random forest on the held-out test features.
y_pred_rf = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluation
# Apply each metric to the same (truth, prediction) pair in one pass.
mae_rf, mse_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the three evaluation metrics, one per line, rounded to two decimals.
print(
    "Random Forest Regressor Performance:",
    f'MAE: {mae_rf:.2f}',
    f'MSE: {mse_rf:.2f}',
    f'R-squared: {r2_rf:.2f}',
    sep='\n',
)

In [None]:
# Training Score vs Test Score

# Score the model on both splits first, then print the comparison.
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)

print("Train Score vs Test Score")
print(f"Train Score: {train_score:.2f}")
print(f"Test Score: {test_score:.2f}")

In [None]:
# Feature importance of the random forest fitted in the model-training cell.
# The fitted `rf_model` is reused instead of being trained a second time: the
# earlier cell already builds an identical estimator (same hyper-parameters,
# same seed, same training data), so re-fitting only cost time.
feature_importance = rf_model.feature_importances_

# Create a DataFrame for visualization, most important feature first
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance as horizontal bars
plt.figure(figsize=(10, 6))
sns.barplot(x=importance_df['Importance'], y=importance_df['Feature'], palette='viridis')
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title('Feature Importance Analysis')
plt.show()
