In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# Data loading
# -----------------------------
# Raw string literal: the Windows path contains backslash sequences (\P, \D, \c)
# that are not valid escapes. In a normal string they trigger an
# invalid-escape-sequence warning and are slated to become a syntax error.
# The resulting path value is unchanged.
df = pd.read_csv(r"E:\Project_2\DATA\cleaned_superstore_train.csv")

# -----------------------------
# Basic dataset information
# -----------------------------
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit an extra "None" line).
df.info()

# -----------------------------
# Descriptive statistics for numerical features
# -----------------------------
print(df.describe())

# -----------------------------
# Duplicate rows detection
# -----------------------------
# duplicated() flags every row that repeats an earlier one; the sum is the count.
duplicates = df.duplicated().sum()
print(f'Duplicated rows: {duplicates}')

# -----------------------------
# Missing values check
# -----------------------------
# isnull().sum() yields one count per column (a Series), not a row count.
# Print it on its own lines so the first column is not glued to the label.
missing = df.isnull().sum()
print('Missing values per column:')
print(missing)

# =============================
# Exploratory Data Analysis (EDA)
# =============================

# -----------------------------
# Distribution analysis for numerical columns
# -----------------------------
def _show_distribution(values, label):
    """Display a 6x4-inch histogram with a KDE curve for one column.

    values: the column data passed to seaborn's histplot.
    label: the column name, used to build the chart title.
    """
    plt.figure(figsize=(6, 4))
    sns.histplot(values, kde=True)
    plt.title(f'{label} distribution')
    plt.show()


# Columns whose dtype is int64 or float64 are the ones treated as numeric.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One separate figure per numeric column.
for col in numeric_cols:
    _show_distribution(df[col], col)

# -----------------------------
# Frequency analysis for categorical columns
# -----------------------------
categorical_cols = df.select_dtypes(include=['object']).columns

# Upper bound on bars per chart. Object columns with very many distinct values
# would otherwise yield one bar per value, which is unreadable and slow to draw.
MAX_CATEGORIES = 20

for col in categorical_cols:
    # value_counts() sorts levels from most to least frequent.
    level_counts = df[col].value_counts()
    shown_levels = level_counts.index[:MAX_CATEGORIES]

    # Flag in the title when the chart shows only a subset of the levels.
    title = f'{col} counts'
    if len(level_counts) > MAX_CATEGORIES:
        title += f' (top {MAX_CATEGORIES})'

    plt.figure(figsize=(6, 4))
    # `order` both restricts the bars to the selected levels and fixes their order.
    sns.countplot(data=df, x=col, order=shown_levels)
    plt.title(title)
    # Vertical tick labels keep long category names from overlapping.
    plt.xticks(rotation=90)
    plt.show()

# -----------------------------
# Correlation analysis between numerical features
# -----------------------------
plt.figure(figsize=(8, 6))

# Pairwise correlations between the columns listed in numeric_cols.
corr_matrix = df[numeric_cols].corr()

# annot=True writes each coefficient inside its cell.
sns.heatmap(
    data=corr_matrix,
    cmap='coolwarm',
    annot=True,
)
plt.title('Correlation Heatmap')
plt.show()

# =============================
# Key Performance Indicators (KPI)
# =============================

# -----------------------------
# Total sales calculation
# -----------------------------
total_sales = df['sales'].sum()

# -----------------------------
# Average sales per customer
# -----------------------------
# Sum sales within each customer_id first, then average those per-customer totals.
avg_sales_per_customer = df.groupby('customer_id')['sales'].sum().mean()

# -----------------------------
# Top 10 products by total sales
# -----------------------------
top_products = (
    df.groupby('product_name')['sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

print(f"Total Sales: {total_sales}")
print(f"Avg Sales per Customer: {avg_sales_per_customer}")
# The top-10 ranking was previously computed but never displayed.
print("Top 10 Products by Sales:")
print(top_products)
