In [None]:
import pandas as pd

# Load the raw training data. The path is relative to the notebook's
# working directory.
store_sales_initial = pd.read_csv('data/train.csv')

# Summarize data
print("First few rows:")
print(store_sales_initial.head())

print("\nDataFrame information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
store_sales_initial.info()

print("\nDescriptive statistics for numerical columns:")
print(store_sales_initial.describe())

# Check for nulls in each column and sum them
print("\nNulls in each column:")
print(store_sales_initial.isnull().sum())

# Parse the 'date' column in place, once. errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising, so malformed dates can
# be counted below. (Writing the result to a differently named column would
# leave 'date' as plain strings and add an unintended extra column.)
store_sales_initial['date'] = pd.to_datetime(store_sales_initial['date'], errors='coerce')
print("\nStore sales initial data types:")
print(store_sales_initial.dtypes) # check the data types again

# NaT entries here are dates that were either missing in the CSV or could
# not be parsed; compare with the null count printed above to tell them apart.
print("\nInvalid date entries:")
print(store_sales_initial['date'].isna().sum())

First few rows:
   id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0

DataFrame information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
None

Descriptive statistics for numerical columns:
                 id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+06  2.750000e+01 

In [12]:
# Distribution of 'sales' within each product 'family' (describe() output),
# ordered from the lowest to the highest mean.
family_sales = (
    store_sales_initial
    .groupby('family')['sales']
    .describe()
    .sort_values('mean')
)
print("\nSales grouped by 'family':", family_sales, sep="\n")

# Number of non-null entries in every column, per 'family'.
family_counts = store_sales_initial.groupby('family').count()
print("\nCounts per 'family':", family_counts, sep="\n")



Sales grouped by 'family':
                              count         mean          std  min  \
family                                                               
BOOKS                       90936.0     0.070797     0.547981  0.0   
BABY CARE                   90936.0     0.110528     0.681064  0.0   
HOME APPLIANCES             90936.0     0.457476     0.969551  0.0   
HARDWARE                    90936.0     1.137833     1.638040  0.0   
MAGAZINES                   90936.0     2.929082     6.008492  0.0   
SCHOOL AND OFFICE SUPPLIES  90936.0     2.961599    21.745937  0.0   
BEAUTY                      90936.0     3.715723     4.794668  0.0   
PET SUPPLIES                90936.0     3.921263     7.657852  0.0   
LAWN AND GARDEN             90936.0     6.035475    12.283458  0.0   
AUTOMOTIVE                  90936.0     6.101236     6.191691  0.0   
PLAYERS AND ELECTRONICS     90936.0     6.186857    10.544497  0.0   
LADIESWEAR                  90936.0     7.160629    13.946302 

In [13]:
# Distribution of 'sales' within each 'store_nbr' (describe() output),
# ordered from the lowest to the highest mean.
store_nbr_sales = (
    store_sales_initial
    .groupby('store_nbr')['sales']
    .describe()
    .sort_values('mean')
)
print("\nSales grouped by 'store_nbr':", store_nbr_sales, sep="\n")

# Number of non-null entries in every column, per 'store_nbr'.
store_nbr_counts = store_sales_initial.groupby('store_nbr').count()
print("\nCounts per 'store_nbr':", store_nbr_counts, sep="\n")


Sales grouped by 'store_nbr':
             count         mean          std  min  25%      50%         75%  \
store_nbr                                                                     
52         55572.0    48.516694   449.115636  0.0  0.0   0.0000    0.000000   
22         55572.0    73.601845   316.225353  0.0  0.0   0.0000    4.000000   
32         55572.0   107.100626   334.004364  0.0  0.0   3.0000   80.000000   
30         55572.0   132.838006   347.580092  0.0  0.0   5.0000   98.000000   
35         55572.0   138.139340   470.106439  0.0  0.0   3.0000   87.000000   
26         55572.0   139.550887   420.371399  0.0  0.0   5.0000  120.000000   
42         55572.0   160.976173   583.518136  0.0  0.0   0.0000   16.000000   
21         55572.0   166.549808   651.285834  0.0  0.0   0.0000   12.000000   
10         55572.0   172.999096   477.274147  0.0  0.0   5.0000  141.033500   
29         55572.0   175.001038   607.751226  0.0  0.0   0.0000   20.000000   
13         55572.0   

In [17]:
# Average 'onpromotion' value within each product 'family', smallest first.
family_promotion = (
    store_sales_initial
    .groupby('family')['onpromotion']
    .mean()
    .sort_values()
)
print("\nMean 'onpromotion' per 'family'", family_promotion, sep="\n")


Mean 'onpromotion' per 'family'
family
BOOKS                          0.000000
BABY CARE                      0.000583
HOME APPLIANCES                0.000638
HARDWARE                       0.001792
MAGAZINES                      0.003266
LADIESWEAR                     0.018475
PLAYERS AND ELECTRONICS        0.020608
LINGERIE                       0.030956
PET SUPPLIES                   0.035739
AUTOMOTIVE                     0.052597
CELEBRATION                    0.073931
GROCERY II                     0.080793
LAWN AND GARDEN                0.123252
BEAUTY                         0.148269
SCHOOL AND OFFICE SUPPLIES     0.160168
HOME AND KITCHEN II            0.360847
PREPARED FOODS                 0.412642
SEAFOOD                        0.429445
LIQUOR,WINE,BEER               0.513548
HOME AND KITCHEN I             0.515978
FROZEN FOODS                   1.381565
EGGS                           2.143508
HOME CARE                      2.168888
POULTRY                        2.489894
