## 1. Load and analyse data

In [3]:
from pathlib import Path

import pandas as pd

# Load the raw transactions table.
# NOTE(review): article_id is inferred as int64 (see the info() output below);
# if the IDs in the CSV are zero-padded, the padding is lost on read — confirm,
# and pass dtype={'article_id': str} if that matters downstream.
file_path = '../data/raw/transactions_train.csv'
data = pd.read_csv(file_path)

# Inspect the structure. DataFrame.info() writes its report to stdout itself
# and returns None, so it is called directly rather than wrapped in print(),
# which would append a stray "None" line to the output.
data.info()

# Leave head() as the cell's last expression so the notebook renders it as a
# table instead of line-wrapped plain text.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB
None
        t_dat                                        customer_id  article_id  \
0  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  
0  0.050831                 2  
1  0.030492                 2  
2  0.015237              

## 2. Check data quality
### 2.1. Check for missing values

In [4]:
# Count the NaN/None entries in each column (isna is the canonical name of isnull)
missing_values = data.isna().sum()
print(missing_values)

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64


The dataset doesn't contain missing values. 

### 2.2. Check for duplicate rows
Check whether we have duplicate rows


In [6]:
# Count the rows that repeat an earlier row in every column
# (the first occurrence of each group is not counted)
duplicates = data.duplicated(keep='first').sum()
print(f'Number of duplicates: {duplicates}')

Number of duplicates: 2974905


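We have 2,974,905 duplicate rows (about 9.4% of the data). We drop them because they may affect the quality of the model. Note that the table has no transaction ID or quantity column, so identical rows could also be genuine repeat purchases of the same item; here we treat them as duplicates.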

In [8]:
# Build a de-duplicated copy, keeping the first occurrence of each fully
# identical row and preserving the original row index; `data` is left untouched.
# NOTE(review): the table has no transaction-ID or quantity column, so identical
# rows may be genuine repeat purchases rather than errors — confirm that
# dropping them is intended.
data_cleaned = data.drop_duplicates(keep='first', ignore_index=False)

### 2.3. Estimating the distribution of values

Let's look at the basic statistics for the numeric columns and count the unique customers and articles:

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): article_id and sales_channel_id are integer-typed, so they are
# summarised as well; if they are identifiers, their mean/std carry no meaning.
numeric_summary = data_cleaned.describe()
print(numeric_summary)

# Cardinality of the customer and article key columns
customer_count = data_cleaned['customer_id'].nunique()
article_count = data_cleaned['article_id'].nunique()
print(f"Unique customer_id: {customer_count}")
print(f"Unique article_id: {article_count}")

         article_id         price  sales_channel_id
count  2.881342e+07  2.881342e+07      2.881342e+07
mean   6.971462e+08  2.772868e-02      1.683251e+00
std    1.321538e+08  1.910103e-02      4.652085e-01
min    1.087750e+08  1.694915e-05      1.000000e+00
25%    6.331520e+08  1.567797e-02      1.000000e+00
50%    7.147900e+08  2.540678e-02      2.000000e+00
75%    7.871530e+08  3.388136e-02      2.000000e+00
max    9.562170e+08  5.915254e-01      2.000000e+00
Unique customer_id: 1362281
Unique article_id: 104547


## 3. Filtering 50,000 unique customer_ids:

Let's create a sample containing all transactions of 50,000 randomly selected unique customers.

In [15]:
# Sampling parameters, named so they are easy to find and tune
N_SAMPLE_CUSTOMERS = 50000
RANDOM_STATE = 42
sample_path = Path('../data/processed/sample_transactions.csv')

# Draw a reproducible random sample of distinct customer IDs
unique_customers = (
    data_cleaned['customer_id']
    .drop_duplicates()
    .sample(N_SAMPLE_CUSTOMERS, random_state=RANDOM_STATE)
)

# Keep every row of the de-duplicated dataset that belongs to a sampled customer
sample_data = data_cleaned[data_cleaned['customer_id'].isin(unique_customers)]

# Sanity check: every sampled customer comes from data_cleaned, so all of them
# must be present in the filtered frame
assert sample_data['customer_id'].nunique() == N_SAMPLE_CUSTOMERS, \
    "sample does not contain the expected number of customers"

# Checking sample information. DataFrame.info() prints its report itself and
# returns None, so wrapping it in print() would add a stray "None" line.
sample_data.info()

# Save sample dataset. to_csv does not create missing parent directories,
# so make sure the output folder exists first.
sample_path.parent.mkdir(parents=True, exist_ok=True)
sample_data.to_csv(sample_path, index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 1060255 entries, 97 to 31788221
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   t_dat             1060255 non-null  object 
 1   customer_id       1060255 non-null  object 
 2   article_id        1060255 non-null  int64  
 3   price             1060255 non-null  float64
 4   sales_channel_id  1060255 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 48.5+ MB
None
