In [2]:
import pandas as pd

# Load the order-payments CSV into a DataFrame.
payments_csv_path = "data/olist_order_payments_dataset.csv"
orders_payments_df = pd.read_csv(payments_csv_path)

# Preview the first ten rows as a quick sanity check of the load.
orders_payments_df.head(10)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
5,298fcdf1f73eb413e4d26d01b25bc1cd,1,credit_card,2,96.12
6,771ee386b001f06208a7419e4fc1bbd7,1,credit_card,1,81.16
7,3d7239c394a212faae122962df514ac7,1,credit_card,3,51.84
8,1f78449c87a54faf9e96e88ba1491fa9,1,credit_card,6,341.09
9,0573b5e23cbd798006520e1d5b4c6714,1,boleto,1,51.95


### Transformation

* Check the dataset's column data types and null counts.
* Result: no null values in any column.

In [3]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage.
orders_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


* Number of rows and columns

In [4]:
# (rows, columns) of the payments frame.
orders_payments_df.shape

(103886, 5)

* Name of columns

In [5]:
# Column labels of the payments frame.
orders_payments_df.columns

Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')

* Statistical values of the data frame

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
orders_payments_df.describe()

Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154.10038
std,0.706584,2.687051,217.494064
min,1.0,0.0,0.0
25%,1.0,1.0,56.79
50%,1.0,1.0,100.0
75%,1.0,4.0,171.8375
max,29.0,24.0,13664.08


* Ensure there are no NaN/null values

In [6]:
# Count missing entries per column; a zero for every column means there are no gaps.
orders_payments_df.isna().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

* Flag rows that are exact duplicates of an earlier row (all columns equal), then count the True and False flags.
* Outcome: every flag is False, so there are no fully duplicated rows.

In [13]:
# Flag each row that exactly repeats an earlier row, then tally the True/False flags.
duplicates = orders_payments_df.duplicated()
duplicate_flag_counts = duplicates.value_counts()
print(duplicate_flag_counts)


False    103886
Name: count, dtype: int64


* Checking the unique values of columns 'payment_sequential', 'payment_type' and 'payment_installments'

In [17]:
# Distinct payment_sequential values in ascending order: sorting the column first
# means unique() returns the values already ordered.
unique_payment_sequential_sorted = orders_payments_df['payment_sequential'].sort_values().unique()
print("Unique values for payment_sequential sorted:", unique_payment_sequential_sorted)

# Distinct payment_type labels, in order of first appearance.
unique_payment_type = orders_payments_df['payment_type'].unique()
print("Unique values for payment_type:", unique_payment_type)


# Distinct payment_installments values in ascending order (same approach as above).
unique_payment_installments_sorted = orders_payments_df['payment_installments'].sort_values().unique()
print("Unique values for payment_installments sorted:", unique_payment_installments_sorted)


Unique values for payment_sequential sorted: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29]
Unique values for payment_type: ['credit_card' 'boleto' 'voucher' 'debit_card' 'not_defined']
Unique values for payment_installments sorted: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 20 21 22 23 24]


#### Column :'payment_sequential'
* If the 'payment_sequential' values range between 1 and 29, it likely indicates that a customer has made multiple payments for an order, with each payment recorded in sequential order.
  
* This scenario could occur if a customer uses multiple payment methods or if payments are processed in installments. For example, if a customer initially pays a portion of the order amount using one payment method and then pays the remaining amount using another method, each of these payments would be assigned a sequential number to track their order.

* Understanding the sequential numbering helps in analyzing the payment behavior of customers and identifying any patterns or trends in how they choose to pay for their orders.

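* 95.64% of payment rows have `payment_sequential` = 1 and only 2.93% are a second payment, so the large majority of orders appear to be paid in a single payment.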





In [19]:
# Number of payment rows at each payment_sequential value (group sizes, ordered by the group key).
sequential_counts = orders_payments_df.groupby('payment_sequential').size()
print(sequential_counts)


payment_sequential
1     99360
2      3039
3       581
4       278
5       170
6       118
7        82
8        54
9        43
10       34
11       29
12       21
13       13
14       10
15        8
16        6
17        6
18        6
19        6
20        4
21        4
22        3
23        2
24        2
25        2
26        2
27        1
28        1
29        1
dtype: int64


In [25]:
# Number of payment rows at each payment_sequential value.
sequential_counts = orders_payments_df.groupby('payment_sequential').size()

# Denominator for the shares below. Note: this is the number of rows in the
# payments frame, not a count of distinct customers (the frame has no customer column).
total_customers = len(orders_payments_df)

# Share of all payment rows at each sequential value, in percent (computed once, vectorised).
percentage_customers = (sequential_counts / total_customers) * 100

# Print count and percentage side by side, reusing the vectorised result
# instead of recomputing the percentage inside the loop.
for sequential_value, count in sequential_counts.items():
    percentage = percentage_customers.loc[sequential_value]
    print(f"{sequential_value}, Total_count: {count} : {percentage:.2f}%")


1, Total_count: 99360 : 95.64%
2, Total_count: 3039 : 2.93%
3, Total_count: 581 : 0.56%
4, Total_count: 278 : 0.27%
5, Total_count: 170 : 0.16%
6, Total_count: 118 : 0.11%
7, Total_count: 82 : 0.08%
8, Total_count: 54 : 0.05%
9, Total_count: 43 : 0.04%
10, Total_count: 34 : 0.03%
11, Total_count: 29 : 0.03%
12, Total_count: 21 : 0.02%
13, Total_count: 13 : 0.01%
14, Total_count: 10 : 0.01%
15, Total_count: 8 : 0.01%
16, Total_count: 6 : 0.01%
17, Total_count: 6 : 0.01%
18, Total_count: 6 : 0.01%
19, Total_count: 6 : 0.01%
20, Total_count: 4 : 0.00%
21, Total_count: 4 : 0.00%
22, Total_count: 3 : 0.00%
23, Total_count: 2 : 0.00%
24, Total_count: 2 : 0.00%
25, Total_count: 2 : 0.00%
26, Total_count: 2 : 0.00%
27, Total_count: 1 : 0.00%
28, Total_count: 1 : 0.00%
29, Total_count: 1 : 0.00%


#### Column: 'payment_type'
* Most common payment method: credit_card (73.92% of payment rows)
* boleto : In Brazil, "boleto" is a prevalent payment method offering customers the flexibility to pay for purchases online or at physical locations using payment slips. It serves as a convenient alternative for individuals without credit cards or those who prefer not to use them, contributing to its widespread adoption and popularity in the country.

* "not_defined": this payment type is likely a placeholder or default value used in the dataset for cases where the payment type was not specified or not available. This could happen for various reasons, such as missing data or data-entry errors.
* Drop the "not_defined" rows, since there are only 3 of them.

In [28]:
# Tally how many payment rows use each payment method.
payment_type_counts = orders_payments_df['payment_type'].value_counts()
print(payment_type_counts)

# Denominator for the shares below: the number of rows in the payments frame.
total_orders = len(orders_payments_df)

# Express each method's tally as a percentage of all rows.
payment_type_percentage = payment_type_counts.div(total_orders).mul(100)

print(payment_type_percentage)


payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64
payment_type
credit_card    73.922376
boleto         19.043952
voucher         5.558978
debit_card      1.471806
not_defined     0.002888
Name: count, dtype: float64


In [29]:
# Remove the rows whose payment_type is the 'not_defined' placeholder.
# The name is rebound to a filtered copy instead of dropping with inplace=True, so the
# frame object loaded earlier is not mutated; row index labels are kept as they were.
orders_payments_df = orders_payments_df.loc[
    orders_payments_df['payment_type'] != 'not_defined'
].copy()


In [30]:
# Re-count payment types after the drop to confirm 'not_defined' no longer appears.
payment_type_counts = orders_payments_df['payment_type'].value_counts()
print(payment_type_counts)

payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
Name: count, dtype: int64


In [None]:
# Guard: the cleaned frame must not contain any row whose payment_type is 'not_defined'.
leftover_not_defined = orders_payments_df[orders_payments_df['payment_type'] == 'not_defined']
assert len(leftover_not_defined) == 0, "There are still rows with 'not_defined' values"
