In [36]:
# load necessary libraries
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 

# Prepare incremental data from raw_data for Kenya covering the past 2 years
raw_data = pd.read_csv('data/raw_data.csv')
raw_data['order_date'] = pd.to_datetime(raw_data['order_date'], format='%m/%d/%Y')

# The window is anchored on the moment the cell runs, so the extract changes
# from run to run.
end_date = pd.Timestamp.now()
# DateOffset rolls 29 Feb back to 28 Feb when the target year is not a leap
# year; datetime.replace(year=...) raises ValueError in that situation.
start_date = end_date - pd.DateOffset(years=2)

# Keep only Kenyan orders whose order_date falls inside [start_date, end_date].
in_window = raw_data['order_date'].between(start_date, end_date)
is_kenya = raw_data['country'] == 'Kenya'
# .copy() makes the extract independent of raw_data.
kenya_data = raw_data.loc[in_window & is_kenya].copy()
kenya_data.head()

Unnamed: 0,order_date,customer_id,gender,product,category,quantity,unit_price,payment_method,region,country,latitude,longitude
696,2025-09-02,58-9881957,Male,Adjustable Kitchen Broom Stand,Home,2.724192,14.99,visa,,Kenya,-0.582624,35.190115
805,2025-09-29,98-9139194,Male,Tahini,Food - Condiments,4.35834,5.49,americanexpress,,Kenya,-0.499328,37.278484
3610,2024-02-18,60-5492372,Male,Sustainable Wooden Toys,Toys,0.443593,29.99,visa,,Kenya,0.636206,34.278276
4492,2025-03-10,64-2991826,Male,Blue Denim Jeans,Clothing - Bottoms,2.040665,49.99,mastercard,,Kenya,0.202957,35.097839
5846,2024-07-08,26-4259341,Male,Quinoa Salad,Food - Prepared Foods,38.751826,5.99,visa,,Kenya,-3.39731,38.555934


In [37]:
# Persist the Kenya extract as a CSV file, leaving the DataFrame index out of the file
incremental_csv_path = 'data/incremental_data.csv'
kenya_data.to_csv(incremental_csv_path, index=False)

In [38]:
# Load both raw and incremental data
raw_data = pd.read_csv('data/raw_data.csv')
incremental_data = pd.read_csv('data/incremental_data.csv')

# Inspect the data for any inconsistencies
# head() already returns a DataFrame, so it is printed directly.
print("Raw Data First look:\n")
print(raw_data.head())
print("\nIncremental Data First look:\n")
print(incremental_data.head())

# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line after each report.
print("\nRaw Data Info:\n")
raw_data.info()
print("\nIncremental Data Info:\n")
incremental_data.info()

print("\nRaw Data Descriptive Statistics:\n")
print(raw_data.describe())
print("\nIncremental Data Descriptive Statistics:\n")
print(incremental_data.describe())

Raw Data First look:

   order_date customer_id gender                             product  \
0   5/16/2022  91-1384302   Male                Sriracha Chili Sauce   
1  10/18/2023  28-8266510   Male                  Herb Garlic Butter   
2    4/1/2021  75-6932352   Male  Outdoor Mosquito Repellent Lantern   
3   1/11/2021  69-2612638   Male                     Almond Joy Bars   
4    1/6/2024  01-5279721   Male                     Sourdough Bread   

               category   quantity  unit_price   payment_method        region  \
0     Food - Condiments   2.311552        2.79       mastercard           NaN   
1     Food - Condiments  12.672940        3.99  americanexpress         Braga   
2               Outdoor  15.813123       34.99       mastercard  Pennsylvania   
3  Food - Confectionery  18.074695        1.29         bankcard           NaN   
4         Food - Bakery   9.617337        4.99     instapayment           NaN   

         country   latitude   longitude  
0          Japan

From the initial inspection, both data sets have missing values in multiple columns. Another observation is that the data types of the columns do not properly describe the data; this may be because of the CSV file format.

To further understand the data, we can look at the number of missing values and see whether they are simply errors or carry statistical relevance.

In [39]:
# Count the missing values per column, one report per data set
for header, frame in (
    ("\nRaw Data Missing Values:\n", raw_data),
    ("\nIncremental Data Missing Values:\n", incremental_data),
):
    print(header)
    print(frame.isna().sum())


Raw Data Missing Values:

order_date           0
customer_id        260
gender               0
product            144
category           280
quantity             0
unit_price           0
payment_method    2413
region            9586
country            646
latitude          3690
longitude         3751
dtype: int64

Incremental Data Missing Values:

order_date           0
customer_id         51
gender               0
product             19
category            74
quantity             0
unit_price           0
payment_method     322
region            1181
country              0
latitude           580
longitude          580
dtype: int64


In both data sets, `customer_id`, `product`, `category`, `payment_method`, `region`, `longitude` and `latitude` have missing values, while `country` only has missing values in *raw_data*. This is expected, because the incremental data was filtered on `country == 'Kenya'`.

The missing values in `customer_id`, `product`, `country` and `category` may be the result of human error, and since there are relatively few of them, the rows with these missing values can be removed.

`region`, `longitude` and `latitude` have a significant number of missing values, and it is recommended to drop such columns entirely. For those interested in geolocation visualisation, the `latitude` and `longitude` columns can be kept.

The missing values in the `payment_method` column deserve further inspection to identify whether they carry meaning.

In [40]:
# List the distinct payment_method values in each data set to see what the
# missing entries sit alongside
raw_payment_methods = raw_data['payment_method'].unique()
incremental_payment_methods = incremental_data['payment_method'].unique()

print("\nRaw Data Payment Method Unique Values:\n")
print(raw_payment_methods)
print("\nIncremental Data Payment Method Unique Values:\n")
print(incremental_payment_methods)


Raw Data Payment Method Unique Values:

['mastercard' 'americanexpress' 'bankcard' 'instapayment' nan 'visa']

Incremental Data Payment Method Unique Values:

['visa' 'americanexpress' 'mastercard' nan 'instapayment' 'bankcard']


Looking at the unique values, we can see that the payment methods represented are either credit cards or mobile transactions. Since the number of missing values is significant, it would be reasonable to consider them as cash transactions, since they are not otherwise accounted for. Later on, the missing values will be kept and renamed accordingly.

To ensure a proper distribution, we have to make sure that there are no duplicate records that might skew the data and lead to incorrect observations.

In [41]:
# Count fully duplicated rows in each data set
raw_duplicate_count = raw_data.duplicated().sum()
incremental_duplicate_count = incremental_data.duplicated().sum()

print("\nRaw Data Duplicate Records:\n")
print(raw_duplicate_count)
print("\nIncremental Data Duplicate Records:\n")
print(incremental_duplicate_count)



Raw Data Duplicate Records:

0

Incremental Data Duplicate Records:

0
