In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw shopping-behaviour CSV from the working directory, then print
# a per-column summary (non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer='customer_shopping_behavior.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3863 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [3]:
# Eyeball five randomly chosen raw rows.
df.sample(n=5)

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
3749,3750,24,Female,Blouse,Clothing,95,Alaska,L,Yellow,Winter,2.9,No,2-Day Shipping,No,No,38,Venmo,Every 3 Months
638,639,53,Male,Sandals,Footwear,20,Maryland,S,Blue,Summer,2.7,Yes,Express,Yes,Yes,41,PayPal,Monthly
1266,1267,20,Male,Blouse,Clothing,31,North Dakota,M,Peach,Fall,4.0,No,Standard,Yes,Yes,14,Debit Card,Every 3 Months
1877,1878,69,Male,Gloves,Accessories,62,Pennsylvania,L,Pink,Fall,3.6,No,Express,No,No,3,Credit Card,Annually
3119,3120,63,Female,Sandals,Footwear,79,Louisiana,XL,Silver,Winter,4.7,No,2-Day Shipping,No,No,21,Credit Card,Every 3 Months


In [4]:
# Number of missing values in each column.
df.isna().sum()

Customer ID                0
Age                        0
Gender                     0
Item Purchased             0
Category                   0
Purchase Amount (USD)      0
Location                   0
Size                       0
Color                      0
Season                     0
Review Rating             37
Subscription Status        0
Shipping Type              0
Discount Applied           0
Promo Code Used            0
Previous Purchases         0
Payment Method             0
Frequency of Purchases     0
dtype: int64

In [6]:
# Normalise column labels to snake_case (lowercase, spaces -> underscores),
# then give the purchase-amount column a shorter name.
df = (
    df
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
    .rename(columns={'purchase_amount_(usd)': 'bill_value'})
)
df.columns

Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',
       'bill_value', 'location', 'size', 'color', 'season', 'review_rating',
       'subscription_status', 'shipping_type', 'discount_applied',
       'promo_code_used', 'previous_purchases', 'payment_method',
       'frequency_of_purchases'],
      dtype='object')

In [7]:
# Rows with a missing review rating, counted per product category.
null_category_wise = df.loc[
    df['review_rating'].isna(),
    ['category', 'review_rating'],
]
null_category_wise.value_counts(subset='category')

category
Clothing       19
Accessories    11
Footwear        5
Outerwear       2
Name: count, dtype: int64

In [8]:
# Rows with a missing review rating, counted per purchased item.
null_item_wise = df.loc[
    df['review_rating'].isna(),
    ['item_purchased', 'review_rating'],
]
null_item_wise.value_counts(subset='item_purchased')

item_purchased
Shoes         3
Sweater       3
Pants         3
Gloves        3
T-shirt       2
Coat          2
Blouse        2
Belt          2
Scarf         2
Sunglasses    2
Hoodie        2
Dress         2
Shorts        2
Boots         1
Hat           1
Jewelry       1
Sandals       1
Shirt         1
Skirt         1
Socks         1
Name: count, dtype: int64

In [9]:
# Impute each missing review rating with the median rating of the same item.
# transform('median') broadcasts one per-item median onto every row, so the
# fill is a single vectorised fillna instead of a Python lambda run per group.
# Rows for which no group median is available keep whatever rating they
# already have (fillna with NaN is a no-op).
df['review_rating'] = df['review_rating'].fillna(
    df.groupby('item_purchased')['review_rating'].transform('median')
)
# Re-check the per-column null counts after imputation.
df.isnull().sum()

customer_id               0
age                       0
gender                    0
item_purchased            0
category                  0
bill_value                0
location                  0
size                      0
color                     0
season                    0
review_rating             0
subscription_status       0
shipping_type             0
discount_applied          0
promo_code_used           0
previous_purchases        0
payment_method            0
frequency_of_purchases    0
dtype: int64

In [10]:
# Spot-check five random rows after renaming and imputation.
df.sample(n=5)

Unnamed: 0,customer_id,age,gender,item_purchased,category,bill_value,location,size,color,season,review_rating,subscription_status,shipping_type,discount_applied,promo_code_used,previous_purchases,payment_method,frequency_of_purchases
1356,1357,49,Male,Jacket,Outerwear,61,Indiana,M,Cyan,Spring,4.0,No,Free Shipping,Yes,Yes,34,Bank Transfer,Annually
3693,3694,66,Female,Blouse,Clothing,46,South Dakota,M,Violet,Summer,4.7,No,2-Day Shipping,No,No,10,Venmo,Weekly
115,116,53,Male,Skirt,Clothing,94,Wyoming,M,Gray,Spring,3.8,Yes,Standard,Yes,Yes,48,Debit Card,Every 3 Months
2246,2247,49,Male,Jacket,Outerwear,96,Hawaii,M,Cyan,Fall,4.3,No,Store Pickup,No,No,39,PayPal,Every 3 Months
3873,3874,42,Female,Sandals,Footwear,43,Louisiana,L,Silver,Summer,4.0,No,Standard,No,No,22,Credit Card,Weekly


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the age column.
df['age'].describe()

count    3900.000000
mean       44.068462
std        15.207589
min        18.000000
25%        31.000000
50%        44.000000
75%        57.000000
max        70.000000
Name: age, dtype: float64

In [12]:
def age_sep(age):
    """Map a numeric age to a coarse age-group label.

    Bands (left-inclusive, right-exclusive except the last):
        [18, 30) -> 'young'
        [30, 44) -> 'adult'
        [44, 57) -> 'middle_aged'
        [57, 70] -> 'senior'

    Any value outside 18..70 returns 'no age'; NaN also lands there because
    it fails every comparison.
    """
    # Guard clause: reject anything outside the supported range up front.
    if not (18 <= age <= 70):
        return 'no age'
    # Within range, the first exclusive upper bound the age falls under wins.
    for upper_bound, label in ((30, 'young'), (44, 'adult'), (57, 'middle_aged')):
        if age < upper_bound:
            return label
    return 'senior'

# Label every row with its age_sep band. Note: the quantile-based cell that
# follows assigns 'age_category' again, replacing these values.
df['age_category'] = df['age'].map(age_sep)

In [13]:
# Bin age into four quantile-based groups, labelled from youngest to oldest.
# This overwrites any 'age_category' column assigned earlier.
age_category = ['young', 'adult', 'middle_aged', 'senior']
df['age_category'] = pd.qcut(x=df['age'], q=len(age_category), labels=age_category)

In [16]:
# Spot-check ten random rows: age next to its assigned group.
df.loc[:, ['age', 'age_category']].sample(n=10)

Unnamed: 0,age,age_category
2835,33,adult
1344,56,middle_aged
3102,51,middle_aged
3541,59,senior
871,47,middle_aged
317,58,senior
71,36,adult
1339,36,adult
594,35,adult
439,62,senior


In [17]:
# Row count per raw purchase-frequency label, to see which labels need mapping.
df['frequency_of_purchases'].value_counts()

frequency_of_purchases
Every 3 Months    584
Annually          572
Quarterly         563
Monthly           553
Bi-Weekly         547
Fortnightly       542
Weekly            539
Name: count, dtype: int64

In [18]:
# Convert the purchase-frequency label into an approximate interval in days.
# 'Every 3 Months' and 'Quarterly' share one value, as do 'Bi-Weekly' and
# 'Fortnightly' (both treated here as a 14-day cadence).
purchase_frequency_cat = {
    'Every 3 Months' : 90,
    'Annually' : 365,
    'Quarterly' : 90,
    'Monthly' : 30,
    'Bi-Weekly' : 14,
    'Fortnightly' : 14,
    'Weekly' : 7
}

df['purchase_frequency_days'] = df['frequency_of_purchases'].map(purchase_frequency_cat)

# Series.map yields NaN for any label that is not a key of the dict, so make
# an unexpected label fail loudly here instead of leaking silent gaps downstream.
unmapped_frequencies = set(df['frequency_of_purchases'].dropna()) - set(purchase_frequency_cat)
assert not unmapped_frequencies, f"unmapped frequency labels: {sorted(unmapped_frequencies)}"

df[['frequency_of_purchases', 'purchase_frequency_days']].sample(10)

Unnamed: 0,frequency_of_purchases,purchase_frequency_days
127,Weekly,7
129,Bi-Weekly,14
578,Fortnightly,14
2234,Weekly,7
2519,Quarterly,90
3311,Annually,365
3277,Monthly,30
3177,Monthly,30
488,Weekly,7
256,Every 3 Months,90


In [19]:
# List the current column labels, including the two derived columns.
df.columns

Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',
       'bill_value', 'location', 'size', 'color', 'season', 'review_rating',
       'subscription_status', 'shipping_type', 'discount_applied',
       'promo_code_used', 'previous_purchases', 'payment_method',
       'frequency_of_purchases', 'age_category', 'purchase_frequency_days'],
      dtype='object')

In [20]:
# Check whether 'discount_applied' and 'promo_code_used' ever disagree by
# counting row-wise equality between the two columns.
df['discount_applied'].eq(df['promo_code_used']).value_counts()


True    3900
Name: count, dtype: int64

In [21]:
# Drop 'promo_code_used', which matched 'discount_applied' on every row.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns='promo_code_used', errors='ignore')
df.columns

Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',
       'bill_value', 'location', 'size', 'color', 'season', 'review_rating',
       'subscription_status', 'shipping_type', 'discount_applied',
       'previous_purchases', 'payment_method', 'frequency_of_purchases',
       'age_category', 'purchase_frequency_days'],
      dtype='object')

In [29]:
from urllib.parse import quote_plus

from dotenv import load_dotenv
from sqlalchemy import create_engine
import os

# Load environment variables via python-dotenv (by default from a .env file).
load_dotenv()

username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
database = os.getenv("DB_NAME")

# os.getenv returns None for unset variables; without this check the literal
# text "None" would be formatted into the URL and only surface later as an
# opaque connection failure.
missing_settings = [
    key
    for key, value in (
        ("DB_USER", username),
        ("DB_PASSWORD", password),
        ("DB_HOST", host),
        ("DB_NAME", database),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(f"Missing database settings: {', '.join(missing_settings)}")

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot be mistaken for URL delimiters.
# NOTE(review): this expects the raw (not already URL-encoded) values in the
# environment; confirm against the .env file.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}/{database}"
)

# Open and immediately close one connection as a smoke test; a failure is
# reported but not re-raised, so later cells still run.
try:
    with engine.connect():
        print("Connection successful!")
except Exception as e:
    print("Connection failed:")
    print(e)


Connection successful!


In [30]:
# Write the cleaned frame to the database as table 'customer_data'.
df.to_sql(
    'customer_data',       # destination table name
    engine,                # SQLAlchemy engine used for the write
    index=False,           # don't write the pandas index as a column
    chunksize=1000,        # insert rows in batches of 1000
    if_exists='replace',   # drop an existing table and create a new one
)


3900

In [31]:
# Read back the first five rows to confirm the table was written.
pd.read_sql(sql="SELECT * FROM customer_data LIMIT 5;", con=engine)


Unnamed: 0,customer_id,age,gender,item_purchased,category,bill_value,location,size,color,season,review_rating,subscription_status,shipping_type,discount_applied,previous_purchases,payment_method,frequency_of_purchases,age_category,purchase_frequency_days
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,14,Venmo,Fortnightly,middle_aged,14
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,2,Cash,Fortnightly,young,14
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,23,Credit Card,Weekly,middle_aged,7
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,49,PayPal,Weekly,young,7
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,31,PayPal,Annually,middle_aged,365
