## a)

In [6]:
import pandas as pd

# Read both CSV exports from the local data directory in a single pass;
# the tuple order below fixes which frame is bound to which name.
customer_data, sales_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/customer_data.csv', './data/sales_data.csv')
)

# Bare last expression: renders the customer table as the cell output.
customer_data

Unnamed: 0,customer_id,gender,age,payment_method
0,C241288,Female,28.0,Credit Card
1,C111565,Male,21.0,Debit Card
2,C266599,Male,20.0,Cash
3,C988172,Female,66.0,Credit Card
4,C189076,Female,53.0,Cash
...,...,...,...,...
99452,C441542,Female,45.0,Credit Card
99453,C569580,Male,27.0,Cash
99454,C103292,Male,63.0,Debit Card
99455,C800631,Male,56.0,Cash


In [42]:
# Bare expression: renders the sales table as this cell's output.
sales_data

Unnamed: 0,invoice_no,customer_id,category,quantity,price,invoice_date,shopping_mall
0,I138884,C241288,Clothing,5,1500.40,05-08-2022,Kanyon
1,I317333,C111565,Shoes,3,1800.51,12-12-2021,Forum Istanbul
2,I127801,C266599,Clothing,1,300.08,09-11-2021,Metrocity
3,I173702,C988172,Shoes,5,3000.85,16-05-2021,Metropol AVM
4,I337046,C189076,Books,4,60.60,24-10-2021,Kanyon
...,...,...,...,...,...,...,...
99452,I219422,C441542,Souvenir,5,58.65,21-09-2022,Kanyon
99453,I325143,C569580,Food & Beverage,2,10.46,22-09-2021,Forum Istanbul
99454,I824010,C103292,Food & Beverage,2,10.46,28-03-2021,Metrocity
99455,I702964,C800631,Technology,4,4200.00,16-03-2021,Istinye Park


## b)

### 1)

In [11]:
# Per-column count of missing (NaN) entries for each table.
# sep='\n' puts the heading and the resulting Series on separate lines.
print('Missing data in customer_data:', customer_data.isna().sum(), sep='\n')
print('\nMissing data in sales_data:', sales_data.isna().sum(), sep='\n')

Missing data in customer_data:
customer_id         0
gender              0
age               119
payment_method      0
dtype: int64

Missing data in sales_data:
invoice_no       0
customer_id      0
category         0
quantity         0
price            0
invoice_date     0
shopping_mall    0
dtype: int64


### 2)

In [22]:
# Distinct category labels, and total units sold (sum of `quantity`) per category.
n_categories = sales_data['category'].nunique()
quantity_by_category = sales_data.groupby('category')['quantity'].sum()

print('Number of product categories:', n_categories, sep='\n')

# Largest total first, so the top row is the best-selling category.
print(
    '\nNumber of products purchased by category:',
    quantity_by_category.sort_values(ascending=False),
    sep='\n',
)

print('\nClothing was the most popular category.')

Number of product categories:
8

Number of products purchased by category:
category
Clothing           103558
Cosmetics           45465
Food & Beverage     44277
Toys                30321
Shoes               30217
Technology          15021
Books               14982
Souvenir            14871
Name: quantity, dtype: int64

Clothing was the most popular category.


### 3)

In [39]:
# Attach each customer's attributes to their sales rows. An inner join keeps
# only customer_ids that are present in both tables.
merged = pd.merge(sales_data, customer_data, on='customer_id', how='inner')

# Split the joined rows by the customer's recorded gender.
f_sales = merged.loc[merged['gender'].eq('Female')]
m_sales = merged.loc[merged['gender'].eq('Male')]

# For each subset: total units per category, largest first, first three rows.
f_quantity = f_sales.groupby('category')['quantity'].sum()
print('Top three categories for women:')
print(f_quantity.sort_values(ascending=False).iloc[:3])

m_quantity = m_sales.groupby('category')['quantity'].sum()
print('\nTop three categories for men:')
print(m_quantity.sort_values(ascending=False).iloc[:3])

Top three categories for women:
category
Clothing           62039
Cosmetics          27261
Food & Beverage    26362
Name: quantity, dtype: int64

Top three categories for men:
category
Clothing           41519
Cosmetics          18204
Food & Beverage    17915
Name: quantity, dtype: int64


### 4)

In [46]:
# Group the joined sales/customer rows once and reuse the grouping for both totals.
by_payment = merged.groupby('payment_method')

# Units bought per payment method, largest first.
print('Items purchased with each payment method:')
print(by_payment['quantity'].sum().sort_values(ascending=False))

# Sum of the `price` column per payment method, largest first.
print('\nDollars spent with each payment method:')
print(by_payment['price'].sum().sort_values(ascending=False))

print('\nCash is the most popular payment method')

Items purchased with each payment method:
payment_method
Cash           133370
Credit Card    105045
Debit Card      60297
Name: quantity, dtype: int64

Dollars spent with each payment method:
payment_method
Cash           30705030.98
Credit Card    24051476.93
Debit Card     13794858.00
Name: price, dtype: float64

Cash is the most popular payment method


### 5)

In [68]:
# Parse the day-month-year strings into datetimes so the .dt accessor works.
sales_data['invoice_date'] = pd.to_datetime(sales_data['invoice_date'], format='%d-%m-%Y')

# Total units per calendar month number (1-12).
# NOTE(review): grouping on .dt.month pools the same month from different
# years; if the data does not contain each month the same number of times,
# these totals are not directly comparable -- confirm the date range.
invoice_month = sales_data['invoice_date'].dt.month
quantity_by_month = sales_data.groupby(invoice_month)['quantity'].sum()

print(quantity_by_month.sort_values(ascending=False))
print('\nJanuary has the highest number of sales')

invoice_date
1     34763
2     31343
3     26224
7     23699
10    23477
5     23217
12    23152
8     22921
6     22674
11    22586
4     22558
9     22098
Name: quantity, dtype: int64

January has the highest number of sales


### 6)

In [78]:
# Bucket every sale into a fixed-width price band, labelled by the band's
# exclusive upper edge: the next multiple of PRICE_BIN_WIDTH above the
# truncated price (e.g. 499.99 -> 500, 500.00 -> 1000).
PRICE_BIN_WIDTH = 500

# Vectorised equivalent of the former row-by-row iterrows() loop:
# astype(int) drops the fractional part exactly like int(price) did, and
# floor division then picks the band. Assumes prices are non-negative
# (floor division and int() truncation differ below zero) -- TODO confirm.
sales_data['price_group'] = (
    sales_data['price'].astype(int) // PRICE_BIN_WIDTH + 1
) * PRICE_BIN_WIDTH

# Grouping reused by the cells below to aggregate per price band.
sales_bins = sales_data.groupby('price_group')

#### a)

In [85]:
# Largest price_group label present in the data; computed once and reused.
top_price_group = sales_data['price_group'].max()

print('The highest sales group is:', top_price_group)
print()

# Every sale whose price falls in that top band.
print(sales_data.loc[sales_data['price_group'].eq(top_price_group)])

The highest sales group is: 5500

             invoice_no customer_id    category  quantity   price  \
invoice_date                                                        
2021-10-26      I252275     C313348  Technology         5  5250.0   
2021-11-19      I883721     C236859  Technology         5  5250.0   
2022-01-03      I194017     C462011  Technology         5  5250.0   
2022-10-04      I252528     C333065  Technology         5  5250.0   
2021-04-21      I311849     C178487  Technology         5  5250.0   
...                 ...         ...         ...       ...     ...   
2022-09-08      I236291     C724141  Technology         5  5250.0   
2023-01-24      I285991     C894009  Technology         5  5250.0   
2021-06-26      I282417     C247846  Technology         5  5250.0   
2022-02-05      I223483     C877031  Technology         5  5250.0   
2022-03-10      I378781     C324387  Technology         5  5250.0   

             invoice_date     shopping_mall  price_group  
invoice_d

#### b)

In [89]:
# Total units sold within each price band, busiest band first.
quantity_by_price_group = sales_bins['quantity'].sum()
print(quantity_by_price_group.sort_values(ascending=False))

print('\nThe group of items between 0 and 500 had the most purchases')

price_group
500     156757
2000     40730
1000     36565
1500     32471
3500     13092
2500     10088
5500      5005
4500      4004
Name: quantity, dtype: int64

The group of items between 0 and 500 had the most purchases


#### c)

In [96]:
# Keep only invoices dated in January (month number 1, any year). This relies
# on invoice_date already being a datetime column and on the price_group
# column having been added to sales_data by earlier cells.
january_sales = sales_data[sales_data['invoice_date'].dt.month == 1]

# Aggregate the January subset itself. The previous version printed
# sales_bins, which groups every month, so the January filter had no effect.
january_bins = january_sales.groupby('price_group')
print(january_bins['quantity'].sum().sort_values(ascending=False))

print('\nPurchases between 0 and 500 dollars are still the most common')

price_group
500     18431
2000     4671
1000     4190
1500     3839
3500     1323
2500     1236
5500      585
4500      488
Name: quantity, dtype: int64

Purchases between 0 and 500 dollars are still the most common
