# iPhone Sales Data EDA


In [61]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from itertools import combinations

In [3]:
# Load the sales CSV; the path is resolved relative to the notebook's working directory.
iphone_sales_data = pd.read_csv(filepath_or_buffer="../dataset/iphone_sales.csv")

#### How many unique customers made purchases?

In [26]:
# Customers are counted by distinct Email value. dropna=False keeps a missing
# email counted as one value, which is what len(.unique()) does.
iphone_sales_data['Email'].nunique(dropna=False)

525

#### Which customers purchased multiple products in a single transaction?

In [25]:
# Keep the transaction rows whose comma-separated Product field lists more than one item.
iphone_sales_data.loc[
    lambda sales: sales['Product'].str.split(",").str.len().gt(1)
]

Unnamed: 0,Name,Email,Product,Transaction Date
4,PERSON_5,PERSON_5@gmail.com,"PRODUCT_34,PRODUCT_86,PRODUCT_57,PRODUCT_89",01/03/2021 10:56:46
5,PERSON_6,PERSON_6@gmail.com,"PRODUCT_34,PRODUCT_66,PRODUCT_58,PRODUCT_83",01/03/2021 11:06:34
7,PERSON_8,PERSON_8@gmail.com,"PRODUCT_63,PRODUCT_90,PRODUCT_27,PRODUCT_5",01/03/2021 12:31:10
11,PERSON_12,PERSON_12@gmail.com,"PRODUCT_5,PRODUCT_34",01/03/2021 13:37:45
12,PERSON_13,PERSON_13@gmail.com,"PRODUCT_84,PRODUCT_27",01/03/2021 14:17:44
...,...,...,...,...
575,PERSON_521,PERSON_521@gmail.com,"PRODUCT_92,PRODUCT_2,PRODUCT_41,PRODUCT_3,PROD...",07/03/2021 23:50:27
576,PERSON_522,PERSON_522@gmail.com,"PRODUCT_48,PRODUCT_80,PRODUCT_71,PRODUCT_68,PR...",07/03/2021 23:53:03
578,PERSON_523,PERSON_523@gmail.com,"PRODUCT_36,PRODUCT_14,PRODUCT_64,PRODUCT_28,PR...",07/03/2021 23:58:24
579,PERSON_524,PERSON_524@gmail.com,"PRODUCT_75,PRODUCT_71,PRODUCT_86,PRODUCT_63,PR...",07/03/2021 23:59:26


#### Which customers have the highest number of transactions overall?

In [30]:
# Rank customers (keyed by Email) by number of transaction rows and keep every
# customer tied for first place, rather than an arbitrary single row.
# The column is selected directly: DataFrame.aggregate("Email") is meant for
# reduction names and appears to return the column only via attribute lookup.
(
    iphone_sales_data["Email"]
    .value_counts()
    .nlargest(1, keep="all")
    # Explicit names keep the output columns stable across pandas versions.
    .rename_axis("Email")
    .reset_index(name="count")
)

Unnamed: 0,Email,count
0,PERSON_470@gmail.com,5


#### Can we identify repeat customers vs one-time buyers?

In [38]:
# Transactions per customer (keyed by Email), most frequent first.
# Select the column directly rather than through DataFrame.aggregate("Email"),
# which is intended for reduction names such as "sum".
customer_with_frequency = (
    iphone_sales_data["Email"]
    .value_counts()
    # Explicit names keep the output columns stable across pandas versions.
    .rename_axis("Email")
    .reset_index(name="count")
)
# Repeat customers have more than one transaction; filter with `== 1` on the
# same column to list one-time buyers instead.
customer_with_frequency.loc[customer_with_frequency['count'] > 1]

Unnamed: 0,Email,count
0,PERSON_470@gmail.com,5
1,PERSON_330@gmail.com,4
2,PERSON_75@gmail.com,3
3,PERSON_156@gmail.com,3
4,PERSON_290@gmail.com,3
5,PERSON_283@hotmail.com,3
6,PERSON_263@gmail.com,2
7,PERSON_91@gmail.com,2
8,PERSON_207@gmail.com,2
9,PERSON_344@gmail.com,2


## Product-Level Analysis
#### What are the most frequently purchased products?

In [45]:
# Flatten the comma-separated Product field to one row per purchased product.
products = (
    iphone_sales_data['Product']
    .str.split(pat=',')
    .explode()
)
# value_counts sorts by frequency (descending), so the first entry is the top product.
most_frequent = products.value_counts().iloc[:1]
most_frequent

Product
PRODUCT_75    121
Name: count, dtype: int64

#### How many products are typically bought per transaction (single vs multiple purchases)?

In [55]:
# Number of comma-separated products in each transaction.
# .str.len() (as used in the multi-product filter earlier in the notebook)
# yields NaN for a missing Product value, where .apply(len) would raise TypeError.
# NOTE: this adds a "Product Count" column to iphone_sales_data in place.
iphone_sales_data["Product Count"] = iphone_sales_data['Product'].str.split(",").str.len()
# How many transactions contain 1, 2, 3, ... products.
distribution = iphone_sales_data["Product Count"].value_counts()
# Collapse the counts into two buckets. Rows without a count are dropped first
# so they are not silently labelled "Multiple".
single_vs_multiple = (
    iphone_sales_data["Product Count"]
    .dropna()
    .eq(1)
    .map({True: "Single", False: "Multiple"})
    .value_counts()
)
single_vs_multiple

Product Count
Single      346
Multiple    235
Name: count, dtype: int64

#### Which products are often purchased together (co-occurrence analysis)?

In [60]:
# One list of product names per transaction row, split on commas.
transactions = iphone_sales_data['Product'].str.split(pat=',')

In [65]:
# Count how often each unordered product pair appears within one transaction.
pair_counts = Counter()
# The loop variable is deliberately not called `products`, so it does not
# overwrite the `products` Series built in the product-frequency cell.
# dropna(): a row with a missing Product value has no list to pair up.
for basket in transactions.dropna():
    # set() counts a pair once per transaction even if a product is listed twice
    # (and prevents self-pairs); sorted() makes (A, B) and (B, A) the same key.
    # Fewer than two distinct products yields no combinations, so no length
    # guard is needed.
    pair_counts.update(combinations(sorted(set(basket)), 2))
# One row per distinct pair with its transaction count.
co_occurrence_df = pd.DataFrame(pair_counts.items(), columns=["Product Pair", "Count"])
# The ten most frequent pairs.
top_pairs = co_occurrence_df.sort_values(by="Count", ascending=False).head(10)
top_pairs

Unnamed: 0,Product Pair,Count
13,"(PRODUCT_27, PRODUCT_63)",28
52,"(PRODUCT_68, PRODUCT_75)",20
222,"(PRODUCT_12, PRODUCT_85)",17
20,"(PRODUCT_63, PRODUCT_75)",16
161,"(PRODUCT_12, PRODUCT_55)",15
227,"(PRODUCT_28, PRODUCT_61)",14
219,"(PRODUCT_12, PRODUCT_61)",14
257,"(PRODUCT_62, PRODUCT_85)",14
220,"(PRODUCT_12, PRODUCT_62)",14
249,"(PRODUCT_55, PRODUCT_61)",14
