In [78]:
import numpy as np
import pandas as pd

In [79]:
# Importing all tables

customer = pd.read_csv('data/raw/customers.csv')
ticket = pd.read_csv('data/raw/support_tickets.csv')
campaign = pd.read_csv('data/raw/campaigns.csv')
review = pd.read_csv('data/raw/customer_reviews_complete.csv')
transaction = pd.read_csv('data/raw/transactions.csv')
interaction = pd.read_csv('data/raw/interactions.csv')

## Cleaning customer table

In [80]:
customer.sample()

Unnamed: 0,customer_id,full_name,age,gender,email,phone,street_address,city,state,zip_code,registration_date,preferred_channel
4834,44dd5bfc-65ef-487d-a8cc-89a5fd4b583e,Amanda Wright,18.0,Female,amanda_wright@hotmail.com,(824)919-0052x10450,629 Shepherd Course Suite 743,Chicago,Illinois,42967.0,3/12/2024,online


In [81]:
customer.shape

(5000, 12)

In [82]:
customer.isnull().sum()

customer_id            0
full_name            106
age                  186
gender               112
email                111
phone                186
street_address       177
city                  97
state                 93
zip_code              96
registration_date      0
preferred_channel    114
dtype: int64

In [83]:
customer.duplicated().sum()

np.int64(0)

In [84]:
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        5000 non-null   object 
 1   full_name          4894 non-null   object 
 2   age                4814 non-null   float64
 3   gender             4888 non-null   object 
 4   email              4889 non-null   object 
 5   phone              4814 non-null   object 
 6   street_address     4823 non-null   object 
 7   city               4903 non-null   object 
 8   state              4907 non-null   object 
 9   zip_code           4904 non-null   float64
 10  registration_date  5000 non-null   object 
 11  preferred_channel  4886 non-null   object 
dtypes: float64(2), object(10)
memory usage: 468.9+ KB


In [85]:
customer.drop(columns=['email','phone','zip_code','street_address'],inplace=True)

In [86]:
customer['age'] = pd.to_numeric(customer['age'],errors='coerce').astype('Int64')
customer['registration_date'] = pd.to_datetime(customer['registration_date'],errors='coerce')

In [87]:
customer['full_name'] = customer['full_name'].fillna('Unknown')
customer['gender'] = customer['gender'].fillna('Unknown')
customer['city'] = customer['city'].fillna('Unknown')
customer['state'] = customer['state'].fillna('Unknown')
customer['preferred_channel'] = customer['preferred_channel'].fillna('Unknown')

In [88]:
customer['age'] = customer['age'].fillna(customer['age'].median())

In [89]:
customer['year'] = customer["registration_date"].dt.year

In [90]:
customer['age_group'] = pd.cut(
    customer['age'],
    bins=[1,18, 35, 45, 60, 100],
    labels=['1-17','18-35', '36-45', '46-60', '60+'],
    right=True
)

## Cleaning support_ticket table

In [91]:
ticket.sample()

Unnamed: 0,ticket_id,customer_id,issue_category,priority,submission_date,resolution_date,resolution_status,resolution_time_hours,customer_satisfaction_score,notes
181,3b4e3bf1-07d2-46a3-8e4e-4ce02dd7d4a8,b98f8920-d8d5-4a96-b424-fd1a2591fced,Billing,Low,10/21/2024 0:00,10/25/2024 14:00,Resolved,110.0,3.0,Customer was charged twice for the same MacBook.


In [92]:
ticket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ticket_id                    3000 non-null   object 
 1   customer_id                  3000 non-null   object 
 2   issue_category               2929 non-null   object 
 3   priority                     2940 non-null   object 
 4   submission_date              3000 non-null   object 
 5   resolution_date              2756 non-null   object 
 6   resolution_status            2943 non-null   object 
 7   resolution_time_hours        2701 non-null   float64
 8   customer_satisfaction_score  2659 non-null   float64
 9   notes                        2939 non-null   object 
dtypes: float64(2), object(8)
memory usage: 234.5+ KB


In [93]:
ticket.isnull().sum()

ticket_id                        0
customer_id                      0
issue_category                  71
priority                        60
submission_date                  0
resolution_date                244
resolution_status               57
resolution_time_hours          299
customer_satisfaction_score    341
notes                           61
dtype: int64

In [94]:
ticket.duplicated().sum()

np.int64(0)

In [95]:
ticket.shape

(3000, 10)

In [96]:
ticket['submission_date'] = pd.to_datetime(ticket['submission_date'],errors='coerce')
ticket['resolution_date'] = pd.to_datetime(ticket['resolution_date'],errors='coerce')

In [97]:
ticket.drop(columns=['notes'],inplace=True)

In [98]:
ticket['issue_category'] = ticket['issue_category'].fillna('Unknown')
ticket['priority'] = ticket['priority'].fillna('Unknown')

In [99]:
# If resolution_date exists → status must be resolved
ticket.loc[
    ticket['resolution_status'].isna() & ticket['resolution_date'].notna(),
    'resolution_status'
] = 'resolved'

# If resolution_date is null → status must be open
ticket.loc[
    ticket['resolution_status'].isna() & ticket['resolution_date'].isna(),
    'resolution_status'
] = 'open'

In [100]:
ticket.loc[
    ticket['resolution_time_hours'].isna(),
    'resolution_time_hours'
] = (
    (ticket['resolution_date'] - ticket['submission_date'])
    .dt.total_seconds() / 3600
)

In [101]:
avg_score = ticket['customer_satisfaction_score'].mean()
ticket['customer_satisfaction_score'] = ticket['customer_satisfaction_score'].fillna(round(avg_score))

In [102]:
ticket['ticket_year'] = ticket['submission_date'].dt.year
ticket['ticket_month'] = ticket['submission_date'].dt.month
ticket['ticket_day'] = ticket['submission_date'].dt.day

In [103]:
ticket['resolution_speed'] = pd.cut(
    ticket['resolution_time_hours'],
    bins=[0, 24, 72, 168, float('inf')],
    labels=['within_1_day', '1_3_days', '3_7_days', 'more_than_7_days']
)

In [104]:
ticket['isResolved'] = ticket['resolution_date'].notna().astype(int)

In [105]:
ticket['high_priority_flag'] = (ticket['priority'] == 'high').astype(int)
ticket['sla_breach'] = (ticket['resolution_time_hours'] > 72).astype(int)

## Proceding with review table

In [106]:
review.sample()

Unnamed: 0,review_id,customer_id,product_name,product_category,full_name,transaction_date,review_date,rating,review_title,review_text
361,rev_000362,e2d05bf5-0103-4196-bd67-d3dcaf0adc37,iPhone 13,Smartphones,Olivia Salazar,1/17/2025,1/19/2025,4,"Solid Upgrade, Just a Few Nitpicks","The iPhone 13 is an impressive device, boastin..."


In [107]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1000 non-null   object
 1   customer_id       1000 non-null   object
 2   product_name      976 non-null    object
 3   product_category  981 non-null    object
 4   full_name         981 non-null    object
 5   transaction_date  1000 non-null   object
 6   review_date       1000 non-null   object
 7   rating            1000 non-null   int64 
 8   review_title      1000 non-null   object
 9   review_text       1000 non-null   object
dtypes: int64(1), object(9)
memory usage: 78.3+ KB


In [108]:
review.isnull().sum()

review_id            0
customer_id          0
product_name        24
product_category    19
full_name           19
transaction_date     0
review_date          0
rating               0
review_title         0
review_text          0
dtype: int64

In [109]:
review.shape

(1000, 10)

In [110]:
review.duplicated().sum()

np.int64(0)

In [111]:
review['transaction_date'] = pd.to_datetime(review['transaction_date'],errors='coerce')
review['review_date'] = pd.to_datetime(review['review_date'],errors='coerce')

In [112]:
review = review.merge(
    customer[['customer_id','full_name']],
    how='left',
    on='customer_id',
    suffixes=('','_cust')
)

review['full_name'] = review['full_name'].fillna(review['full_name_cust'])
review['full_name'] = review['full_name'].fillna('Unknown')

review.drop(columns=['full_name_cust'],inplace=True)

In [113]:
review['product_name'] = review['product_name'].fillna(
    review['product_name'].mode()[0]
)

review['product_category'] = review['product_category'].fillna(
    review['product_category'].mode()[0]
)


In [114]:
review['review_year'] = review['review_date'].dt.year
review['review_month'] = review['review_date'].dt.month
review['days_after_transaction'] = (review['review_date'] - review['transaction_date']).dt.days

In [115]:
review['sentiment'] = pd.cut(
    review['rating'],
    bins=[0,2,3,5],
    labels=['Negative','Neutral','Positive']
)

## Proceed with campaign table

In [116]:
campaign.sample()

Unnamed: 0,campaign_id,campaign_name,campaign_type,start_date,end_date,target_segment,budget,impressions,clicks,conversions,conversion_rate,roi
89,3398c829-9f8a-4aef-b771-41e37228f98f,Limited Time Event 2023,Social Media,9/26/2023,10/26/2023,Southern States,15025.04,677445.0,12772.0,491.0,3.84,822.33


In [117]:
campaign.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_id      200 non-null    object 
 1   campaign_name    194 non-null    object 
 2   campaign_type    194 non-null    object 
 3   start_date       200 non-null    object 
 4   end_date         200 non-null    object 
 5   target_segment   200 non-null    object 
 6   budget           196 non-null    float64
 7   impressions      196 non-null    float64
 8   clicks           190 non-null    float64
 9   conversions      193 non-null    float64
 10  conversion_rate  198 non-null    float64
 11  roi              197 non-null    float64
dtypes: float64(6), object(6)
memory usage: 18.9+ KB


In [118]:
campaign['start_date'] = pd.to_datetime(campaign['start_date'],errors='coerce')
campaign['end_date'] = pd.to_datetime(campaign['end_date'],errors='coerce')

In [119]:
campaign.isnull().sum()

campaign_id         0
campaign_name       6
campaign_type       6
start_date          0
end_date            0
target_segment      0
budget              4
impressions         4
clicks             10
conversions         7
conversion_rate     2
roi                 3
dtype: int64

In [120]:
campaign.shape

(200, 12)

In [121]:
campaign['campaign_name'] = campaign['campaign_name'].fillna('Unknown')
campaign['campaign_type'] = campaign['campaign_type'].fillna('Unknown')

In [122]:
campaign['budget'] = campaign['budget'].fillna(0)
campaign['impressions'] = campaign['impressions'].fillna(0)
campaign['clicks'] = campaign[['clicks', 'impressions']].min(axis=1)
campaign['conversions'] = campaign[['conversions', 'clicks']].min(axis=1)


In [123]:
campaign['conversion_rate'] = (
    campaign['conversions'] / campaign['clicks']
).replace([float('inf'), -float('inf')], 0).fillna(0)

campaign['roi'] = (
    campaign['conversions'] / campaign['budget']
).replace([float('inf'), -float('inf')], 0).fillna(0)

In [124]:
campaign['campaign_duration_days'] = ( campaign['end_date'] - campaign['start_date'] ).dt.days

In [125]:
campaign['start_year'] = campaign['start_date'].dt.year
campaign['start_month'] = campaign['start_date'].dt.month

In [126]:
campaign['cost_per_click'] = (
    campaign['budget'] / campaign['clicks']
).replace([float('inf'), -float('inf')], 0).fillna(0)

campaign['cost_per_conversion'] = (
    campaign['budget'] / campaign['conversions']
).replace([float('inf'), -float('inf')], 0).fillna(0)

## Proceed with transaction Table

In [127]:
transaction.sample()

Unnamed: 0,transaction_id,customer_id,product_name,product_category,quantity,price,transaction_date,store_location,payment_method,discount_applied
1014,c7bdbfdb-0e04-4a72-a4fa-540aaef86531,af4aa575-38bd-4362-8c22-1c66b5f820a3,Vizio SmartCast TV,TVs,2.0,1313.97,8/22/2024,Online,Credit Card,0.0


In [128]:
transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32295 entries, 0 to 32294
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transaction_id    32295 non-null  object 
 1   customer_id       32295 non-null  object 
 2   product_name      31617 non-null  object 
 3   product_category  31609 non-null  object 
 4   quantity          31651 non-null  float64
 5   price             31673 non-null  float64
 6   transaction_date  32295 non-null  object 
 7   store_location    31651 non-null  object 
 8   payment_method    31635 non-null  object 
 9   discount_applied  31684 non-null  float64
dtypes: float64(3), object(7)
memory usage: 2.5+ MB


In [129]:
transaction.isnull().sum()

transaction_id        0
customer_id           0
product_name        678
product_category    686
quantity            644
price               622
transaction_date      0
store_location      644
payment_method      660
discount_applied    611
dtype: int64

In [130]:
transaction.shape

(32295, 10)

In [131]:
transaction['transaction_date'] = pd.to_datetime(transaction['transaction_date'],errors='coerce')

In [132]:
transaction['product_name'] = transaction['product_name'].fillna(
    transaction['product_name'].mode()[0]
)

transaction['product_category'] = transaction['product_category'].fillna(
    transaction['product_category'].mode()[0]
)

In [133]:
transaction['quantity'] = transaction['quantity'].fillna(
    transaction.groupby('product_name')['quantity'].transform('median')
)

transaction['quantity'] = transaction['quantity'].fillna(
    transaction.groupby('product_category')['quantity'].transform('median')
)

transaction['quantity'] = transaction['quantity'].fillna(
    transaction['quantity'].median())


In [134]:
transaction['price'] = transaction['price'].fillna(
    transaction.groupby('product_name')['price'].transform('median')
)

transaction['price'] = transaction['price'].fillna(
    transaction.groupby('product_category')['price'].transform('median')
)

transaction['price'] = transaction['price'].fillna(
    transaction['price'].median()
)

In [135]:
transaction['store_location'] = transaction['store_location'].fillna(
    transaction['store_location'].mode()[0]
)

transaction['payment_method'] = transaction['payment_method'].fillna(
    transaction['payment_method'].mode()[0])

In [136]:
transaction['discount_applied'] = transaction['discount_applied'].fillna(
    transaction.groupby('product_category')['discount_applied'].transform('median')
)
transaction['discount_applied'] = transaction['discount_applied'].fillna(
    transaction['discount_applied'].median()
)

In [137]:
transaction['txn_year'] = transaction['transaction_date'].dt.year
transaction['txn_month'] = transaction['transaction_date'].dt.month
transaction['txn_weekday'] = transaction['transaction_date'].dt.day_name()

In [138]:
transaction['gross_amount'] = transaction['price']*transaction['quantity']
transaction['final_amount'] = transaction['gross_amount'] - (transaction['gross_amount']*transaction['discount_applied']/100)

### Add some more features to customer table based on transaction table

In [139]:
customer_spend = (
    transaction.groupby('customer_id')['final_amount']
    .sum()
    .reset_index(name='total_spent')
)

customer = customer.merge(customer_spend, on='customer_id', how='left')
customer['total_spent'] = customer['total_spent'].fillna(0)

In [140]:
customer['spending_category'] = pd.qcut(
    customer['total_spent'],
    q=3,
    labels=['Low','Medium','High']
)

In [141]:
customer_txn_count = (
    transaction.groupby('customer_id')['transaction_id']
    .count()
    .reset_index(name='total_transactions')
)
customer = customer.merge(customer_txn_count, on='customer_id', how='left')
customer['total_transactions'] = customer['total_transactions'].fillna(0)


## Proceed with Interaction table

In [142]:
interaction.sample()

Unnamed: 0,interaction_id,customer_id,channel,interaction_type,interaction_date,duration,page_or_product,session_id
13408,3bb8d2a4-d296-4f2f-b2bf-d93da326df2b,25b3bda2-6634-4e00-a856-1e4170c4b432,mobile app,wishlist_add,2/25/2025 21:34,7.0,MacBook Pro,25b3bda2-6634-4e00-a856-1e4170c4b432_session_20


In [143]:
interaction.shape

(100000, 8)

In [144]:
interaction.isnull().sum()

interaction_id         0
customer_id            0
channel             2002
interaction_type    2022
interaction_date       0
duration            1963
page_or_product     1927
session_id             0
dtype: int64

In [145]:
interaction.duplicated().sum()

np.int64(0)

In [146]:
interaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   interaction_id    100000 non-null  object 
 1   customer_id       100000 non-null  object 
 2   channel           97998 non-null   object 
 3   interaction_type  97978 non-null   object 
 4   interaction_date  100000 non-null  object 
 5   duration          98037 non-null   float64
 6   page_or_product   98073 non-null   object 
 7   session_id        100000 non-null  object 
dtypes: float64(1), object(7)
memory usage: 6.1+ MB


In [147]:
interaction['interaction_date'] = pd.to_datetime(interaction['interaction_date'],errors='coerce')

In [148]:
interaction['channel'] = interaction['channel'].fillna(
    interaction['channel'].mode()[0]
)
interaction['interaction_type'] = interaction['interaction_type'].fillna(
    interaction['interaction_type'].mode()[0]
)
interaction['duration'] = interaction['duration'].fillna(
    interaction.groupby('interaction_type')['duration'].transform('median')
)
interaction['page_or_product'] = interaction['page_or_product'].fillna(
    interaction['page_or_product'].mode()[0]
)

In [149]:
interaction['int_year'] = interaction['interaction_date'].dt.year
interaction['int_month'] = interaction['interaction_date'].dt.month
interaction['int_weekday'] = interaction['interaction_date'].dt.day_name()
interaction['int_time'] = interaction['interaction_date'].dt.time

In [150]:
interaction['time_period'] = pd.cut(
    interaction['interaction_date'].dt.hour,
    bins=[0,6,12,18,24],
    labels=['Night','Morning','Afternoon','Evening'],
    right=False
)

### Add some more features to customer table based on transaction table

In [151]:
fav_page = (
    interaction.groupby(['customer_id','page_or_product'])
    .size()
    .reset_index(name='count')
    .sort_values(['customer_id','count'], ascending=[True, False])
    .drop_duplicates('customer_id')
    [['customer_id','page_or_product']]
)

customer = customer.merge(fav_page, on='customer_id', how='left')
customer.rename(columns={'page_or_product':'favorite_page_or_product'}, inplace=True)

customer['favorite_page_or_product'] = customer['favorite_page_or_product'].fillna('Unknown')

### Extracting cleaned tables

In [152]:
customer.to_csv('data/cleaned/customer_clean.csv', index=False)
ticket.to_csv('data/cleaned/ticket_clean.csv', index=False)
campaign.to_csv('data/cleaned/campaign_clean.csv', index=False)
transaction.to_csv('data/cleaned/transaction_clean.csv', index=False)
review.to_csv('data/cleaned/review_clean.csv', index=False)
interaction.to_csv('data/cleaned/interaction_clean.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'data\cleaned'