## Part 1

In [1]:
# Step 3: Import necessary libraries
import os

import numpy as np
import pandas as pd
# The plotting functions (plt.figure, plt.show, ...) live in the pyplot
# submodule; aliasing the bare `matplotlib` package as `plt` does not expose them.
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute location of the raw customer CSV on the author's machine
customer_data_path = r'C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\4.9_Intro to data visualization with Python\customers.csv'

# Load the customer data into a pandas DataFrame
df_customers = pd.read_csv(customer_data_path)

# Preview the first rows to confirm the file parsed as expected
print(df_customers.head())

   user_id First Name    Surnam  Gender       STATE  Age date_joined  \
0    26711    Deborah  Esquivel  Female    Missouri   48    1/1/2017   
1    33890   Patricia      Hart  Female  New Mexico   36    1/1/2017   
2    65803    Kenneth    Farley    Male       Idaho   35    1/1/2017   
3   125935   Michelle     Hicks  Female        Iowa   40    1/1/2017   
4   130797        Ann   Gilmore  Female    Maryland   26    1/1/2017   

   n_dependants fam_status  income  
0             3    married  165665  
1             0     single   59285  
2             2    married   99568  
3             0     single   42049  
4             1    married   40374  


In [2]:
# Step 4: Renaming columns to follow consistent naming conventions
#
# Only columns whose names actually change are listed. 'date_joined' and
# 'income' already follow the snake_case convention in the raw file, so they
# need no entry (the previous 'data_joined' key was a typo that matched no
# column and was silently ignored by rename).
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'surname',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'num_dependants',
    'fam_status': 'family_status',
}

# Assign the result instead of mutating in place; keys that are already
# renamed are ignored by default, so re-running this cell is a harmless no-op.
df_customers = df_customers.rename(columns=column_renames)

print(df_customers.head())

   user_id first_name   surname  gender       state  age date_joined  \
0    26711    Deborah  Esquivel  Female    Missouri   48    1/1/2017   
1    33890   Patricia      Hart  Female  New Mexico   36    1/1/2017   
2    65803    Kenneth    Farley    Male       Idaho   35    1/1/2017   
3   125935   Michelle     Hicks  Female        Iowa   40    1/1/2017   
4   130797        Ann   Gilmore  Female    Maryland   26    1/1/2017   

   num_dependants family_status  income  
0               3       married  165665  
1               0        single   59285  
2               2       married   99568  
3               0        single   42049  
4               1       married   40374  


In [3]:
# Count the null entries in every column of the customer data
missing_values = df_customers.isna().sum(axis=0)
print("Missing Values in each column:\n", missing_values)

Missing Values in each column:
 user_id               0
first_name        11259
surname               0
gender                0
state                 0
age                   0
date_joined           0
num_dependants        0
family_status         0
income                0
dtype: int64


In [4]:
# Drop both name columns: 'first_name' has 11,259 missing values, and neither
# 'first_name' nor 'surname' is used as a key in the later merge (which joins
# on 'user_id'), so they may not be needed.
# errors='ignore' lets this cell be re-run after the columns are already gone
# instead of raising KeyError.
df_customers = df_customers.drop(columns=['first_name', 'surname'], errors='ignore')

print(df_customers.columns)

Index(['user_id', 'gender', 'state', 'age', 'date_joined', 'num_dependants',
       'family_status', 'income'],
      dtype='object')


In [5]:
# Flag every row that exactly repeats an earlier row, then collect those rows
duplicate_mask = df_customers.duplicated()
duplicate_rows = df_customers.loc[duplicate_mask]
# Show whatever was found (an empty frame means no duplicates)
print("Duplicate rows found:\n", duplicate_rows)

Duplicate rows found:
 Empty DataFrame
Columns: [user_id, gender, state, age, date_joined, num_dependants, family_status, income]
Index: []


In [8]:
# Report, column by column, whether the values span more than one Python type
for col in df_customers.columns:
    # Number of distinct Python types present among this column's values
    n_types = df_customers[col].apply(type).nunique()
    if n_types <= 1:
        print(f"No mixed types in column '{col}'")
        continue
    print(f"Mixed types found in column '{col}'")

No mixed types in column 'user_id'
No mixed types in column 'gender'
No mixed types in column 'state'
No mixed types in column 'age'
No mixed types in column 'date_joined'
No mixed types in column 'num_dependants'
No mixed types in column 'family_status'
No mixed types in column 'income'


In [11]:
# Location of the prepared orders/products pickle on the author's machine
orders_products_path = r'C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\Prepared Data\orders_products_with_flags.pkl'

# Read the pickled DataFrame back into memory
orders_products_with_flags = pd.read_pickle(orders_products_path)

# Preview the first five rows as a sanity check on the load
print(orders_products_with_flags.head(n=5))

   order_id  user_id eval_set  order_number  orders_day_of_week  \
0         2   202279    prior             3                   5   
1         2   202279    prior             3                   5   
2         2   202279    prior             3                   5   
3         2   202279    prior             3                   5   
4         2   202279    prior             3                   5   

   order_hour_of_day  days_since_prior_order  product_id  add_to_cart_order  \
0                  9                     8.0     33120.0                1.0   
1                  9                     8.0     28985.0                2.0   
2                  9                     8.0      9327.0                3.0   
3                  9                     8.0     45918.0                4.0   
4                  9                     8.0     30035.0                5.0   

   reordered  ...        price_range    price_range_loc  busiest_days  \
0        1.0  ...  Mid-range product  Mid-range p

In [12]:
# Re-inspect the cleaned customer frame before merging
print(df_customers.head(n=5))

   user_id  gender       state  age date_joined  num_dependants family_status  \
0    26711  Female    Missouri   48    1/1/2017               3       married   
1    33890  Female  New Mexico   36    1/1/2017               0        single   
2    65803    Male       Idaho   35    1/1/2017               2       married   
3   125935  Female        Iowa   40    1/1/2017               0        single   
4   130797  Female    Maryland   26    1/1/2017               1       married   

   income  
0  165665  
1   59285  
2   99568  
3   42049  
4   40374  


In [14]:
# Step 6
# Cast the join key to the same integer dtype on both sides so the merge
# compares like with like.
df_customers['user_id'] = df_customers['user_id'].astype(int)
orders_products_with_flags['user_id'] = orders_products_with_flags['user_id'].astype(int)

# Inner join of order lines with customer attributes on 'user_id'; rows whose
# user_id is missing from either frame are excluded from the result.
# validate='many_to_one' makes pandas raise MergeError if a user_id appears
# more than once in df_customers, which would otherwise silently duplicate
# order rows (the earlier duplicate check only covers fully identical rows).
combined_data = pd.merge(
    orders_products_with_flags,
    df_customers,
    on='user_id',
    how='inner',
    validate='many_to_one',
)

# Check the first few rows of the combined DataFrame
print(combined_data.head())


   order_id  user_id eval_set  order_number  orders_day_of_week  \
0         2   202279    prior             3                   5   
1         2   202279    prior             3                   5   
2         2   202279    prior             3                   5   
3         2   202279    prior             3                   5   
4         2   202279    prior             3                   5   

   order_hour_of_day  days_since_prior_order  product_id  add_to_cart_order  \
0                  9                     8.0     33120.0                1.0   
1                  9                     8.0     28985.0                2.0   
2                  9                     8.0      9327.0                3.0   
3                  9                     8.0     45918.0                4.0   
4                  9                     8.0     30035.0                5.0   

   reordered  ... spending_flag  median_days_since_prior_order  \
0        1.0  ...   Low spender                         

In [15]:
# Step 8: Export dataframe as pickle file
# Destination of the merged dataset inside the "Prepared Data" folder
combined_data_path = r'C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\Prepared Data\combined_data.pkl'

# Serialize the merged DataFrame to that location
combined_data.to_pickle(path=combined_data_path)

# Report where the file was written
print(f"Data successfully exported to {combined_data_path}")

Data successfully exported to C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\Prepared Data\combined_data.pkl
