In [1]:
import pandas as pd

# Load the cleaned datasets
sales_df = pd.read_csv('cleaned_Sales.csv')
customers_df = pd.read_csv('cleaned_Customers.csv')

# Merge Sales and Customers datasets.
# NOTE: how='inner' keeps only the sales rows whose CustomerKey also appears in
# customers_df; sales rows without a matching customer are dropped silently.
merged_sales_customers = pd.merge(sales_df, customers_df, on='CustomerKey', how='inner')

# Save the merged dataset.
# The file name is capitalised so that it is exactly the path the later cells
# read back ('Merged_Sales_Customers.csv'). The previous lower-case 'merged_...'
# name only resolved on case-insensitive filesystems.
merged_sales_customers.to_csv('Merged_Sales_Customers.csv', index=False)

# Display the first few rows of the merged dataset
print(merged_sales_customers.head())


   Order Number  Line Item Order Date Delivery Date  CustomerKey  StoreKey  \
0        366001          1   1/1/2016     1/13/2016      1269051         0   
1        366001          2   1/1/2016     1/13/2016      1269051         0   
2        366002          1   1/1/2016     1/12/2016       266019         0   
3        366002          2   1/1/2016     1/12/2016       266019         0   
4        366002          3   1/1/2016     1/12/2016       266019         0   

   ProductKey  Quantity Currency Code  Gender            Name      City  \
0        1048         2           USD    Male  Frank Upchurch   Auberry   
1        2007         1           USD    Male  Frank Upchurch   Auberry   
2        1106         7           CAD  Female      Joan Roche  Red Deer   
3         373         1           CAD  Female      Joan Roche  Red Deer   
4        1080         4           CAD  Female      Joan Roche  Red Deer   

  State Code       State Zip Code        Country      Continent    Birthday  
0 

In [2]:
import pandas as pd

# Load the merged dataset
merged_sales_customers = pd.read_csv('Merged_Sales_Customers.csv')

# Check if there are any CustomerKey values in the Sales dataset not present in the Customers dataset.
# The test runs against the original sales_df rather than the merged frame: the merged
# frame comes from an inner join on CustomerKey, so every key it contains exists in
# customers_df by construction and checking it would always report zero.
missing_customer_keys = sales_df[~sales_df['CustomerKey'].isin(customers_df['CustomerKey'])]

# A non-zero count here means the inner join dropped that many sales rows.
print(f"Number of missing CustomerKey matches: {len(missing_customer_keys)}")
if not missing_customer_keys.empty:
    print(missing_customer_keys.head())


Number of missing CustomerKey matches: 0


In [3]:
# Verify that the customer attributes carried by the merged dataset agree with
# the Customers dataset. Both frames are reduced to the same four columns and
# left-merged on all of them; indicator=True adds a '_merge' column that marks
# each merged-dataset row as 'both' (an identical record exists in customers_df)
# or 'left_only' (no identical record exists).
detail_columns = ['CustomerKey', 'Name', 'City', 'Gender']
customer_details_check = pd.merge(
    merged_sales_customers[detail_columns],
    customers_df[detail_columns],
    on=detail_columns,
    how='left',
    indicator=True,
)

# Show the rows that have no exact counterpart in customers_df
unmatched_rows = customer_details_check['_merge'] == 'left_only'
print(customer_details_check.loc[unmatched_rows])


Empty DataFrame
Columns: [CustomerKey, Name, City, Gender, _merge]
Index: []


In [4]:
import pandas as pd

# Load the previously merged Sales and Customers dataset.
# This CSV is produced earlier in the notebook by DataFrame.to_csv, which writes
# UTF-8 by default, so it is read back with pandas' default (UTF-8) decoding,
# the same way the earlier validation cell reads it. Decoding it as ISO-8859-1
# would turn every non-ASCII character (e.g. accented names) into mojibake.
merged_sales_customers_df = pd.read_csv('Merged_Sales_Customers.csv')

# Load the Stores dataset
# NOTE(review): ISO-8859-1 is kept from the original; how cleaned_Stores.csv was
# written is not visible in this notebook - confirm its actual encoding.
stores_df = pd.read_csv('cleaned_Stores.csv', encoding='ISO-8859-1')

# Merge the Sales_Customers and Stores datasets.
# how='inner' keeps only rows whose StoreKey exists in stores_df. Column names
# present on both sides get pandas' default suffixes: '_x' for the
# sales/customers side and '_y' for the stores side (e.g. State_x / State_y).
merged_sales_customers_stores = pd.merge(merged_sales_customers_df, stores_df, on='StoreKey', how='inner')

# Display the first few rows of the merged dataset
print(merged_sales_customers_stores.head())

# Save the merged dataset to a CSV file for further use
merged_sales_customers_stores.to_csv('Merged_Sales_Customers_Stores.csv', index=False)


   Order Number  Line Item Order Date Delivery Date  CustomerKey  StoreKey  \
0        366001          1   1/1/2016     1/13/2016      1269051         0   
1        366001          2   1/1/2016     1/13/2016      1269051         0   
2        366002          1   1/1/2016     1/12/2016       266019         0   
3        366002          2   1/1/2016     1/12/2016       266019         0   
4        366002          3   1/1/2016     1/12/2016       266019         0   

   ProductKey  Quantity Currency Code  Gender  ... State Code     State_x  \
0        1048         2           USD    Male  ...         CA  California   
1        2007         1           USD    Male  ...         CA  California   
2        1106         7           CAD  Female  ...         AB     Alberta   
3         373         1           CAD  Female  ...         AB     Alberta   
4        1080         4           CAD  Female  ...         AB     Alberta   

  Zip Code      Country_x      Continent    Birthday Country_y State

In [5]:
# Check for duplicate rows in the merged dataset
# (duplicated() flags a row only when every column value repeats an earlier row)
duplicates = merged_sales_customers_stores.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Check for any missing values in the merged dataset (null count per column)
missing_values = merged_sales_customers_stores.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Ensure the merged dataset has exactly the expected columns.
# The '_x' / '_y' names are the suffixed versions of columns that exist in both
# merge inputs.
expected_columns = ['Order Number', 'Line Item', 'Order Date', 'Delivery Date', 'CustomerKey', 'StoreKey', 
                     'ProductKey', 'Quantity', 'Currency Code', 'Gender', 'Name', 'City', 'State Code',
                     'State_x', 'Zip Code', 'Country_x', 'Continent', 'Birthday', 'Country_y', 'State_y',
                     'Square Meters', 'Open Date']

actual_columns = merged_sales_customers_stores.columns.tolist()

# Expected columns that are absent from the merged dataset
missing_columns = [col for col in expected_columns if col not in actual_columns]
print(f"Missing columns: {missing_columns}")

# Columns present in the merged dataset that are not in the expected list.
# Previously only the "missing" direction was checked, so an extra column
# would have gone unnoticed despite the stated goal of this check.
unexpected_columns = [col for col in actual_columns if col not in expected_columns]
print(f"Unexpected columns: {unexpected_columns}")

# Display the first few rows of the dataset to inspect
print(merged_sales_customers_stores.head())


Number of duplicate rows: 0
Missing values in each column:
Order Number     0
Line Item        0
Order Date       0
Delivery Date    0
CustomerKey      0
StoreKey         0
ProductKey       0
Quantity         0
Currency Code    0
Gender           0
Name             0
City             0
State Code       0
State_x          0
Zip Code         0
Country_x        0
Continent        0
Birthday         0
Country_y        0
State_y          0
Square Meters    0
Open Date        0
dtype: int64
Missing columns: []
   Order Number  Line Item Order Date Delivery Date  CustomerKey  StoreKey  \
0        366001          1   1/1/2016     1/13/2016      1269051         0   
1        366001          2   1/1/2016     1/13/2016      1269051         0   
2        366002          1   1/1/2016     1/12/2016       266019         0   
3        366002          2   1/1/2016     1/12/2016       266019         0   
4        366002          3   1/1/2016     1/12/2016       266019         0   

   ProductKey  Quanti

In [6]:
import pandas as pd

# Load the datasets.
# Merged_Sales_Customers_Stores.csv is written by this notebook with
# DataFrame.to_csv, which encodes as UTF-8 by default, so it is read back with
# pandas' default (UTF-8) decoding. Decoding it as ISO-8859-1 would corrupt any
# non-ASCII characters it contains.
merged_sales_customers_stores = pd.read_csv('Merged_Sales_Customers_Stores.csv')
# NOTE(review): ISO-8859-1 is kept from the original; how cleaned_Products.csv was
# written is not visible in this notebook - confirm its actual encoding.
products_df = pd.read_csv('cleaned_Products.csv', encoding='ISO-8859-1')

# Merge Sales_Customers_Stores with Products datasets.
# how='inner' keeps only rows whose ProductKey exists in products_df.
merged_all = pd.merge(merged_sales_customers_stores, products_df, on='ProductKey', how='inner')

# Display the first few rows of the merged dataset
print(merged_all.head())

# Save the merged dataset to a CSV file for further use
merged_all.to_csv('Merged_All.csv', index=False)


   Order Number  Line Item Order Date Delivery Date  CustomerKey  StoreKey  \
0        366001          1   1/1/2016     1/13/2016      1269051         0   
1        366001          2   1/1/2016     1/13/2016      1269051         0   
2        366002          1   1/1/2016     1/12/2016       266019         0   
3        366002          2   1/1/2016     1/12/2016       266019         0   
4        366002          3   1/1/2016     1/12/2016       266019         0   

   ProductKey  Quantity Currency Code  Gender  ...   Open Date  \
0        1048         2           USD    Male  ...  2010-01-01   
1        2007         1           USD    Male  ...  2010-01-01   
2        1106         7           CAD  Female  ...  2010-01-01   
3         373         1           CAD  Female  ...  2010-01-01   
4        1080         4           CAD  Female  ...  2010-01-01   

                             Product Name            Brand   Color  \
0         A. Datum SLR Camera X136 Silver         A. Datum  Silv

In [7]:
import pandas as pd

# Load the merged dataset.
# Merged_All.csv is written by this notebook with DataFrame.to_csv, which
# encodes as UTF-8 by default, so it is read back with pandas' default (UTF-8)
# decoding. Decoding it as ISO-8859-1 would corrupt any non-ASCII characters.
merged_all_df = pd.read_csv('Merged_All.csv')

# Check for any missing values in the dataset (null count per column)
missing_values = merged_all_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Check for duplicate rows
# (duplicated() flags a row only when every column value repeats an earlier row)
duplicate_rows = merged_all_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# Check if all expected columns are present.
# The '_x' / '_y' names are the suffixed versions of columns that exist in both
# inputs of the earlier Stores merge.
expected_columns = ['Order Number', 'Line Item', 'Order Date', 'Delivery Date', 'CustomerKey', 'StoreKey', 'ProductKey', 'Quantity', 'Currency Code', 'Gender', 'Name', 'City', 'State Code', 'State_x', 'Zip Code', 'Country_x', 'Continent', 'Birthday', 'Country_y', 'State_y', 'Square Meters', 'Open Date', 'Product Name', 'Brand', 'Color', 'Unit Cost USD', 'Unit Price USD', 'SubcategoryKey', 'Subcategory', 'CategoryKey', 'Category']
missing_columns = [col for col in expected_columns if col not in merged_all_df.columns]
print("Missing columns:")
print(missing_columns)


Missing values in each column:
Order Number      0
Line Item         0
Order Date        0
Delivery Date     0
CustomerKey       0
StoreKey          0
ProductKey        0
Quantity          0
Currency Code     0
Gender            0
Name              0
City              0
State Code        0
State_x           0
Zip Code          0
Country_x         0
Continent         0
Birthday          0
Country_y         0
State_y           0
Square Meters     0
Open Date         0
Product Name      0
Brand             0
Color             0
Unit Cost USD     0
Unit Price USD    0
SubcategoryKey    0
Subcategory       0
CategoryKey       0
Category          0
dtype: int64
Number of duplicate rows: 0
Missing columns:
[]
