### Setup
----
##### !! NOTE: this is for the creation of the master file for project assignment 3 - sok1005!!
To run this code you need the data from [chicagobooth](https://www.chicagobooth.edu/research/kilts/datasets/dominicks), specifically the data for shampoo.

The files needed are:

**Customer Count File** - (ccount(stata).zip)

**Store-Level Demographics File** - (demo(stata).zip)

And from the **category file** you need to find **shampoo**, and download **UPC.csv File** and **Movement.csv File**

And all of this needs to be saved in a folder called **"data"** in the same folder you are running this script from, running the **"create_folders" function** will automatically create the folder

In [80]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import requests
import os
import chardet
import time
import csv
import gc
import re

from datetime import datetime, timedelta
from functools import reduce

import seaborn as sns

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.ticker as mticker
import matplotlib.pylab as pylab

import sympy as sp
from sympy.solvers import solve

from bs4 import BeautifulSoup
from collections import Counter

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from cycler import cycler

# Custom plot style: a dark theme applied globally by updating matplotlib's rcParams
params  = {
# Line defaults
"lines.linewidth": 1.5,

# NOTE: boolean-like options in this dict are given as the string "true"
"legend.fancybox": "true",

# Axes: custom colour cycle for plotted series, dark face/edge colour, grid switched on
"axes.prop_cycle": cycler('color', ["#ffa822","#1ac0c6","#ff6150","#30B66A","#B06AFF","#FF21E1"]),
"axes.facecolor": "#2b2b2b",
"axes.axisbelow": "true",
"axes.grid": "true",
"axes.edgecolor": "#2b2b2b",
"axes.linewidth": 0.5,
"axes.labelpad": 0,

# Patch outlines use the same dark colour as the axes background
"patch.edgecolor": "#2b2b2b",
"patch.linewidth": 0.5,

# Thin dashed grid lines
"grid.linestyle": "--",
"grid.linewidth": 0.5,
"grid.color": "#b8aba7",

# Tick marks have zero length on both axes (major and minor)
"xtick.major.size": 0,
"xtick.minor.size": 0,
"ytick.major.size": 0,
"ytick.minor.size": 0,

# Fonts and text/label/tick colours
"font.family":"monospace",
"font.size":10.0,
"text.color": "#FFE9E3",
"axes.labelcolor": "#b8aba7",
"xtick.color": "#b8aba7",
"ytick.color": "#b8aba7",

# Saved figures keep the dark background
"savefig.edgecolor": "#2b2b2b",
"savefig.facecolor": "#2b2b2b",

# Subplot margins and figure background
"figure.subplot.left": 0.08,
"figure.subplot.right": 0.95,
"figure.subplot.bottom": 0.09,
"figure.facecolor": "#2b2b2b"}

# Apply the style to every figure created after this point
pylab.rcParams.update(params)
# Marker printed so it is visible that the setup cell ran to the end
print("finish")

finish


In [81]:
# Collect files from folder, if file type equals file_type
def get_files(folder, file_type):
    """Return ``[path, file_type]`` pairs for the matching files in *folder*.

    Args:
        folder: Directory whose entries are listed (not recursive).
        file_type: Suffix an entry name must end with, e.g. ``".csv"``.

    Returns:
        A list of two-element lists ``[os.path.join(folder, name), file_type]``,
        ordered by file name.
    """
    # os.listdir() returns entries in arbitrary order. The result is sorted so
    # that callers which pick files by position get the same file on every
    # machine and every run.
    return [
        [os.path.join(folder, entry), file_type]
        for entry in sorted(os.listdir(folder))
        if entry.endswith(file_type)
    ]

def get_encoder(file_path, chunksize = 10_000):
    """Guess the text encoding of a file from its first bytes.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.
        chunksize: Number of leading bytes handed to ``chardet.detect``.

    Returns:
        The ``'encoding'`` entry of the detection result.
    """
    with open(file_path, 'rb') as raw:
        sample = raw.read(chunksize)
    detection = chardet.detect(sample)
    return detection['encoding']

# NOTE TO SELF, STRANGE ERROR HERE, MIGHT BE A DUPLICATE ERROR
# Load, filter in chunks and convert to csv
def load_and_filter_file(input_file, temp_path, filter_func:list, file_type=".csv", chunksize=10_000, new_file_name = ""):
    """Stream *input_file* through a list of filters and write the result as CSV.

    The input is read ``chunksize`` rows at a time; every chunk is passed
    through each callable in ``filter_func`` (in order) and appended to the
    output file, so filters only ever see one chunk at a time.

    Args:
        input_file: Path of the ``.csv`` or ``.dta`` file to read.
        temp_path: Folder the output CSV is written to.
        filter_func: Callables taking and returning a DataFrame.
        file_type: ``".csv"`` or ``".dta"``; the leading dot is optional and
            the comparison is case-insensitive.
        chunksize: Rows per chunk.
        new_file_name: Output file name. When empty, ``<input stem>_temp.csv``
            is used.

    Raises:
        ValueError: If ``file_type`` is not supported, or if every attempted
            encoding failed with a ``UnicodeDecodeError``.
    """
    # Validate once, before any file is touched. Accepting "csv" as well as
    # ".csv" makes the function agree with its own error message.
    kind = file_type.lower().lstrip(".")
    if kind not in ("csv", "dta"):
        raise ValueError("Unsupported file type. Supported types are 'csv' and 'dta'.")

    if new_file_name == "":
        # Derive "<input stem>_temp.csv" from the input path
        stem = os.path.splitext(os.path.basename(input_file))[0]
        temp_file = os.path.join(temp_path, f"{stem}_temp.csv")
    else:
        temp_file = os.path.join(temp_path, new_file_name)

    # Had to fix the decoding because of an 'invalid continuation byte' that utf-8 can't decode.
    # The encodings are tried in this order; only the CSV reader uses them.
    encodings = ["utf-8", "ISO-8859-1", "cp1252", "latin1"]

    for encoding in encodings:
        try:
            if kind == "csv":
                reader = pd.read_csv(input_file, chunksize=chunksize, encoding=encoding)
            else:
                reader = pd.read_stata(input_file, chunksize=chunksize)

            # Chunks are decoded lazily, so the whole loop has to sit inside the try
            for i, chunk in enumerate(reader):
                filtered_chunk = chunk
                for func in filter_func:
                    filtered_chunk = func(filtered_chunk)
                # The first chunk (re)creates the file with a header, later chunks
                # append; a retry with another encoding therefore starts clean.
                first = i == 0
                filtered_chunk.to_csv(temp_file, index=False, mode='w' if first else 'a', header=first)
        except UnicodeDecodeError:
            print(f"Failed to read the file with encoding '{encoding}', trying the next one...")
            continue

        print(f"Succes with the encoding '{encoding}', file {temp_file} now created")
        return

    raise ValueError("None of the attempted encodings were successful in reading the file.")
        
def load_and_filter_file_full_load(input_file, temp_path, filter_func:list, file_type=".csv", new_file_name = ""):
    """Load *input_file* completely, run the filters once and write a CSV.

    Unlike a chunked read, every filter sees the whole table, which matters
    for filters that aggregate across rows.

    Args:
        input_file: Path of the ``.csv`` or ``.dta`` file to read.
        temp_path: Folder the output CSV is written to.
        filter_func: Callables taking and returning a DataFrame, applied in order.
        file_type: ``".csv"`` or ``".dta"``; the leading dot is optional and
            the comparison is case-insensitive.
        new_file_name: Output file name. When empty, ``<input stem>_temp.csv``
            is used.

    Raises:
        ValueError: If ``file_type`` is not supported, or if every attempted
            encoding failed with a ``UnicodeDecodeError``.
    """
    # Validate once, before any file is touched. Accepting "csv" as well as
    # ".csv" makes the function agree with its own error message.
    kind = file_type.lower().lstrip(".")
    if kind not in ("csv", "dta"):
        raise ValueError("Unsupported file type. Supported types are 'csv' and 'dta'.")

    if new_file_name == "":
        # Derive "<input stem>_temp.csv" from the input path
        stem = os.path.splitext(os.path.basename(input_file))[0]
        temp_file = os.path.join(temp_path, f"{stem}_temp.csv")
    else:
        temp_file = os.path.join(temp_path, new_file_name)

    # Had to fix the decoding because of an 'invalid continuation byte' that utf-8 can't decode.
    # The encodings are tried in this order; only the CSV reader uses them.
    encodings = ["utf-8", "ISO-8859-1", "cp1252", "latin1"]

    for encoding in encodings:
        try:
            if kind == "csv":
                df = pd.read_csv(input_file, encoding=encoding)
            else:
                df = pd.read_stata(input_file)

            for func in filter_func:
                df = func(df)

            df.to_csv(temp_file, index=False)
        except UnicodeDecodeError:
            print(f"Failed to read the file with encoding '{encoding}', trying the next one...")
            continue

        print(f"Succes with the encoding '{encoding}', file {temp_file} now created")
        return

    raise ValueError("None of the attempted encodings were successful in reading the file.")

            
# Merge csv files
def merge_csv_files(file1, file2, output_file, merge_on= None, merge_dtype=None, chunksize =10000):
    """Merge two CSV files and write the result to *output_file*.

    ``file2`` is loaded completely; ``file1`` is streamed ``chunksize`` rows at
    a time and each chunk is merged against ``file2`` with ``pd.merge``
    (default inner join) on the ``merge_on`` column(s).

    Args:
        file1: Path of the (large) left CSV, read in chunks.
        file2: Path of the right CSV, held in memory.
        output_file: Path of the CSV to create (overwritten if it exists).
        merge_on: Column name, or list/tuple of column names, to join on.
            When ``None`` or empty, each chunk is concatenated with ``file2``
            column-wise (``pd.concat(..., axis=1)``) instead of merged.
        merge_dtype: Optional dtype the join columns of both inputs are cast
            to before merging, so that keys compare equal.
        chunksize: Rows of ``file1`` per chunk.
    """
    # Normalise merge_on to a list of column names. A tuple is treated like a
    # list, and None becomes "no keys" so the dtype casts below are skipped
    # instead of failing on iteration.
    if merge_on is None:
        keys = []
    elif isinstance(merge_on, (list, tuple)):
        keys = list(merge_on)
    else:
        keys = [merge_on]

    # Read the entire file2 into memory
    file2_data = pd.read_csv(file2)
    if merge_dtype is not None:
        for column in keys:
            file2_data[column] = file2_data[column].astype(merge_dtype)

    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        for index, chunk1 in enumerate(pd.read_csv(file1, chunksize=chunksize)):
            if merge_dtype is not None:
                for column in keys:
                    chunk1[column] = chunk1[column].astype(merge_dtype)

            if keys:
                merged_chunk = pd.merge(chunk1, file2_data, on=keys)
            else:
                merged_chunk = pd.concat([chunk1, file2_data], axis=1)

            # Write the whole chunk in one call; only the first chunk carries the header
            merged_chunk.to_csv(f_out, index=False, header=(index == 0))
                

                    
# Run time test function
def time_function(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, print how long it took and return its result.

    The duration is wall-clock time measured with ``time.time()`` and printed
    in seconds with two decimals.
    """
    started = time.time()
    outcome = func(*args, **kwargs)
    elapsed_time = time.time() - started
    print(f"{func.__name__} took {elapsed_time:.2f} seconds to run.")
    return outcome

# Folder check and creation
def create_folders(folder_paths):
    """Create every folder in *folder_paths* that does not exist yet.

    One line is printed per path, saying whether the folder was created or
    was already there. Missing parent folders are created as well
    (``os.makedirs``).
    """
    for target in folder_paths:
        if os.path.exists(target):
            print(f"Folder already exists: {target}")
            continue
        os.makedirs(target)
        print(f"Folder created: {target}")

In [82]:
# List of filters
def empty_filter_func(chunk):
    """No-op filter used for testing: hand the chunk back untouched."""
    return chunk

def filter_empty_to_0_and_dropna(chunk):
    """Zero-fill numeric gaps, blank out empty strings, then drop incomplete rows.

    Numeric columns get ``''`` and NaN replaced by 0; every other column gets
    ``''`` replaced by NaN. The columns of *chunk* are overwritten in place,
    and the returned frame is *chunk* without the rows that still hold a NaN.
    """
    for name in chunk.columns:
        column = chunk[name]
        if pd.api.types.is_numeric_dtype(column):
            cleaned = column.replace('', 0).fillna(0)
        else:
            cleaned = column.replace('', np.nan)
        chunk[name] = cleaned

    # Only non-numeric columns can still contain NaN at this point
    return chunk.dropna()

def filter_remove_empty_and_nan(chunk):
    """Return *chunk* without any row that holds an empty string or a NaN."""
    # Empty strings are turned into NaN first so one dropna() removes both
    return chunk.replace('', np.nan).dropna()

def filter_header_up(chunk):
    """Upper-case every column name of *chunk* in place and return *chunk*."""
    chunk.columns = [str.upper(label) for label in chunk.columns]
    return chunk

def filter_week(chunk, start_week=173, end_week=224):
    """Keep only the rows whose WEEK lies in ``[start_week, end_week]``.

    The defaults select the year 1993. According to the comments carried over
    from the Dominick's Manual and Codebook (Part 8: Week's Decode Table):
        173 = 12/31/92 to 01/06/93
        224 = 12/23/93 to 12/29/93

    Args:
        chunk: DataFrame with a ``WEEK`` column convertible to int.
        start_week: First week to keep (inclusive).
        end_week: Last week to keep (inclusive).

    Returns:
        A filtered copy with ``WEEK`` cast to int; *chunk* itself is not modified.
    """
    # Work on a copy so the cast below never writes into the caller's frame
    chunk_copy = chunk.copy()
    chunk_copy['WEEK'] = chunk_copy['WEEK'].astype(int)

    # Series.between is inclusive on both ends by default
    return chunk_copy[chunk_copy['WEEK'].between(start_week, end_week)]

def filter_compute_revenue(chunk):
    """Return *chunk* with a ``REVENUE`` column equal to ``PRICE * MOVE``.

    ``assign`` builds a new frame instead of writing into *chunk*. This
    function typically receives the output of a row filter, where an in-place
    column assignment triggers pandas' SettingWithCopyWarning.
    """
    return chunk.assign(REVENUE=chunk['PRICE'] * chunk['MOVE'])

def filter_move_above_one(chunk):
    """Keep the rows with a positive ``MOVE``.

    NOTE: despite the name, the threshold is ``MOVE > 0``, not ``MOVE > 1``.
    """
    has_sales = chunk['MOVE'] > 0
    return chunk[has_sales]

def filter_out_bad_data(chunk):
    """Keep only the rows whose ``OK`` value is greater than 0."""
    is_ok = chunk['OK'] > 0
    return chunk[is_ok]

def filter_keep_columns(chunk, columns_to_keep=None):
    """Return only the selected columns of *chunk*.

    Args:
        chunk: DataFrame to select from.
        columns_to_keep: Column labels to keep, in output order. Defaults to
            ``["STORE", "COSMETIC", "HABA", "PHARMACY", "WEEK"]``.
    """
    # None sentinel instead of a list default: a list in the signature is one
    # shared object that any caller could mutate for all later calls.
    if columns_to_keep is None:
        columns_to_keep = ["STORE", "COSMETIC", "HABA", "PHARMACY", "WEEK"]
    return chunk[columns_to_keep]

def remove_column(chunk):
    """Drop the ``GINI``, ``LIFT5`` and ``RATIO5`` columns where they exist.

    Columns that are absent are skipped silently; when none of them is
    present the very same object is returned.
    """
    unwanted = ("GINI", "LIFT5", "RATIO5")
    for label in unwanted:
        if label not in chunk.columns:
            continue
        chunk = chunk.drop(columns=label)
    return chunk

def combine_same_week_and_store(chunk):
    """Sum all rows of *chunk* that share the same ``WEEK`` and ``STORE``.

    Only the rows inside *chunk* are combined; the result has one row per
    (WEEK, STORE) pair, with both keys back as ordinary columns.
    """
    per_week_store = chunk.groupby(["WEEK", "STORE"])
    totals = per_week_store.sum()
    return totals.reset_index()

def filter_remove_symb_in_description(chunk):
    """Strip symbols from ``DESCRIP`` while keeping letters, digits, spaces and '&'.

    The previous placeholder trick (swap '&' for a marker word, strip symbols,
    swap back) also stripped the spaces around the marker, so the marker was
    never swapped back and all words were fused together. Excluding '&' and
    the space from the stripped character class avoids the round trip.

    The ``DESCRIP`` column of *chunk* is overwritten in place and *chunk* is
    returned. Missing values stay missing.
    """
    cleaned = chunk["DESCRIP"].str.replace(r"[^\w &]", "", regex=True)
    # Removing symbols can leave runs of spaces behind; collapse them
    chunk["DESCRIP"] = cleaned.str.replace(r" +", " ", regex=True)
    return chunk

"""
def filter_clean_text_data(chunk, column = "DESCRIP"): #Note: one varible still has double spaces (!it might not be spaces!)
    # Convert to uppercase, remove symbols, and remove extra spaces
    chunk[column] = (chunk[column].str.upper()
                                  .str.replace(r"\W+", " ", regex=True)
                                  .str.replace(r"\s{2,}", " ", regex=True)
                                  .str.strip()
                                  .str.replace(r"(?<!\S)\S(?!\S)", "", regex=True))
    return chunk
"""
def filter_clean_text_data(chunk, column = "DESCRIP"):
    """Normalise a text column to upper-case words separated by single spaces.

    Steps, in order: upper-case; replace non-printable characters (e.g.
    non-breaking spaces) with a space; replace every run of non-word
    characters with a space; blank out single standalone characters; collapse
    repeated spaces; strip the ends. The column is overwritten in *chunk*,
    which is also returned.
    """
    text = chunk[column].str.upper()
    # Non-breaking spaces and other invisible characters become plain spaces
    text = text.apply(lambda value: ''.join(ch if ch.isprintable() else ' ' for ch in value))
    # Symbols / punctuation / whitespace runs -> one space
    text = text.str.replace(r"\W+", " ", regex=True)
    # Single standalone characters -> space
    text = text.str.replace(r"\b\w\b", " ", regex=True)
    # The previous step leaves runs of spaces behind; squeeze them to one
    text = text.str.replace(r" {2,}", " ", regex=True)
    chunk[column] = text.str.strip()
    return chunk

In [83]:
folder_path = "data/" # folder with the raw input data
folder_path_temp = "data_temp/" # folder with intermediate (temp) data
folder_path_clean = "data_clean/" # folder with the cleaned output data
chunk_size = 10_000 # rows of data loaded into memory per chunk

# Make sure the three working folders exist before anything is read or written
create_folders([folder_path, folder_path_temp,folder_path_clean])

# Collect the raw inputs: all .csv files first, then all .dta files.
# NOTE(review): the cleaning cells below pick files by position
# (file_paths[0] .. file_paths[3]), so they depend on the order of this list.
file_paths = get_files(folder_path,".csv")
file_paths.extend(get_files(folder_path,".dta"))

print(file_paths)
print("\n! Remember the raw data needs to be in the data folder !")

Folder already exists: data/
Folder already exists: data_temp/
Folder already exists: data_clean/
[['data/upcsha.csv', '.csv'], ['data/wsha.csv', '.csv'], ['data/ccount.dta', '.dta'], ['data/demo.dta', '.dta']]

! Remember the raw data needs to be in the data folder !


### Clean Data
----

#### Clean upc data

In [84]:
# UPC file (file_paths[0]): upper-case the headers, drop incomplete rows, normalise the
# DESCRIP text, then drop incomplete rows again (descriptions that became empty after cleaning).
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_clean_text_data,filter_remove_empty_and_nan]
# The result is written to <input name>_temp.csv inside folder_path_temp
time_function(load_and_filter_file, file_paths[0][0], folder_path_temp, filter_func_list, file_type=file_paths[0][1], chunksize = chunk_size)

Failed to read the file with encoding 'utf-8', trying the next one...
Succes with the encoding 'ISO-8859-1', file data_temp/upcsha_temp.csv now created
load_and_filter_file took 0.75 seconds to run.


In [85]:
# Brand probability: derive a BRAND label for every UPC from its description.
# A brand is the first word of the description plus the word that most often
# follows it among all descriptions starting with that same first word.

# Load the cleaned UPC table from its CSV file
file_path_ups_temp = "data_temp/upcsha_temp.csv"
df_upc = pd.read_csv(file_path_ups_temp)

# Split each product description into words
df_upc['words'] = df_upc['DESCRIP'].str.split()

# Get the first word as potential brand name
df_upc['potential_brand'] = df_upc['words'].apply(lambda x: x[0] if x else '')

# Final brand name per potential brand
brand_names = {}

for potential_brand in df_upc['potential_brand'].unique():
    # All word lists of the descriptions that start with this potential brand
    descriptions = df_upc.loc[df_upc['potential_brand'] == potential_brand, 'words']

    # Count the frequency of each word in these descriptions (excluding the first word)
    word_counts = Counter(word for desc in descriptions for word in desc[1:])

    # Most frequent follow-up word, or '' when every description is a single word
    most_common_word = word_counts.most_common(1)[0][0] if word_counts else ''

    # Join only the non-empty parts, so a single-word brand does not end up
    # with a trailing space ("BRAND " instead of "BRAND").
    brand_names[potential_brand] = ' '.join(
        part for part in (potential_brand, most_common_word) if part
    )

# Map potential brand names to final brand names in the DataFrame
df_upc['BRAND'] = df_upc['potential_brand'].map(brand_names)

# Save the updated DataFrame back to the same CSV file (helper columns included)
df_upc.to_csv(file_path_ups_temp, index=False)

#### Clean walk data

In [86]:
# Movement file (file_paths[1]): upper-case the headers, drop incomplete rows, keep the
# selected weeks, keep rows with MOVE > 0 and OK > 0, then add the REVENUE column.
# Every filter here works row by row, so processing the file in chunks is safe.
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_week,filter_move_above_one,filter_out_bad_data,filter_compute_revenue]
time_function(load_and_filter_file, file_paths[1][0], folder_path_temp, filter_func_list, file_type=file_paths[1][1], chunksize = chunk_size)

Succes with the encoding 'utf-8', file data_temp/wsha_temp.csv now created
load_and_filter_file took 39.28 seconds to run.


#### Clean customer count data

In [87]:
# Customer count file (file_paths[2]): upper-case the headers, drop incomplete rows, keep the
# selected weeks and columns, then sum the rows per (WEEK, STORE).
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_week,filter_keep_columns, combine_same_week_and_store]
# combine_same_week_and_store aggregates across rows, so it has to see the whole table.
# With the chunked loader it ran once per chunk: a (WEEK, STORE) group split across two
# chunks was written as two partial rows, which then duplicated sales rows in the later
# merge on STORE/WEEK. The full-load variant applies the filters to the complete file.
time_function(load_and_filter_file_full_load, file_paths[2][0], folder_path_temp, filter_func_list, file_type=file_paths[2][1])

Succes with the encoding 'utf-8', file data_temp/ccount_temp.csv now created
load_and_filter_file took 4.54 seconds to run.


#### Clean demo data

In [88]:
# Demographics file (file_paths[3]): upper-case the headers, drop the GINI/LIFT5/RATIO5
# columns, fill numeric gaps with 0 and drop rows that still have missing values.
filter_func_list = [filter_header_up, remove_column,filter_empty_to_0_and_dropna]
time_function(load_and_filter_file, file_paths[3][0], folder_path_temp, filter_func_list, file_type=file_paths[3][1], chunksize = chunk_size)

Succes with the encoding 'utf-8', file data_temp/demo_temp.csv now created
load_and_filter_file took 0.83 seconds to run.


### Merge Data
----

In [89]:
# Step 1: attach the UPC table (descriptions, brand) to the movement rows.
# Join key is UPC, cast to int64 on both sides; merge_csv_files uses pd.merge's default inner join.
merge_file_main = "data_temp/wsha_temp.csv"
merge_file_sec = "data_temp/upcsha_temp.csv"
merge_file_out = "data_temp/wsha_upcsha.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on="UPC", merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 6.22 seconds to run.


In [90]:
# Step 2: attach the customer-count columns to the result of step 1.
# Join keys are STORE and WEEK, both cast to int64 on both sides.
merge_file_main = "data_temp/wsha_upcsha.csv"
merge_file_sec = "data_temp/ccount_temp.csv"
merge_file_out = "data_temp/wsha_upcsha_ccount.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on=["STORE","WEEK"], merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 6.53 seconds to run.


In [91]:
# Step 3: attach the store-level demographics to the result of step 2.
# Join key is STORE, cast to int64 on both sides.
merge_file_main = "data_temp/wsha_upcsha_ccount.csv"
merge_file_sec = "data_temp/demo_temp.csv"
merge_file_out = "data_temp/wsha_upcsha_ccount_demo.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on="STORE", merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 147.62 seconds to run.


### Clean Merge Data and Select Relevant Column
----

In [92]:
# Columns carried over into the clean sales file, in output order
selected_columns = [
    # identifiers
    "STORE", "WEEK", "CITY", "ZIP", "BRAND",
    # sales measures
    "MOVE", "PRICE", "QTY", "REVENUE", "PROFIT", "CASE",
    # department columns kept from the customer count file
    "COSMETIC", "HABA", "PHARMACY",
    # remaining store-level columns
    "INCOME", "HSIZEAVG", "HSIZE1", "HSIZE2", "HSIZE34", "HHLARGE",
    "SINGLE", "RETIRED", "UNEMP", "WORKWOM",
    "WRKCH5", "WRKCH17", "NWRKCH5", "NWRKCH17", "WRKCH", "NWRKCH", "WRKWNCH",
]

def filter_select_columns(chunk, columns_to_keep = selected_columns):
    """Return only *columns_to_keep* from *chunk* (default: ``selected_columns``)."""
    return chunk[columns_to_keep]

def filter_upc(chunk): #!! Note: remove, not in use any more....I think....double check !!
    """Keep the rows whose ``UPC`` is contained in the last entry of ``upc_list``.

    NOTE(review): ``upc_list`` is a global that is not defined in the visible
    part of this file, so calling this function as-is presumably fails.
    """
    wanted_upcs = upc_list[-1]
    return chunk[chunk["UPC"].isin(wanted_upcs)]

def filter_create_sales_column(chunk): #!! Note: remove, not in use any more !!
    """Return a copy of *chunk* with ``SALES = PRICE * MOVE / QTY`` added.

    ``assign`` is used rather than an in-place column assignment because of
    the SettingWithCopyWarning the author ran into: pandas cannot tell
    whether a copy or the original chunk is meant to be modified.
    """
    return chunk.assign(SALES=chunk['PRICE'] * chunk["MOVE"] / chunk['QTY'])


# Reduce the fully merged file to the selected columns and write it, chunk by chunk,
# to folder_path_clean under the name given in file_out.
filter_func_list = [filter_select_columns]
file_main = "data_temp/wsha_upcsha_ccount_demo.csv"
file_out = "shampoo_sale_data.csv"
# file_type is left at its default (".csv")
time_function(load_and_filter_file, file_main, folder_path_clean, filter_func_list, chunksize = chunk_size, new_file_name = file_out)

Succes with the encoding 'utf-8', file data_clean/shampoo_sale_data.csv now created
load_and_filter_file took 15.79 seconds to run.


In [93]:
# Group by "WEEK", "STORE", "CITY", "ZIP" and "BRAND" and sum the other columns,
# i.e. collapse all products of one brand into a single row per store and week.
# NOTE(review): .sum() is applied to every non-key column, so PRICE and the store-level
# columns are summed over the grouped rows as well — confirm this is intended.
df_sale = pd.read_csv("data_clean/shampoo_sale_data.csv")
df_sale_grouped = df_sale.groupby(["WEEK", "STORE", "CITY", "ZIP", "BRAND"]).sum().reset_index()

# Save the grouped DataFrame to a new CSV file
df_sale_grouped.to_csv("data_clean/shampoo_sale_data_branded.csv", index=False)

### Find Top Brand
----

In [111]:
"""
def read_data(csv_file):
    return pd.read_csv(csv_file)

def get_top_brands(df, brand_column, move_column, profit_column, n_brands=5):
    score_column = "move and profit score"
    df[score_column] = df[move_column] + df[profit_column]
    top_brands_series = df.groupby(brand_column)[score_column].sum().nlargest(n_brands)
    top_brands = top_brands_series.index.tolist()
    print("Top brands, highest move and profit:")
    for brand, score in top_brands_series.items():
        total_move = df[df[brand_column] == brand][move_column].sum()
        total_profit = df[df[brand_column] == brand][profit_column].sum()
        print(f"Brand: {brand}, Total MOVE: {total_move}, Total PROFIT: {total_profit}, Score (Move + Profit): {score}")
    return top_brands

def filter_by_brands(df, brand_column, top_brands):
    return df[df[brand_column].isin(top_brands)]

def write_data(df, csv_file):
    df.to_csv(csv_file, index=False)
"""
def read_data(csv_file):
    """Load *csv_file* with ``pd.read_csv`` defaults and return the DataFrame."""
    frame = pd.read_csv(csv_file)
    return frame

def get_top_brands(df, brand_column, move_column, profit_column, n_brands=5):
    """Rank brands by summed ``move + profit`` and return the best *n_brands*.

    A column named ``"move and profit score"`` (row-wise move + profit) is
    added to *df* in place. For every top brand the total move, total profit
    and total score are printed.

    Returns:
        The top brand labels, highest score first.
    """
    score_column = "move and profit score"
    df[score_column] = df[move_column] + df[profit_column]

    ranking = df.groupby(brand_column)[score_column].sum().nlargest(n_brands)

    print("Top brands, highest move and profit:")
    for brand, score in ranking.items():
        rows_of_brand = df[brand_column] == brand
        total_move = df.loc[rows_of_brand, move_column].sum()
        total_profit = df.loc[rows_of_brand, profit_column].sum()
        print(f"Brand: {brand}, Total MOVE: {total_move}, Total PROFIT: {total_profit}, Score (Move + Profit): {score}")

    return ranking.index.tolist()

def filter_by_brands(df, brand_column, top_brands, group_columns=None):
    """Relabel every brand outside *top_brands* as ``'OTHERS'`` and re-aggregate.

    Args:
        df: Input frame; it is left unchanged.
        brand_column: Name of the column holding the brand labels.
        top_brands: Brand labels to keep as they are.
        group_columns: Columns to group by before summing. Defaults to
            ``["WEEK", "STORE", "CITY", "ZIP", "BRAND"]``.

    Returns:
        A new frame with one row per group and every other column summed.
    """
    # None sentinel instead of a list default, which would be one shared,
    # mutable object across all calls.
    if group_columns is None:
        group_columns = ["WEEK", "STORE", "CITY", "ZIP", "BRAND"]

    # Build the relabelled column on a new frame so the caller's data keeps
    # its original brand names.
    is_top = df[brand_column].isin(top_brands)
    relabelled = df.assign(**{brand_column: df[brand_column].where(is_top, 'OTHERS')})

    # Group by the defined columns and compute the sum for each group
    return relabelled.groupby(group_columns).sum().reset_index()


def write_data(df, csv_file):
    """Write *df* to *csv_file* as CSV, leaving out the index column."""
    df.to_csv(csv_file, index=False)

In [112]:
# Input: sales summed per week, store and brand. Output: same data with every
# brand outside the top brands relabelled as 'OTHERS'.
file_main = "data_clean/shampoo_sale_data_branded.csv"
file_top5 = "data_clean/shampoo_sale_data_top5.csv"
brand_column = "BRAND"
move_column = "MOVE"
profit_column = "PROFIT"

df = read_data(file_main)
# Rank the brands by summed MOVE + PROFIT (the redundant double assignment was removed)
top_brands = get_top_brands(df, brand_column, move_column, profit_column)
df = filter_by_brands(df, brand_column, top_brands)
write_data(df, file_top5)

Top brands, highest move and profit:
Brand: RAVE AERO, Total MOVE: 80490, Total PROFIT: 438633.52, Score (Move + Profit): 519123.52
Brand: SUAVE COND, Total MOVE: 84329, Total PROFIT: 388499.98, Score (Move + Profit): 472828.98
Brand: WHITE RAIN, Total MOVE: 45172, Total PROFIT: 409872.29, Score (Move + Profit): 455044.29
Brand: WHT RN, Total MOVE: 24819, Total PROFIT: 267167.47, Score (Move + Profit): 291986.47
Brand: SALON SELECT, Total MOVE: 24263, Total PROFIT: 170093.45, Score (Move + Profit): 194356.45


### Add month column and reorder the data
----

In [113]:
def add_month_year(input_file, output_file):
    """Add ``MONTH`` and ``YEAR`` columns derived from ``WEEK`` and save the result.

    Week 173 is anchored at 1992-12-31; the month and year are taken from the
    middle of each week (its start date plus 3 days).

    Args:
        input_file: CSV file with a numeric ``WEEK`` column.
        output_file: Path of the CSV file to write.
    """
    # Read the CSV file into a DataFrame
    df = read_data(input_file)

    # Vectorised date arithmetic instead of a per-row apply + zip(*...) unpack:
    # the unpack raised "not enough values to unpack" on a file without rows.
    week_173_start = pd.Timestamp("1992-12-31")
    days_to_week_middle = (df["WEEK"] - 173) * 7 + 3
    week_middle_date = week_173_start + pd.to_timedelta(days_to_week_middle, unit="D")

    df["MONTH"] = week_middle_date.dt.month
    df["YEAR"] = week_middle_date.dt.year

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_file, index=False)

    print(f"The file has been processed and saved as {output_file}")

# Read the top-brand file, add the MONTH and YEAR columns and write the master file
input_file = "data_clean/shampoo_sale_data_top5.csv"
output_file = "data_clean/shampoo_sale_data_master_file.csv"

add_month_year(input_file, output_file)

The file has been processed and saved as data_clean/shampoo_sale_data_master_file.csv


In [114]:
def reorder_columns(input_file, output_file):
    """Rewrite a CSV file with the key columns first and all other columns after.

    The leading columns are fixed; every remaining column of the input
    follows in its original order. The frame is rearranged with ``reindex``.
    """
    df = read_data(input_file)

    # Columns that should come first, in this order
    leading = ["WEEK", "MONTH", "YEAR", "STORE", "CITY", "ZIP", "BRAND", "MOVE", "PRICE", "QTY", "REVENUE", "PROFIT"]

    # Everything else keeps the order it has in the file
    trailing = [name for name in df.columns if name not in leading]

    reordered = df.reindex(columns=leading + trailing)

    reordered.to_csv(output_file, index=False)
    print(f"The file has been processed and saved as {output_file}")

# Input and output are the same path: the master file is reordered and overwritten in place
input_file = "data_clean/shampoo_sale_data_master_file.csv"
output_file = "data_clean/shampoo_sale_data_master_file.csv"

reorder_columns(input_file, output_file)

The file has been processed and saved as data_clean/shampoo_sale_data_master_file.csv


### Data in the set
----

**WEEK:** This denotes the week number when the sales occurred.

**MONTH:** This is the month when the sales occurred.

**YEAR:** This is the year when the sales occurred.

**STORE:** This is the unique identifier for each store location.

**CITY:** This is the city name of the store's location.

**ZIP:** This is the zip code of the store's location.

**BRAND:** This is the grouped product description

**MOVE:** This is the number of units of the product that were sold.

**PRICE:** This is the retail price of the product.

**QTY:** This is the number of items bundled together.

**CASE:** This is the number of items in a case.

**COSMETIC, HABA, PHARMACY:** These variables relate to the different categories of products and are given as sales in dollars.

**INCOME:** This is the gross income, cents on the dollar for each item sold.

**HSIZEAVG, HSIZE1, HSIZE2, HSIZE34, HHLARGE:** These variables seem to represent household sizes.

**SINGLE, RETIRED, UNEMP, WORKWOM, WRKCH5, WRKCH17, NWRKCH5, NWRKCH17, WRKCH, NWRKCH, WRKWNCH:** These variables represent different demographics related to employment and family status.