## Assignment
[Assignment Notes](https://docs.google.com/document/d/1vr6wHlQXA5iP5X3RzxpU3T7PHGUkQzULitxvV83Pf7Q/edit)

[Assignment Data Notes](https://docs.google.com/document/d/1DtMgMeEk8tcsI1nWqR_ika_bmDg1ADpy64as7zDqqHc/edit)

### Setup
----
##### !! NOTE !!
To run this code you need the data from [chicagobooth](https://www.chicagobooth.edu/research/kilts/datasets/dominicks), specifically the data for shampoo.

The filles needed is:

**Customoer Count File** - (ccount(stata).zip)

**Store-Level Demographics File** - (demo(stata).zip)

And from the **category file** you need to find **shampoo**, and download **UPC.csv File** and **Movement.csv File**

And all of this needs to be saved in a folder called **"data"** in the same folder you are running this script from, running the **"create_folders" function** will automatically create the folder

In [2]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import requests
import os
import chardet
import time
import csv
import gc
import re

from datetime import datetime, timedelta
from functools import reduce

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.ticker as mticker
import matplotlib.pylab as pylab

import sympy as sp
from sympy.solvers import solve

from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression

from cycler import cycler

# custome plot style
params  = {
"lines.linewidth": 1.5,

"legend.fancybox": "true",

"axes.prop_cycle": cycler('color', ["#ffa822","#1ac0c6","#ff6150","#30B66A","#B06AFF","#FF21E1"]),
"axes.facecolor": "#2b2b2b",
"axes.axisbelow": "true",
"axes.grid": "true",
"axes.edgecolor": "#2b2b2b",
"axes.linewidth": 0.5,
"axes.labelpad": 0,

"patch.edgecolor": "#2b2b2b",
"patch.linewidth": 0.5,

"grid.linestyle": "--",
"grid.linewidth": 0.5,
"grid.color": "#b8aba7",

"xtick.major.size": 0,
"xtick.minor.size": 0,
"ytick.major.size": 0,
"ytick.minor.size": 0,

"font.family":"monospace",
"font.size":10.0,
"text.color": "#FFE9E3",
"axes.labelcolor": "#b8aba7",
"xtick.color": "#b8aba7",
"ytick.color": "#b8aba7",

"savefig.edgecolor": "#2b2b2b",
"savefig.facecolor": "#2b2b2b",

"figure.subplot.left": 0.08,
"figure.subplot.right": 0.95,
"figure.subplot.bottom": 0.09,
"figure.facecolor": "#2b2b2b"}

pylab.rcParams.update(params)
print("finish")

finish


In [3]:
# Collect files from folder, if file type equals file_type
def get_files(folder, file_type):
    file_paths = []
    for file in os.listdir(folder):
        if file.endswith(file_type):
            file_paths.append([os.path.join(folder, file), file_type])
    return file_paths

def get_encoder(file_path, chunksize = 10_000):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(chunksize))
    return result['encoding']

# NOTE TO SELF, STRAGE ERROR HERE, MIGHT BE A DUPLICATE ERROR
# Load, filter in chunks and Convert to csv
def load_and_filter_file(input_file, temp_path, filter_func:list, file_type=".csv", chunksize=10_000, new_file_name = ""):
    temp_file = os.path.join(temp_path,new_file_name)
    if new_file_name == "":
        # Extract the file name from the input_file path
        input_file_name = os.path.basename(input_file)

        # Create a temp_file path by combining temp_path and input_file_name
        file_name_without_ext, file_ext = os.path.splitext(input_file_name)
        temp_file = os.path.join(temp_path, f"{file_name_without_ext}_temp.csv")
        
    # Had to fix the decoding because 'invalid continuation byte' that utf-8 can't decode. And manual attempt to fix it did not reveal byte 0xd5
    encodings = ["utf-8", "ISO-8859-1", "cp1252", "latin1"]
    success = False

    for encoding in encodings:
        try:
            if file_type.lower() == '.csv':
                reader = pd.read_csv(input_file, chunksize=chunksize, encoding=encoding)
            elif file_type.lower() == '.dta':
                reader = pd.read_stata(input_file, chunksize=chunksize)
            else:
                raise ValueError("Unsupported file type. Supported types are 'csv' and 'dta'.")

            for i, chunk in enumerate(reader):
                filtered_chunk = chunk
                for func in filter_func:
                    filtered_chunk = func(filtered_chunk)
                if i == 0:
                    filtered_chunk.to_csv(temp_file, index=False, mode='w')
                else:
                    filtered_chunk.to_csv(temp_file, index=False, mode='a', header=False)

            success = True
            print(f"Succes with the encoding '{encoding}', file {temp_file} now created")
            break

        except UnicodeDecodeError:
            print(f"Failed to read the file with encoding '{encoding}', trying the next one...")

    if not success:
        raise ValueError("None of the attempted encodings were successful in reading the file.")
            
# Merge csv files
"""
def merge_csv_files(file1, file2, output_file, merge_on=None, merge_dtype=None, chunksize=10000):
    if (merge_on is not None) and (type(merge_on) != list):  # Fix the condition here
        merge_on = [merge_on]
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        writer = None
        for chunk1 in pd.read_csv(file1, chunksize=chunksize):
            for chunk2 in pd.read_csv(file2, chunksize=chunksize):
                if merge_dtype is not None:
                    for column in merge_on:
                        chunk1[column] = chunk1[column].astype(merge_dtype)
                        chunk2[column] = chunk2[column].astype(merge_dtype)

                merged_chunk = pd.merge(chunk1, chunk2, on=merge_on) if merge_on else pd.concat([chunk1, chunk2], axis=1)

                if writer is None:
                    writer = csv.DictWriter(f_out, fieldnames=merged_chunk.columns)
                    writer.writeheader()

                for row in merged_chunk.to_dict(orient='records'):
                    writer.writerow(row)
"""

def merge_csv_files(file1, file2, output_file, merge_on= None, merge_dtype=None, chunksize =10000):
    if (merge_on is not None) and (type(merge_on) != list):  # Fix the condition here
        merge_on = [merge_on]
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        writer = None
        
        # Read the entire file2 into memory
        file2_data = pd.read_csv(file2)
        
        if merge_dtype is not None:
            for column in merge_on:
                file2_data[column] = file2_data[column].astype(merge_dtype)
        
        for chunk1 in pd.read_csv(file1, chunksize=chunksize):
            if merge_dtype is not None:
                for column in merge_on:
                    chunk1[column] = chunk1[column].astype(merge_dtype)

            merged_chunk = pd.merge(chunk1, file2_data, on=merge_on) if merge_on else pd.concat([chunk1, file2_data], axis=1)

            if writer is None:
                writer = csv.DictWriter(f_out, fieldnames=merged_chunk.columns)
                writer.writeheader()

            for row in merged_chunk.to_dict(orient='records'):
                writer.writerow(row)
                

                    
# Run time test function
def time_function(func, *args, **kwargs):
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"{func.__name__} took {elapsed_time:.2f} seconds to run.")
    return result

# Folder check and creation 
def create_folders(folder_paths):
    for folder_path in folder_paths:
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print(f"Folder created: {folder_path}")
        else:
            print(f"Folder already exists: {folder_path}")

In [4]:
# List of filters
def empty_filter_func(chunk):
    # Empty filter for test
    return chunk

def filter_empty_to_0_and_dropna(chunk):
    # Replace empty values with 0 if the column is numeric, otherwise with NaN
    for col in chunk.columns:
        if pd.api.types.is_numeric_dtype(chunk[col]):
            chunk[col] = chunk[col].replace('', 0).fillna(0)
        else:
            chunk[col] = chunk[col].replace('', np.nan)
    
    # Drop rows with NaN values
    filtered_chunk = chunk.dropna()
    
    return filtered_chunk

def filter_remove_empty_and_nan(chunk):
    filtered_chunk = chunk.replace('', np.nan)
    # Remove rows with NaN values from the DataFrame
    filtered_chunk = filtered_chunk.dropna()
    return filtered_chunk

def filter_header_up(chunk):
    # Turn header to upper
    chunk.columns = map(str.upper, chunk.columns)
    return chunk

def filter_week(chunk):
    # The data begins from week 128 (02/20/92). To filter for the year 1993, we select week 173 to 224. Based on Dominicks Manual and Codebook - Part 8: Week’s Decode Table
    # 173 = 12/31/92 to 01/06/93
    # 124 = 12/23/93 to 12/29/93
    start_week = 173
    end_week = 224

    # Create a copy of the chunk to avoid the warning
    chunk_copy = chunk.copy()

    # Modify the 'WEEK' column in the copied chunk
    chunk_copy['WEEK'] = chunk_copy['WEEK'].astype(int)

    filtered_chunk = chunk_copy[(chunk_copy['WEEK'] >= start_week) & (chunk_copy['WEEK'] <= end_week)]
    return filtered_chunk

def filter_move_above_one(chunk):
    filtered_chunk = chunk[(chunk['MOVE'] > 0)]
    return filtered_chunk

def filter_out_bad_data(chunk):
    filtered_chunk = chunk[(chunk['OK'] > 0)]
    return filtered_chunk

def filter_keep_columns(chunk, columns_to_keep = ["STORE", "COSMETIC", "HABA","PHARMACY","WEEK"]):
    # Keep only the specified columns in the DataFrame
    filtered_chunk = chunk[columns_to_keep]
    return filtered_chunk

def remove_column(chunk):
    column_names = ["GINI","LIFT5","RATIO5"]
    for column_name in column_names:
        if column_name in chunk.columns:
            chunk = chunk.drop(column_name, axis=1)
    return chunk

def combine_same_week_and_store(chunk):
    # Combine rows with the same value in the "WEEK" column
    combined_chunk = chunk.groupby(["WEEK","STORE"]).sum().reset_index()
    return combined_chunk

def filter_remove_symb_in_description(chunk):
    # Combine rows with the same value in the "WEEK" column
    chunk["DESCRIP"] = chunk["DESCRIP"].apply(lambda x: re.sub(r'&', ' ANNNND ', x))
    chunk["DESCRIP"] = chunk["DESCRIP"].apply(lambda x: re.sub(r'[^\w]', '', x))
    chunk["DESCRIP"] = chunk["DESCRIP"].apply(lambda x: re.sub(r' ANNNND ', '&', x))
    chunk["DESCRIP"] = chunk["DESCRIP"].apply(lambda x: re.sub(r' +', ' ', x))
    return chunk

def filter_clean_text_data(chunk, column = "DESCRIP"):
    # Convert to uppercase, remove symbols, and remove extra spaces
    chunk[column] = (chunk[column].str.upper()
                                  .str.replace(r"\W+", " ", regex=True)
                                  .str.replace(r"\s{2,}", " ", regex=True)
                                  .str.strip()
                                  .str.replace(r"(?<!\S)\S(?!\S)", "", regex=True))
    return chunk


### Filter and Clean Data
---

In [5]:
folder_path = "data/" # folder with data
folder_path_temp = "data_temp/" # folder with temp data
folder_path_clean = "data_clean/" # folder with clean
chunk_size = 10_000 # chunks of data loaderd in memory

create_folders([folder_path, folder_path_temp,folder_path_clean])

file_paths = get_files(folder_path,".csv")
file_paths.extend(get_files(folder_path,".dta"))

print(file_paths)

Folder already exists: data/
Folder already exists: data_temp/
Folder already exists: data_clean/
[['data/upcsha.csv', '.csv'], ['data/wsha.csv', '.csv'], ['data/ccount.dta', '.dta'], ['data/demo.dta', '.dta']]


#### upc data

In [6]:
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_clean_text_data]
time_function(load_and_filter_file, file_paths[0][0], folder_path_temp, filter_func_list, file_type=file_paths[0][1], chunksize = chunk_size)

# Can add and clean description and add brands here.

Failed to read the file with encoding 'utf-8', trying the next one...
Succes with the encoding 'ISO-8859-1', file data_temp/upcsha_temp.csv now created
load_and_filter_file took 0.06 seconds to run.


#### walk data

In [7]:
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_week,filter_move_above_one,filter_out_bad_data]
time_function(load_and_filter_file, file_paths[1][0], folder_path_temp, filter_func_list, file_type=file_paths[1][1], chunksize = chunk_size)

Succes with the encoding 'utf-8', file data_temp/wsha_temp.csv now created
load_and_filter_file took 38.84 seconds to run.


#### custumer count

In [8]:
filter_func_list = [filter_header_up,filter_remove_empty_and_nan,filter_week,filter_keep_columns, combine_same_week_and_store]
time_function(load_and_filter_file, file_paths[2][0], folder_path_temp, filter_func_list, file_type=file_paths[2][1], chunksize = chunk_size)

Succes with the encoding 'utf-8', file data_temp/ccount_temp.csv now created
load_and_filter_file took 1.74 seconds to run.


#### demo

In [9]:
filter_func_list = [filter_header_up, remove_column,filter_empty_to_0_and_dropna]
time_function(load_and_filter_file, file_paths[3][0], folder_path_temp, filter_func_list, file_type=file_paths[3][1], chunksize = chunk_size)

Succes with the encoding 'utf-8', file data_temp/demo_temp.csv now created
load_and_filter_file took 0.41 seconds to run.


### Merge Data
----

In [10]:
merge_file_main = "data_temp/wsha_temp.csv"
merge_file_sec = "data_temp/upcsha_temp.csv"
merge_file_out = "data_temp/wsha_upcsha.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on="UPC", merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 3.86 seconds to run.


In [11]:
merge_file_main = "data_temp/wsha_upcsha.csv"
merge_file_sec = "data_temp/ccount_temp.csv"
merge_file_out = "data_temp/wsha_upcsha_ccount.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on=["STORE","WEEK"], merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 4.65 seconds to run.


In [12]:
merge_file_main = "data_temp/wsha_upcsha_ccount.csv"
merge_file_sec = "data_temp/demo_temp.csv"
merge_file_out = "data_temp/wsha_upcsha_ccount_demo.csv"
time_function(merge_csv_files,merge_file_main, merge_file_sec, merge_file_out, merge_on="STORE", merge_dtype=np.int64, chunksize=chunk_size)

merge_csv_files took 116.83 seconds to run.


### Clean Merge Data and Select Relevant Column
----

In [53]:
selected_columns = ["STORE", "CITY", "WEEK","MOVE","PRICE","QTY","PROFIT","DESCRIP","CASE","COSMETIC","HABA","PHARMACY","INCOME","HSIZEAVG","HSIZE1","HSIZE2",
                    "HSIZE34","HHLARGE","SINGLE","RETIRED","UNEMP","WORKWOM","WRKCH5","WRKCH17","NWRKCH5","NWRKCH17","WRKCH","NWRKCH","WRKWNCH"]

def filter_select_columns(chunk, columns_to_keep = selected_columns):
    # Keep only the specified columns in the DataFrame
    filtered_chunk = chunk[columns_to_keep]
    return filtered_chunk

def filter_upc(chunk):
    filtered_chunk = chunk[(chunk["UPC"].isin(upc_list[-1]))]
    return filtered_chunk

def create_sales_column(chunk):
    # Create a temporary DataFrame to avoid unkown SettingWithCopyWarning
    # NOTE I still get the error, tho the code works. 
    # Error is most likly panda not knowing wheter I want to create a copy or work with the main chunk. 
    tmp = chunk['PRICE'] * chunk["MOVE"] / chunk['QTY']
    
    # Assign the temporary DataFrame to the new column 'SALES'
    chunk = chunk.assign(SALES=tmp)

    return chunk


filter_func_list = [filter_select_columns,filter_create_sales_column]
file_main = "data_temp/wsha_upcsha_ccount_demo.csv"
file_out = "shampoo_sale_data.csv"
time_function(load_and_filter_file, file_main, folder_path_clean, filter_func_list, chunksize = chunk_size, new_file_name = file_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['SALES'] = chunk['PRICE'] * chunk['MOVE'] / chunk['QTY']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['SALES'] = chunk['PRICE'] * chunk['MOVE'] / chunk['QTY']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['SALES'] = chunk['PRICE'] * chunk['MOVE'] / chunk['QTY']
A value is tryin

Succes with the encoding 'utf-8', file data_clean/shampoo_sale_data.csv now created
load_and_filter_file took 13.96 seconds to run.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['SALES'] = chunk['PRICE'] * chunk['MOVE'] / chunk['QTY']


In [54]:
def group_data(input_file, column, chunksize=10_000):
    groups = {}
    # Read the file in chunks
    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        # Iterate over the cleaned data
        for index, row in chunk.iterrows():
            # Check if the description is missing
            if pd.isna(row[column]):
                continue
                
            # Split the description into words
            words = row[column].split()

            # Define the key as the first word or a group of similar words
            key = words[0]

            # Add the description to the corresponding group
            if key not in groups:
                groups[key] = []
            groups[key].append(row[column])

    return groups

keyword_brand_grouping = time_function(group_data,"data_clean/shampoo_sale_data.csv", "DESCRIP", chunksize=chunk_size)
#display(keyword_brand_grouping)

group_data took 7.41 seconds to run.


In [55]:
def group_by_keyword(input_file, output_file, column, keyword_dict, group_columns=["WEEK", "STORE"], chunksize=10_000):
    # Define a function to map each description to its keyword
    def map_to_keyword(description):
        if isinstance(description, str):
            words = description.split()
            for word in words:
                if word in keyword_dict:
                    return word
        return description

    # Define a function to process each chunk of the DataFrame
    def process_chunk(chunk):
        chunk["BRAND"] = chunk[column].apply(map_to_keyword)
        grouped = chunk.groupby(group_columns + ["BRAND"]).sum(numeric_only=True).reset_index()
        return grouped

    # Process the DataFrame in chunks and concatenate the results
    grouped_chunks = [process_chunk(chunk) for chunk in pd.read_csv(input_file, chunksize=chunksize)]
    result = pd.concat(grouped_chunks)

    # Save the result to a new CSV file
    result.to_csv(output_file, index=False)

    print(f"The file has been processed and saved as {output_file}")

# Example usage
input_file = "data_clean/shampoo_sale_data.csv"
output_file = "data_clean/shampoo_sale_data_brand.csv"
column = "DESCRIP"
group_columns = ["STORE", "WEEK"]

time_function(group_by_keyword,input_file, output_file, column, keyword_brand_grouping, group_columns, chunksize=chunk_size)

The file has been processed and saved as data_clean/shampoo_sale_data_brand.csv
group_by_keyword took 1.68 seconds to run.


### Might be old code, cant remember
----

In [42]:
"""
def top_n_values_with_names(input_file, column_number, column_name, n=5):
    df = pd.read_csv(input_file)

    # Sort the DataFrame based on the values in the specified column_number
    sorted_df = df.sort_values(by=column_number, ascending=False)

    # Get the top N rows from the sorted DataFrame
    top_n_rows = sorted_df.head(n)

    # Create a list with the top N values and their corresponding names from column_name
    result = top_n_rows[[column_number, column_name]].values.tolist()

    return result

input_file = 'data_clean/shampoo_sale_data_brand.csv'
column_number = 'MOVE'
column_name = 'BRAND'

n = 5
top_n_values = top_n_values_with_names(input_file, column_number,column_name, n)
display(top_n_values)

upc_list = [[row[i] for row in top_n_values] for i in range(2)] #top_n_values reordered to [[move...],[descrip...],[upc...]]
display(upc_list)
"""

[[530, 'FLEX'], [514, 'RAVE'], [508, 'SUAVE'], [475, 'SUAVE'], [437, 'SUAVE']]

[[530, 514, 508, 475, 437], ['FLEX', 'RAVE', 'SUAVE', 'SUAVE', 'SUAVE']]

In [46]:
"""
def process_and_group_rows(input_file, output_file, column, merge_on_columns):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Remove any symbols or spaces at the first position in the specified column
    df[column] = df[column].apply(lambda x: re.sub(r'^[^\w\d]*', '', x))

    # Extract keywords and create a dictionary of unique keywords and their corresponding values
    keywords = set(df[column].apply(lambda x: x.split('_')[0].split()[0]))
    keyword_groups = {}
    for keyword in keywords:
        values = []
        for value in df[column].unique():
            if value.startswith(keyword):
                values.append(value)
                continue
            split_value = value.split()
            for idx, word in enumerate(split_value):
                if keyword == ' '.join(split_value[:idx + 1]):
                    values.append(value)
                    break
        keyword_groups[keyword] = values

    # Group rows based on the keyword and merge them under a new value named after the keyword
    grouped_rows = []
    for keyword, values in keyword_groups.items():
        keyword_rows = df[df[column].isin(values)].copy()  # Create a copy of the filtered DataFrame
        keyword_rows.loc[:, column] = keyword  # Use .loc[] to set the value
        group_columns = merge_on_columns + [column]
        merged_rows = keyword_rows.groupby(group_columns, as_index=False).sum(numeric_only=True)
        grouped_rows.append(merged_rows)

    # Concatenate grouped rows and save the result to a new CSV file
    result = pd.concat(grouped_rows)
    result.to_csv(output_file, index=False)

    print(f"The file has been processed and saved as {output_file}")


input_file = "data_clean/shampoo_sale_data.csv"
output_file = "data_clean/shampoo_sale_data_test.csv"
column = "DESCRIP"
merge_on_columns = ["STORE", "WEEK"]

time_function(process_and_group_rows,input_file, output_file, column, merge_on_columns)
"""
"""
def get_common_start(strings):
    #Find the common starting substring in a list of strings.
    if not strings:
        return ''
    
    # Sort the strings and compare the first and last one
    strings = sorted(strings)
    first = strings[0]
    last = strings[-1]
    common_start = []
    
    for char1, char2 in zip(first, last):
        if char1 == char2:
            common_start.append(char1)
        else:
            break  # Stop at the first mismatch
    
    return ''.join(common_start)

def process_and_group_rows(input_file, output_file):
    df = pd.read_csv(input_file)

    # Clean the 'DESCRIP' column
    df['DESCRIP'] = df['DESCRIP'].str.upper().str.replace(r'\W+', ' ', regex=True)

    # Extract first word as the main category
    df['group'] = df['DESCRIP'].str.split().str[0]

    # Group the DataFrame by 'group', 'STORE', and 'WEEK' and calculate the sum
    grouped_df = df.groupby(['group', 'STORE', 'WEEK'], as_index=False).sum()

    # Write the result to a new CSV file
    grouped_df.to_csv(output_file, index=False)

    print(f"The file has been processed and saved as {output_file}")


input_file = "data_clean/shampoo_sale_data.csv"
output_file = "data_clean/shampoo_sale_data_test.csv"
column = "DESCRIP"
merge_on_columns = ["STORE", "WEEK"]

time_function(process_and_group_rows,input_file, output_file)
"""

  grouped_df = df.groupby(['group', 'STORE', 'WEEK'], as_index=False).sum()


The file has been processed and saved as data_clean/shampoo_sale_data_test.csv
process_and_group_rows took 1.94 seconds to run.


### Weekly Raport
----

#### Filter and create a data set for a weekly report

In [56]:
def most_frequent_value(csv_file, store_column, week_column):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    
    # Calculate the frequency of each value in the specified store column
    store_value_counts = df[store_column].value_counts()
    most_frequent_store = store_value_counts.idxmax()
    store_frequency = store_value_counts[most_frequent_store]

    # Filter the DataFrame to keep only the most frequent store
    store_df = df[df[store_column] == most_frequent_store]

    # Calculate the frequency of each value in the specified week column
    week_value_counts = store_df[week_column].value_counts()
    most_frequent_week = week_value_counts.idxmax()
    week_frequency = week_value_counts[most_frequent_week]
    
    print(f"The {store_column} with the highest frequency is {most_frequent_store}, with a frequency of {store_frequency}")
    print(f"The {week_column} with the highest frequency within the store {most_frequent_store} is {most_frequent_week}, with a frequency of {week_frequency}")
    return most_frequent_store, most_frequent_week

# Example usage
file_main = "data_clean/shampoo_sale_data_brand.csv"
store_column = "STORE"
week_column = "WEEK"
most_frequent_store, most_frequent_week = most_frequent_value(file_main, store_column, week_column)


The STORE with the highest frequency is 71, with a frequency of 901
The WEEK with the highest frequency within the store 71 is 210, with a frequency of 56


In [57]:
def filter_select_week(chunk):
    chunk_copy = chunk.copy()

    # Modify the 'WEEK' column in the copied chunk
    chunk_copy['WEEK'] = chunk_copy['WEEK'].astype(int)

    filtered_chunk = chunk_copy[(chunk_copy['WEEK'] >= most_frequent_week-1) & (chunk_copy['WEEK'] <= most_frequent_week)]
    return filtered_chunk

def filter_select_store(chunk):
    chunk_copy = chunk.copy()

    # Modify the 'WEEK' column in the copied chunk
    chunk_copy['STORE'] = chunk_copy['STORE'].astype(int)

    filtered_chunk = chunk_copy[(chunk_copy['STORE'] == most_frequent_store)]
    return filtered_chunk

filter_func_list = [filter_select_store,filter_select_week]
file_main = "data_clean/shampoo_sale_data_brand.csv"
file_out = "week_shampoo_sale_data.csv"
time_function(load_and_filter_file, file_main, folder_path_clean, filter_func_list, chunksize = chunk_size, new_file_name = file_out)

Succes with the encoding 'utf-8', file data_clean/week_shampoo_sale_data.csv now created
load_and_filter_file took 0.20 seconds to run.


#### Create graph for weekly sales

#### Compeare weekly sales, with last week

#### Compeare sales area

#### Compear brands

In [None]:
#### Create graph for weekly sales

### Monthly Report
----

#### Filter and create a data set for a monthly report

In [58]:
def group_by_week_and_convert_to_month(input_file, output_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Group the DataFrame by "WEEK", "UPC", and "STORE"
    grouped_df = df.groupby(["WEEK", "BRAND", "STORE"]).sum(numeric_only=True).reset_index()

    # Function to convert week encoding to month
    def week_to_month(week):
        start_date = datetime.strptime("1992-12-31", "%Y-%m-%d")
        week_173_date = start_date + timedelta(weeks=(week - 173))
        return week_173_date.month

    # Convert "WEEK" to "MONTHS" using the week_to_month function
    grouped_df["MONTHS"] = grouped_df["WEEK"].apply(week_to_month)

    # Drop the "WEEK" column
    grouped_df = grouped_df.drop(columns=["WEEK"])
    
    # Reorder columns to move "MONTHS" to the right of "STORE"
    new_column_order = ['BRAND', 'STORE', 'MONTHS'] + [col for col in grouped_df.columns if col not in ['BRAND', 'STORE', 'MONTHS']]
    grouped_df = grouped_df.reindex(columns=new_column_order)

    # Save the grouped and modified DataFrame to a new CSV file
    grouped_df.to_csv(output_file, index=False)

    print(f"The file has been grouped and saved as {output_file}")
    
    
input_file = "data_clean/shampoo_sale_data_brand.csv"
output_file = "data_clean/month_shampoo_sale_data.csv"

time_function(group_by_week_and_convert_to_month,input_file, output_file)

The file has been grouped and saved as data_clean/month_shampoo_sale_data.csv
group_by_week_and_convert_to_month took 1.36 seconds to run.


#### Filter out to a monthly raport
This is a monthly raport for the corporate management over all the stores.

merge_csv_files took 1.78 seconds to run.
