In [2]:
import pandas as pd
import os
import re
from openpyxl import load_workbook
from datetime import datetime

In [3]:
#@title **Step 1: Clone the GitHub Repository**
# This step clones the required GitHub repository into the Colab environment.

# Define the path where the repository will be cloned
repo_path = '/content/James_Fareshare_Data'

# Check if the repository already exists
if not os.path.exists(repo_path):
    # Clone the repository if it doesn't exist
    !git clone https://github.com/HAUCommunityFridge/James_Fareshare_Data
else:
    # If the repository exists, pull the latest changes
    os.chdir(repo_path)
    !git pull

# Verify cloned repository
print("Repository contents:")
!ls /content/James_Fareshare_Data

Cloning into 'James_Fareshare_Data'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 15 (delta 2), reused 14 (delta 1), pack-reused 0[K
Receiving objects: 100% (15/15), 373.05 KiB | 6.91 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Repository contents:
data  docs  James_Fareshare_Data.Rproj	scripts


In [None]:
# Load data from multiple sheets
file_path = "/content/James_Fareshare_Data/data/Fareshare records from April to July plus.xlsx"

# Get sheet names
workbook = load_workbook(filename=file_path, read_only=True)
sheet_names = workbook.sheetnames

# Sheets to ignore (indices: 0, 1, 7, 10, 12, 13, 14, 15)
sheets_to_ignore = [0, 1, 7, 10, 12, 13, 14, 15]

# Exclude the specified sheets
sheets_to_load = [sheet for idx, sheet in enumerate(sheet_names) if idx not in sheets_to_ignore]

# Proper column names
column_names = ["Date", "Category", "Product.brand..name.and.type", "Barcode", "Quantity", "Unit",
                "Unit.weight", "Total.weight", "Unit.price", "Total.price", "Stock.code"]

# Helper function to standardize column names
def standardize_column_name(name):
    if isinstance(name, str):
        return re.sub(r'[^a-z]', '', name.lower())
    return name

# Function to load and clean each sheet
def clean_sheet(sheet_name):
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Convert all column values to string for searching
    data = data.applymap(str)

    # Standardize column names for searching
    standardized_columns = data.columns.to_series().apply(standardize_column_name)

    # Find the row number where "Product brand, name and type" or "Barcode" column starts
    header_row_index = data.apply(lambda row: row.str.contains("product brand, name and type|barcode", case=False, na=False)).any(axis=1)
    header_row_index = header_row_index[header_row_index].index
    if header_row_index.empty:
        print(f"Skipping sheet: {sheet_name} - Expected headers not found")
        print(f"First few rows of sheet {sheet_name}:\n", data.head())
        return None

    start_row = header_row_index[0]

    # Skip rows before the header
    data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=start_row + 1)

    # Standardize the column names again
    data.columns = [standardize_column_name(col) for col in data.columns]

    # Limit columns to the length of column_names
    data = data.iloc[:, :len(column_names)]

    # Print actual number of columns detected
    print(f"Sheet: {sheet_name}, Columns Detected: {len(data.columns)}")

    # Adjust the number of columns
    if len(data.columns) > len(column_names):
        data = data.iloc[:, :len(column_names)]
    elif len(data.columns) < len(column_names):
        data = data.reindex(columns=column_names[:len(data.columns)])

    # Rename columns
    data.columns = column_names[:len(data.columns)]

    # Ensure only the required columns are kept
    data = data[column_names[:len(data.columns)]]

    # Remove rows with all NaN values
    data = data.dropna(how='all')

    # Remove rows containing 'GBP'
    data = data.applymap(lambda x: pd.NA if isinstance(x, str) and 'GBP' in x else x)

    # Drop columns containing only NA values
    data = data.dropna(axis=1, how='all')

    # Convert Date column to datetime format
    if "Date" in data.columns:
        data['Date'] = pd.to_datetime(data['Date'], errors='coerce', dayfirst=True)

    # Add a column for the sheet name
    data['SheetName'] = sheet_name

    return data

# Load and clean all relevant sheets
data_list = [clean_sheet(sheet) for sheet in sheets_to_load]

# Remove NULL elements from the list
data_list = [data for data in data_list if data is not None]

# Combine all cleaned data into a single dataframe
consolidated_data = pd.concat(data_list, ignore_index=True)

# Remove duplicate rows
consolidated_data = consolidated_data.drop_duplicates()

# Save the cleaned data to a new Excel file
output_file_path = "/content/Cleaned_Fareshare_data.xlsx"
consolidated_data.to_excel(output_file_path, index=False)

print(f"Cleaned data saved to {output_file_path}")

# Display the first few rows of the cleaned dataframe
print(consolidated_data.head())

# Display the structure of the cleaned dataframe
print(consolidated_data.info())

  data = data.applymap(str)
  data = data.applymap(lambda x: pd.NA if isinstance(x, str) and 'GBP' in x else x)


Sheet: Hadley, Columns Detected: 11


  data = data.applymap(str)


Skipping sheet: Market Drayton (July) - Expected headers not found
First few rows of sheet Market Drayton (July):
             Unnamed: 0          Category  \
0  2024-07-01 00:00:00  Fresh vegetables   
1                  nan               nan   
2                  nan  Fresh vegetables   
3                  nan               nan   
4                  nan  Fresh vegetables   

                        Product brand, name and type          Barcode  \
0                Tesco Courgettes (C), Fruit and Veg  5057753915537.0   
1                                                nan              nan   
2  Tesco Flat Peach Mineral 3 Pack (C), Fruit and...        3259412.0   
3                                                nan              nan   
4      Tesco Iceberg Lettuce Each (C), Fruit and Veg        3043868.0   

  Quantity Unit Unit weight Total weight Unit price Total price  ...  \
0      3.0   Kg        0.67          2.0        1.5   4.50 GBP   ...   
1      nan  nan         nan          

  data = data.applymap(str)


Skipping sheet: Market Drayton (June) - Expected headers not found
First few rows of sheet Market Drayton (June):
   Unnamed: 0                              Category  \
0        NaT                                   nan   
1        NaT  Chilled products with dairy and eggs   
2        NaT                                   nan   
3        NaT  Chilled products with dairy and eggs   
4        NaT                                   nan   

                      Product brand, name and type          Barcode Quantity  \
0                                              nan              nan      nan   
1           Tesco Chicken Drumsticks 600G, Chilled  5054269155624.0      1.0   
2                                              nan              nan      nan   
3  Tesco British Whole Milk 568Ml, 1 Pint, Chilled  5031021057976.0      9.0   
4                                              nan              nan      nan   

  Unit Unit weight Total weight Unit price Total price  ...  \
0  nan         n

  data = data.applymap(str)


Sheet: Market Drayton (May), Columns Detected: 11


  data = data.applymap(lambda x: pd.NA if isinstance(x, str) and 'GBP' in x else x)
  data = data.applymap(str)
  data = data.applymap(lambda x: pd.NA if isinstance(x, str) and 'GBP' in x else x)


Sheet: Market Drayton (April), Columns Detected: 11
Sheet: Madeley, Columns Detected: 11


  data = data.applymap(str)
  data = data.applymap(lambda x: pd.NA if isinstance(x, str) and 'GBP' in x else x)
  data = data.applymap(str)


Skipping sheet: Malinslee - Expected headers not found
First few rows of sheet Malinslee:
             Unnamed: 0                              Category  \
0  2024-05-02 00:00:00  Chilled products with dairy and eggs   
1  2024-05-02 00:00:00                                   nan   
2  2024-05-02 00:00:00  Chilled products with dairy and eggs   
3  2024-05-02 00:00:00                                   nan   
4  2024-05-02 00:00:00                      Fresh vegetables   

                        Product brand, name and type          Barcode  \
0  Tesco British Chicken Breast Mini Fillets 400G...  5057753922771.0   
1                                                nan              nan   
2        Tesco Chicken Breast Portions 580G, Chilled  5057753600990.0   
3                                                nan              nan   
4    Tesco Sweet & Crunchy Salad 250G, Fruit and Veg        3336922.0   

  Quantity Unit Unit weight Total weight Unit price Total price  ...  \
0      1.0   

  data = data.applymap(str)


Cleaned data saved to /content/Cleaned_Fareshare_data.xlsx
        Date          Category  \
0 2024-05-01  Fresh vegetables   
1 2024-05-01               NaN   
2 2024-05-01  Fresh vegetables   
4 2024-05-01  Fresh vegetables   
6 2024-05-01  Fresh vegetables   

                        Product.brand..name.and.type       Barcode  Quantity  \
0             Tesco Mixed Raisins 60G, Fruit and Veg  3.424605e+06       8.0   
1                                                NaN           NaN       NaN   
2  Tesco Sour Cream Chive Flavoured Pulse & Nut M...  5.059697e+12       7.0   
4        Tesco Mixed Peppers 420G (C), Fruit and Veg  3.274880e+06       1.0   
6       Tesco Pink Lady Apple Pot 80G, Fruit and Veg  5.057753e+12       2.0   

  Unit  Unit.weight  Total.weight Unit.price Total.price SheetName  
0   Kg         0.06          0.51       1.15         9.2    Hadley  
1  NaN          NaN           NaN       <NA>         NaN    Hadley  
2   Kg         0.03          0.18       0.75    