In [5]:
import pandas as pd
from datetime import datetime
from google.colab import files
import io
import openpyxl # Required for pandas to read .xlsx files

def upload_file_and_read():
    """Prompt for an Excel/CSV upload in Colab and load it as a raw DataFrame.

    The file is read with ``header=None`` so that the header row stays in the
    data as row 0; the caller assigns column names itself, which lets it see
    header cells that are blank in the sheet.

    Only the first entry of the upload mapping is read.

    Returns:
        ``(DataFrame, filename)`` on success, or ``(None, None)`` when nothing
        was uploaded, the extension is unsupported, or reading failed. In the
        failure cases a message is printed instead of raising.
    """
    try:
        print("Please upload your Excel or CSV file (e.g., 'web-try-Apps-site-usage.xlsx').")
        uploaded = files.upload()

        # An empty mapping (presumably what a cancelled upload yields -- confirm
        # against the Colab docs) would otherwise make next() raise
        # StopIteration, which surfaces below as an error with a blank message.
        if not uploaded:
            print("No file was uploaded.")
            return None, None

        # Get the filename of the uploaded file.
        filename = next(iter(uploaded))
        print(f"File '{filename}' uploaded successfully!")

        payload = io.BytesIO(uploaded[filename])
        lowered_name = filename.lower()

        # Determine file type and read accordingly; headers are never inferred
        # so unnamed columns can be handled manually by the caller.
        if lowered_name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(payload, header=None)
        elif lowered_name.endswith('.csv'):
            df = pd.read_csv(payload, header=None)
        else:
            raise ValueError("Unsupported file format. Please upload an .xlsx, .xls, or .csv file.")

        return df, filename
    except Exception as e:
        # Notebook boundary: report the problem and let the cell finish cleanly.
        print(f"An error occurred during file upload or reading: {e}")
        return None, None

def _text_column(frame, source_name):
    """Return column *source_name* of *frame* as text, with missing cells as ''."""
    column = frame[source_name]
    # astype(str) alone would write missing cells out as literal 'nan'/'None'.
    return column.astype(str).where(column.notna(), '')


def process_and_fix_data(df):
    """
    Fixes column alignment and data transfer issues in the DataFrame.

    The input is expected to be read with ``header=None``, so row 0 holds the
    header cells. Blank header cells are named ``'Unnamed: <position>'``.
    The output is built as follows:

    - 'Employee'          <- 'Employee', stripped, with one trailing space appended.
    - 'App/Site Name'     <- 'App/Site Name'.
    - 'Status'            <- 'Unnamed: 3' (the blank-header 4th column).
    - 'Activity Duration' <- the column whose header reads 'Status'.
    - Every other column (e.g. 'Icon') is dropped.

    A source column that is absent produces a printed warning and an
    all-empty-string output column. Missing cells become ''.

    Args:
        df: Raw DataFrame including the header row. It is not modified.

    Returns:
        A new DataFrame with exactly the four columns above, in that order,
        or ``None`` when *df* is ``None`` or empty.
    """
    if df is None or df.empty:
        print("No data to process.")
        return None

    # Step 1: Build column names from the first row; blank cells get a
    # positional placeholder so they can still be addressed by name.
    headers = [
        f'Unnamed: {position}' if pd.isna(header) else header
        for position, header in enumerate(df.iloc[0].tolist())
    ]

    # Work on a fresh frame (header row removed, index restarted) so the
    # caller's DataFrame keeps its original column labels.
    data = df.iloc[1:].reset_index(drop=True)
    data.columns = headers

    print(f"Detected columns after initial read and header assignment: {data.columns.tolist()}")

    # Seed the result with the data's index so that a scalar '' fallback fills
    # every row no matter which source column turns out to be missing.
    fixed_df = pd.DataFrame(index=data.index)

    # Map Employee and App/Site Name directly
    if 'Employee' in data.columns:
        # NOTE(review): the trailing space is kept from the original code;
        # confirm whether downstream consumers actually rely on it.
        fixed_df['Employee'] = _text_column(data, 'Employee').str.strip() + ' '
    else:
        print("Warning: 'Employee' column not found.")
        fixed_df['Employee'] = ''

    if 'App/Site Name' in data.columns:
        fixed_df['App/Site Name'] = _text_column(data, 'App/Site Name')
    else:
        print("Warning: 'App/Site Name' column not found.")
        fixed_df['App/Site Name'] = ''

    # Realign 'Status' data from 'Unnamed: 3' (blank-header 4th column).
    if 'Unnamed: 3' in data.columns:
        fixed_df['Status'] = _text_column(data, 'Unnamed: 3')
    else:
        print("Warning: 'Unnamed: 3' (expected Status column) not found. 'Status' column might be empty.")
        fixed_df['Status'] = ''  # Ensure column exists

    # Realign 'Activity Duration' data from the column originally named 'Status'
    if 'Status' in data.columns:
        fixed_df['Activity Duration'] = _text_column(data, 'Status')
    else:
        print("Warning: Original 'Status' column (expected Activity Duration) not found. 'Activity Duration' column might be empty.")
        fixed_df['Activity Duration'] = ''  # Ensure column exists

    # Ensure the final columns are exactly as required, in the correct order
    final_columns_order = ['Employee', 'App/Site Name', 'Status', 'Activity Duration']
    return fixed_df[final_columns_order]

# --- Main execution ---
# Ask for the upload; the helper hands back (None, None) when it fails.
df, input_filename = upload_file_and_read()

if df is not None:
    print("\nOriginal Data Preview:")
    print(df.head())
    print("\nAttempting to fix data alignment...")

    fixed_df = process_and_fix_data(df)

    if fixed_df is not None:
        # Date-stamp the workbook name so repeated runs on different days
        # do not overwrite each other.
        today_date = f'{datetime.now():%Y-%m-%d}'
        output_filename = 'Fixed-App-Site-Usage-' + today_date + '.xlsx'

        try:
            # Write the workbook into the Colab session's working directory.
            fixed_df.to_excel(output_filename, index=False)
            print(
                f"\nSuccess! The cleaned data has been saved to '{output_filename}'.",
                "You can download this file from the left sidebar.",
                "\nHere is a preview of the fixed data:",
                sep="\n",
            )
            print(fixed_df.head())

            # Trigger the browser download of the freshly written workbook.
            files.download(output_filename)

        except Exception as e:
            print(f"An unexpected error occurred during data saving or download: {e}")

Please upload your Excel or CSV file (e.g., 'web-try-Apps-site-usage.xlsx').


Saving web-try-Apps-site-usage.xlsx to web-try-Apps-site-usage (4).xlsx
File 'web-try-Apps-site-usage (4).xlsx' uploaded successfully!

Original Data Preview:
              0     1                          2        3        4  \
0      Employee  Icon              App/Site Name      NaN   Status   
1  Prottoy Saha   NaN  login.microsoftonline.com  Neutral  00h 01m   
2  Prottoy Saha   NaN             www.typing.com  Neutral  00h 19m   
3  Prottoy Saha   NaN  colab.research.google.com  Neutral  00h 09m   
4  Prottoy Saha   NaN       m365.cloud.microsoft  Neutral  00h 01m   

                   5  
0  Activity Duration  
1                NaN  
2                NaN  
3                NaN  
4                NaN  

Attempting to fix data alignment...
Detected columns after initial read and header assignment: ['Employee', 'Icon', 'App/Site Name', 'Unnamed: 3', 'Status', 'Activity Duration']

Success! The cleaned data has been saved to 'Fixed-App-Site-Usage-2025-08-10.xlsx'.
You can download t

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>