<a href="https://colab.research.google.com/github/HazelRoma5347/CCS8/blob/main/CC19phase2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io

# Ensure plots are displayed inline
%matplotlib inline

# Upload the file
uploaded = files.upload()

# Extract the file name
temp_file = list(uploaded.keys())[0]

def apply_one_hot_encoding(file_content, row_range=(282, 306), col_range=(0, 15)):
    """Impute and one-hot encode a rectangular slice of every sheet in a workbook.

    For each sheet the function prints a preview, strips whitespace from
    string column names, selects the requested row/column window, reports
    missing values and duplicate rows, fills missing values (column mean for
    numeric columns, most frequent value for object columns) and finally
    replaces every object column with its one-hot indicator columns.

    Args:
        file_content: One of
            * a key into the module-level ``uploaded`` mapping (the original
              calling convention: the uploaded file's name),
            * the raw bytes of an Excel workbook, or
            * a dict mapping sheet name -> DataFrame (already parsed sheets,
              the same shape ``pd.read_excel(..., sheet_name=None)`` returns).
        row_range: ``(start, stop)`` positional row bounds passed to ``iloc``;
            ``stop`` is exclusive. Defaults to the window the notebook used.
        col_range: ``(start, stop)`` positional column bounds passed to
            ``iloc``; ``stop`` is exclusive. The default covers columns A-O.

    Returns:
        dict mapping sheet name -> processed DataFrame. The frame is the
        one-hot encoded selection, or the imputed selection unchanged when it
        contains no object columns.

    Raises:
        KeyError: if ``file_content`` is a name that is not in ``uploaded``.
    """
    # `display` only exists as a builtin inside IPython/Colab; fall back to
    # print so the function also runs as a plain script.
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = print

    if isinstance(file_content, dict):
        df_dict = file_content
    else:
        if isinstance(file_content, (bytes, bytearray)):
            raw_bytes = file_content
        else:
            raw_bytes = uploaded[file_content]
        # sheet_name=None loads every sheet into a {name: DataFrame} dict
        df_dict = pd.read_excel(io.BytesIO(raw_bytes), sheet_name=None)

    row_start, row_stop = row_range
    col_start, col_stop = col_range
    results = {}

    for sheet, df in df_dict.items():
        print(f"Raw Data from sheet: {sheet}")
        print(df.head())  # Display first few rows of the data
        print("Available columns:", df.columns.tolist())

        # Strip leading/trailing spaces from string column names. rename()
        # returns a new frame, so a caller-supplied DataFrame is not mutated.
        df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
        print("Cleaned column names:", df.columns.tolist())

        # iloc bounds are end-exclusive. Copy the slice so the imputation
        # below writes to an independent frame rather than a slice of `df`.
        selected_rows = df.iloc[row_start:row_stop, col_start:col_stop].copy()

        print(
            f"Selected data (row positions {row_start}-{row_stop - 1}, "
            f"column positions {col_start}-{col_stop - 1}):"
        )
        show_table(selected_rows)

        # Check for missing values
        missing_values = selected_rows.isnull().sum()
        print(f"Missing values per column:\n{missing_values}")

        # Check for duplicates
        duplicates = selected_rows.duplicated().sum()
        print(f"Number of duplicate rows: {duplicates}")

        # Numeric columns (any numeric dtype): fill gaps with the column mean
        for col in selected_rows.select_dtypes(include="number").columns:
            selected_rows[col] = selected_rows[col].fillna(selected_rows[col].mean())

        # Object columns: fill gaps with the most frequent value. mode() is
        # empty when the column has no non-null value, so guard the lookup.
        for col in selected_rows.select_dtypes(include=["object"]).columns:
            modes = selected_rows[col].mode()
            if not modes.empty:
                selected_rows[col] = selected_rows[col].fillna(modes.iloc[0])

        print("Imputed missing values with column mean for numeric columns and mode for categorical columns.")

        # Show number of rows and columns
        print(f"Number of rows: {selected_rows.shape[0]}")
        print(f"Number of columns: {selected_rows.shape[1]}")

        # NOTE(review): any column that is still object dtype is encoded,
        # including columns holding numbers mixed with text - confirm that
        # this is intended for the year columns.
        categorical_columns = selected_rows.select_dtypes(include=["object"]).columns

        if len(categorical_columns) == 0:
            print("No categorical columns found in the selected rows and columns.")
            results[sheet] = selected_rows
            continue

        # Build all indicator blocks, then attach them in one concat after
        # the remaining (non-categorical) columns.
        indicator_blocks = [
            pd.get_dummies(selected_rows[col], prefix=f"Category_{col}")
            for col in categorical_columns
        ]
        selected_encoded = pd.concat(
            [selected_rows.drop(columns=categorical_columns), *indicator_blocks],
            axis=1,
        )

        # Display the first few rows of the encoded DataFrame as a table
        print("One-Hot Encoded Data:")
        show_table(selected_encoded.head())
        results[sheet] = selected_encoded

    return results

# Run the encoding pipeline on the workbook that was just uploaded
encoded_sheets = apply_one_hot_encoding(temp_file)



Saving P1.xlsx to P1 (1).xlsx
Raw Data from sheet: Sheet1
                                   Regional Overview     2010     2011  \
0                              Asia and the Pacific:      NaN      NaN   
1                     HIV Epidemic Metrics Overview:      NaN      NaN   
2  Epidemic transition metrics (Trend of AIDS-rel...   310000   280000   
3  Epidemic transition metrics (Trend of new HIV ...   350000   350000   
4                            People living with HIV:  5700000  5700000   

      2012     2013     2014     2015     2016     2017     2018     2019  \
0      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN   
1      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN   
2   260000   240000   230000   220000   200000   190000   180000   160000   
3   340000   320000   320000   320000   310000   310000   310000   300000   
4  5800000  5800000  5900000  5900000  6000000  6100000  6200000  6300000   

      2020     2021     2022     2

Unnamed: 0,Regional Overview,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
282,Stigma and Discrimination (2023) (Per National):,... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],318.99
283,Total Average of all Stigma and Discrimination...,... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],... [... - ...],318.99
284,,,,,,,,,,,,,,,
285,,,,,,,,,,,,,,,
286,,,,,,,,,,,,,,,
287,Regional Overview,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023.0
288,Global,,,,,,,,,,,,,,
289,HIV Epidemic Metrics Overview:,,,,,,,,,,,,,,
290,Epidemic transition metrics (Trend of AIDS-rel...,1300000,1200000,1100000,1000000,980000,930000,880000,840000,790000,760000,730000,700000,670000,630000.0
291,Epidemic transition metrics (Trend of new HIV ...,2100000,2100000,2000000,1900000,1900000,1800000,1800000,1700000,1600000,1500000,1500000,1400000,1400000,1300000.0


Missing values per column:
Regional Overview    5
2010                 9
2011                 9
2012                 9
2013                 9
2014                 9
2015                 9
2016                 9
2017                 9
2018                 9
2019                 9
2020                 9
2021                 9
2022                 9
2023                 9
dtype: int64
Number of duplicate rows: 4
Imputed missing values with column mean for numeric columns and mode for categorical columns.
Number of rows: 24
Number of columns: 15
One-Hot Encoded Data:


  selected_rows[col] = selected_rows[col].fillna(selected_rows[col].mode()[0])


Unnamed: 0,Category_Regional Overview_AIDS-related deaths:,Category_Regional Overview_Coverage of people living with HIV receiving ART:,Category_Regional Overview_Elimination of Vertical Transmission Overview:,Category_Regional Overview_Epidemic transition metrics (Trend of AIDS-related deaths):,Category_Regional Overview_Epidemic transition metrics (Trend of new HIV infections):,Category_Regional Overview_Global,Category_Regional Overview_HIV Epidemic Metrics Overview:,Category_Regional Overview_New HIV infections:,Category_Regional Overview_People living with HIV:,Category_Regional Overview_Percent of people living with HIV who have suppressed viral loads (%):,...,Category_2023_77,Category_2023_78.33333333333333,Category_2023_86,Category_2023_318.99,Category_2023_630000,Category_2023_1300000,Category_2023_8752000,Category_2023_39900000,Category_2023_43760000,Category_2023_2023
282,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
283,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
284,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
285,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
286,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
