In [1]:
import pandas as pd
import numpy as np
import os

# Z-score outlier removal for every station CSV in `folder_path`.
# Each file is filtered independently (its own mean/std) and written to
# `processed_folder_path` under the SAME file name as the input.

# Folder containing the 18 station CSV files
folder_path = 'D:/7th Semi/Project/dataset/PM2.5_data_18_station'  # Replace with the actual folder path
processed_folder_path = 'D:/7th Semi/Project/dataset/Processed'  # Folder to save processed files

# Create the processed folder if it doesn't exist
os.makedirs(processed_folder_path, exist_ok=True)

# Rows whose |z-score| exceeds this value are treated as outliers
z_threshold = 3

# Name of the PM2.5 concentration column; adjust if the files use another label.
# Loop-invariant, so it is defined once instead of on every iteration.
column_name = 'PM2.5 (ug/m3)'

# Process each file in the folder
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Load the dataset
    data = pd.read_csv(file_path)

    if column_name not in data.columns:
        print(f"Column '{column_name}' not found in {file_name}, skipping.")
        continue

    # Per-file mean and (sample) standard deviation of the PM2.5 values
    mean_pm25 = data[column_name].mean()
    std_pm25 = data[column_name].std()

    # Z-scores are kept in a standalone Series rather than a temporary
    # 'Z-Score' column, so an input column with that name is never
    # overwritten and then dropped.
    z_scores = (data[column_name] - mean_pm25) / std_pm25

    # Drop only rows that are positively identified as outliers.
    # A NaN z-score (missing reading, or a constant column where std == 0)
    # is not evidence of an outlier, so those rows are kept. The previous
    # `abs(z) <= threshold` test silently deleted them. This matches the
    # `remove_outliers` function defined later in this notebook.
    non_outliers = data[~(z_scores.abs() > z_threshold)]

    # Save under the original base name inside the processed folder
    processed_file_name = f"{os.path.splitext(file_name)[0]}.csv"
    processed_file_path = os.path.join(processed_folder_path, processed_file_name)
    non_outliers.to_csv(processed_file_path, index=False)

    print(f"Processed file saved: {processed_file_path}")


Processed file saved: D:/7th Semi/Project/dataset/Processed\Ampara.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Anuradhapura.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\BattaramullaCEA.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Batticaloa.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Fort.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Galle.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Hambanthota.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Jaffna.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Kandy.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Kanthale.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Katubedda.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Kilinochchi.csv
Processed file saved: D:/7th Semi/Project/dataset/Processed\Mannar.csv
Processed file saved: D:/7th Semi/Project/datas

In [2]:
import pandas as pd

def remove_columns(input_file, output_file, columns_to_remove):
    """
    Write a copy of a CSV file with the given columns stripped out.

    Names in ``columns_to_remove`` that do not exist in the file are
    ignored rather than raising. The input file itself is not modified.

    :param input_file: Path to the input CSV file.
    :param output_file: Path to save the modified CSV file.
    :param columns_to_remove: List of column names to remove.
    """
    # Read, drop and keep the result as a new frame instead of mutating in place
    trimmed = pd.read_csv(input_file).drop(columns=columns_to_remove, errors='ignore')

    # Persist without the positional index so the output mirrors the input layout
    trimmed.to_csv(output_file, index=False)
    print(f"Modified dataset saved to {output_file}")

# Example usage: read one CSV, drop the listed columns, write the result
input_csv = "Ampa.csv"  # Replace with your actual file
output_csv = "Ampara.csv"
columns_to_remove = [  # Replace with actual column names
    "PM1.0 (ug/m3)",
    "PM4.0 (ug/m3)",
    "PM10.0 (ug/m3)",
    "PM Sensor Status",
    "Device Status",
]

remove_columns(
    input_file=input_csv,
    output_file=output_csv,
    columns_to_remove=columns_to_remove,
)


Modified dataset saved to Ampara.csv


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

def remove_outliers(input_folder, output_folder, pm25_column='PM2.5 (ug/m3)', threshold=3):
    """
    Remove Z-Score outliers from every CSV file in a folder.

    Each ``*.csv`` file in ``input_folder`` is read with its
    ``timestamp_index`` column parsed as dates and used as the index.
    Rows whose absolute Z-Score in ``pm25_column`` exceeds ``threshold``
    are dropped, and the result is written to ``output_folder`` as
    ``Cleaned_<original file name>``. Z-Scores are computed per file from
    that file's own mean and sample standard deviation.

    Rows whose Z-Score is NaN (missing reading, or a constant column with
    zero standard deviation) are kept, because they are not identified as
    outliers. Files that do not contain ``pm25_column`` are skipped with a
    message instead of aborting the whole batch.

    :param input_folder: Path to the folder containing input CSV files.
    :param output_folder: Path to the folder to save cleaned CSV files.
    :param pm25_column: Name of the PM2.5 column.
    :param threshold: Z-Score threshold for identifying outliers.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_folder, exist_ok=True)

    for file in os.listdir(input_folder):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(input_folder, file)
        df = pd.read_csv(file_path, parse_dates=['timestamp_index'], index_col='timestamp_index')

        # One malformed file should not abort the remaining files
        if pm25_column not in df.columns:
            print(f"Column '{pm25_column}' not found in {file}, skipping.")
            continue

        # Keep the Z-Scores in a standalone Series: writing them to a
        # temporary 'Z-Score' column and dropping it afterwards would destroy
        # a genuine input column with that name.
        pm25 = df[pm25_column]
        z_scores = (pm25 - pm25.mean()) / pm25.std()

        # `~(… > threshold)` rather than `… <= threshold` so NaN Z-Scores are kept
        df_cleaned = df[~(z_scores.abs() > threshold)]

        # Save cleaned dataset (the timestamp index is written as the first column)
        output_file_path = os.path.join(output_folder, f'Cleaned_{file}')
        df_cleaned.to_csv(output_file_path)
        print(f"Processed and saved: {output_file_path}")

# Example usage: clean every CSV in `input_folder` and write the results to `output_folder`
input_folder = "Preprocess1"  # Replace with your input folder path
output_folder = "Preprocess2"  # Replace with your output folder path
remove_outliers(input_folder=input_folder, output_folder=output_folder)


Processed and saved: Preprocess2\Cleaned_Ampara.csv
Processed and saved: Preprocess2\Cleaned_Anuradhapura.csv
Processed and saved: Preprocess2\Cleaned_BattaramullaCEA.csv
Processed and saved: Preprocess2\Cleaned_Batticaloa.csv
Processed and saved: Preprocess2\Cleaned_Fort.csv
Processed and saved: Preprocess2\Cleaned_Galle.csv
Processed and saved: Preprocess2\Cleaned_Hambanthota.csv
Processed and saved: Preprocess2\Cleaned_Jaffna.csv
Processed and saved: Preprocess2\Cleaned_Kandy.csv
Processed and saved: Preprocess2\Cleaned_Kanthale.csv
Processed and saved: Preprocess2\Cleaned_Katubedda.csv
Processed and saved: Preprocess2\Cleaned_Kilinochchi.csv
Processed and saved: Preprocess2\Cleaned_Mannar.csv
Processed and saved: Preprocess2\Cleaned_Matara.csv
Processed and saved: Preprocess2\Cleaned_Monaragala.csv
Processed and saved: Preprocess2\Cleaned_Mullativu.csv
Processed and saved: Preprocess2\Cleaned_PointPedro.csv
Processed and saved: Preprocess2\Cleaned_Polonnaruwa.csv
