In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('air_pollution_plant_data.csv', encoding="latin1")

# Display the first few rows of the dataset
print(data.head())

# The 'Age' column is not present in the dataset, so we remove the line attempting to convert it.
# data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# 1. Data Cleaning
# Updated num_features to include only available numeric columns
# Based on the `data` variable output, the numeric columns are 'CO', 'NO2', 'SO2', 'PM2.5', 'PM10'
num_features = ['CO', 'NO2', 'SO2', 'PM2.5', 'PM10']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Updated cat_features to include only available categorical columns
# Based on the `data` variable output, the categorical column is 'plant'
cat_features = ['plant']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps of Input Data
# Check if categorical features exist before adding the transformer
transformers = [("num", num_transformer, num_features)]
if cat_features:
    transformers.append(("cat", cat_transformer, cat_features))

preprocessor = ColumnTransformer(
    transformers=transformers
)
preprocessor.set_output(transform="pandas")

# Apply the transformations to the Input data
data_preprocessed = preprocessor.fit_transform(data)
# Generate more readable column names
data_preprocessed.columns = ['_'.join(col).replace(' ', '_') if isinstance(col, tuple) else col.replace(' ', '_') for col in data_preprocessed.columns]
print(data_preprocessed.head()) # Display head of preprocessed data

# 3. Data Splitting
# Assuming 'PM10' is the target variable for this dataset based on available columns and typical air pollution tasks
# If 'Score' was intended as the target, the dataset would need to include it.
# We will use 'PM10' as a placeholder target for demonstration.
# If a different column is the actual target, please specify.
target_column = 'num__PM10' # Using the preprocessed column name for 'PM10'
X = data_preprocessed.drop(columns=[target_column])
y = data_preprocessed[target_column]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows
print("\nX_train head:")
print(X_train.head())
print("\ny_train head:")
print(y_train.head())

         CO        NO2        SO2       PM2.5        PM10         plant
0  3.745401  39.363552  37.364082   74.950537  145.999662   Snake Plant
1  9.507143  47.343566  33.291210  112.012016   36.902399   Bamboo Palm
2  7.319939  85.454739  17.615391   84.400017   69.327939   Bamboo Palm
3  5.986585  34.000439  60.726667   12.495387  132.656127  Spider Plant
4  1.560186  86.964968  47.662416   27.837036   96.417869   Bamboo Palm
    num__CO  num__NO2  num__SO2  num__PM2.5  num__PM10  cat__plant_Areca_Palm  \
0 -0.422272 -0.342632 -0.439341   -0.027849   0.766234                    0.0   
1  1.567248 -0.063208 -0.579427    0.830719  -1.123142                    0.0   
2  0.812010  1.271273 -1.118595    0.191058  -0.561588                    0.0   
3  0.351605 -0.530424  0.364213   -1.474689   0.535147                    0.0   
4 -1.176823  1.324154 -0.085130   -1.119283  -0.092437                    0.0   

   cat__plant_Bamboo_Palm  cat__plant_Snake_Plant  cat__plant_Spider_Plant  
0   