In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the city-level CSV into a DataFrame; the file is decoded as latin1
data = pd.read_csv('Cities1.csv', encoding="latin1")

# Sanity-check the load by printing the first five rows
print(data.head(n=5))

# 1. Data Cleaning
# Numeric columns that get imputed and standardised.
num_features = ['AirQuality', 'WaterPollution']

# No column is routed through a categorical transformer. The text columns
# (City, Region, Country) are neither imputed nor encoded here; they reach the
# output unchanged through the passthrough remainder configured below.
cat_features = []

# Numeric branch: fill missing values with the column mean, then standardise
# with StandardScaler.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Column-wise preprocessing of the input data.
# - "num" applies the numeric branch to num_features.
# - remainder='passthrough' keeps every other column as-is.
# set_output returns the transformer itself, so it is chained onto the
# constructor; it makes transform/fit_transform return a pandas DataFrame.
preprocessor = ColumnTransformer(
    [("num", num_transformer, num_features)],
    remainder='passthrough',
).set_output(transform="pandas")

# Fit on the full frame and transform it in one step.
data_preprocessed = preprocessor.fit_transform(data)

# Output columns carry the transformer name as a prefix (num__..., remainder__...);
# print the first five rows to inspect the resulting names before any renaming.
print(data_preprocessed.head(n=5))


# 3. Data Splitting
# This dataset has no 'Potability' column (its columns are City, Region, Country,
# AirQuality and WaterPollution), so there is no obvious target variable for a
# typical classification/regression task.
# 'AirQuality' or 'WaterPollution' could serve as the target, with the remaining
# columns kept as features, but no choice has been made yet.
# The train/test split is therefore left commented out below as a template:
# replace 'remainder__Potability' with the chosen target column before enabling it.

# X = data_preprocessed.drop(columns=['remainder__Potability'])
# y = data_preprocessed['remainder__Potability']


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows
# print(X_train.head())
# print(y_train.head())

               City                Region                   Country  \
0     New York City              New York  United States of America   
1  Washington, D.C.  District of Columbia  United States of America   
2     San Francisco            California  United States of America   
3            Berlin                   NaN                   Germany   
4       Los Angeles            California  United States of America   

   AirQuality  WaterPollution  
0   46.816038       49.504950  
1   66.129032       49.107143  
2   60.514019       43.000000  
3   62.364130       28.612717  
4   36.621622       61.299435  
   num__AirQuality  num__WaterPollution   remainder__City  \
0        -0.498933             0.189768     New York City   
1         0.125258             0.174266  Washington, D.C.   
2        -0.056218            -0.063731     San Francisco   
3         0.003577            -0.624405            Berlin   
4        -0.828414             0.649401       Los Angeles   

      remainde

In [4]:
# Print the column labels of the raw (un-preprocessed) frame held in `data`
print(data.columns)

Index(['City', 'Region', 'Country', 'AirQuality', 'WaterPollution'], dtype='object')
