In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Import the dataset
data = pd.read_csv('business.csv')

# Select only numerical columns for imputation
numerical_cols = data.select_dtypes(include='number').columns


# Handling missing values using mean imputation for numerical columns
imputer = SimpleImputer(strategy='mean')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# Filling missing values in categorical columns with the mode
for column in data.select_dtypes(include='object').columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# Checking if there are any missing values left
print("Missing values after imputation:")
print(data.isnull().sum())

# Handling outliers using z-score method
from scipy import stats

# Calculate z-scores for numerical columns
z_scores = np.abs(stats.zscore(data.select_dtypes(include='number')))

# Setting a threshold for identifying outliers (z-score > 3)
outliers = np.where(z_scores > 3)

# Remove the outliers
data_no_outliers = data[(z_scores < 3).all(axis=1)]

print(f"Data size after removing outliers: {data_no_outliers.shape}")

# Export the preprocessed dataset
data_no_outliers.to_csv('preprocessed_dataset.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


Missing values after imputation:
Year                           0
Industry_aggregation_NZSIOC    0
Industry_code_NZSIOC           0
Industry_name_NZSIOC           0
Units                          0
Variable_code                  0
Variable_name                  0
Variable_category              0
Value                          0
Industry_code_ANZSIC06         0
dtype: int64
Data size after removing outliers: (50985, 10)


In [None]:
from google.colab import drive
drive.mount('/content/drive')