# TABULAR DATA PREPROCESSING

## 1. Imports and settings

Import required libraries and configure display options.


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Display settings
# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## 2. Load dataset

Load a CSV file into a DataFrame. Change DATA_PATH to your file path.


In [None]:
# Path of the CSV file to load; change DATA_PATH to point at your own dataset.
DATA_PATH = "tabular_data.csv"

# Read the file into the DataFrame that every later section works on.
df = pd.read_csv(DATA_PATH)

## 3. Initial inspection

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics using describe()'s default column selection.
df.describe()

In [None]:
# Descriptive statistics restricted to object-dtype columns.
# NOTE(review): this is expected to fail when the frame has no object columns — confirm for your data.
df.describe(include='object')

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## 4. Exploratory Data Analysis (EDA)

Visualize numerical and categorical distributions, boxplots for outliers, and correlation heatmap.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column labels grouped by dtype family; the EDA helpers below take these as input.
numeric_columns = df.select_dtypes(include="number").columns
categorical_columns = df.select_dtypes(
    include=["object", "category", "bool"]
).columns

### 4.1 Numerical features distribution

In [None]:
def plot_numeric_distributions(df, num_cols, bins=50):
    """Show a histogram with a KDE overlay for every column in ``num_cols``.

    Each column is drawn in its own 8x4 figure and displayed immediately.
    Missing values are dropped from the column before plotting.
    """
    for column in num_cols:
        plt.figure(figsize=(8, 4))
        observed = df[column].dropna()
        sns.histplot(observed, kde=True, bins=bins)
        plt.title(f'Distribution: {column}')
        plt.show()

In [None]:
# plot_numeric_distributions(df, numeric_columns)

### 4.2 Categorical features count

In [None]:
def plot_categorical_counts(df, cat_cols, top_n=20):
    """Show a horizontal count plot for every column in ``cat_cols``.

    Only the ``top_n`` most frequent values of each column are plotted,
    ordered from most to least frequent. One figure is shown per column.
    """
    for column in cat_cols:
        plt.figure(figsize=(6, 4))
        series = df[column]
        # value_counts() is sorted by frequency, so the first top_n labels
        # are the most common ones.
        most_frequent = series.value_counts().index[:top_n]
        sns.countplot(y=series, order=most_frequent)
        plt.title(f'Counts: {column}')
        plt.show()

In [None]:
# plot_categorical_counts(df, categorical_columns)

### 4.3 Correlation heatmap

In [None]:
def correlation_heatmap(df, num_cols):
    """Show an annotated heatmap of the pairwise correlations of ``num_cols``.

    The correlation matrix comes from ``DataFrame.corr()`` with its default
    settings; each cell is annotated with the value to two decimals.
    """
    correlations = df[num_cols].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlations,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
    )
    plt.title('Correlation Heatmap')
    plt.show()

In [None]:
# correlation_heatmap(df, numeric_columns)

### 4.4 Boxplot to check outliers

In [None]:
def plot_boxplots(df, num_cols):
    """Show a horizontal boxplot for every column in ``num_cols``.

    Missing values are dropped from each column before plotting; every
    column gets its own 6x2 figure, displayed immediately.
    """
    for column in num_cols:
        plt.figure(figsize=(6, 2))
        observed = df[column].dropna()
        sns.boxplot(x=observed)
        plt.title(f'Boxplot: {column}')
        plt.show()

In [None]:
# plot_boxplots(df, numeric_columns)

## 5. Missing Value Handling


### 5.1 Drop Columns
Use drop_columns when you want to drop entire useless columns from the DataFrame.

In [None]:
def drop_columns(df, columns):
    """Return a new DataFrame without ``columns``; ``df`` itself is not modified."""
    return df.drop(columns, axis=1)

In [None]:
# drop_columns(df, ['col1', 'col2'])

### 5.2 Drop Rows
Use drop_missing when the percentage of rows with missing data is very low.

In [None]:
def drop_missing(df):
    """Return a new DataFrame containing only the rows of ``df`` with no missing values."""
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows

In [None]:
# df = drop_missing(df)

### 5.3 Statistical Imputation
Use fill_with_statistical for numerical features when distribution is stable.
> Note: Split your dataset before applying **statistical imputation**; computing the fill value on the full data leaks information from the validation/test sets into training.

In [None]:
def fill_with_statistical(df, num_col, strategy="mean"):
    """Fill missing values in ``num_col`` with the mean, median or mode.

    Args:
        df: DataFrame to impute. It is modified in place and also returned.
        num_col: A single column label or a list of column labels. With a
            list, each column is filled with its own statistic.
        strategy: One of "mean", "median" or "mode".

    Returns:
        The same DataFrame object, with the missing values filled.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    selected = df[num_col]
    if strategy == "mean":
        fill_values = selected.mean()
    elif strategy == "median":
        fill_values = selected.median()
    elif strategy == "mode":
        modes = selected.mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with.
            return df
        # When several values tie, the first mode is used.
        fill_values = modes.iloc[0]
    else:
        # A mistyped strategy used to be ignored silently, returning the
        # data un-imputed; fail loudly instead.
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'mean', 'median' or 'mode'."
        )
    df[num_col] = selected.fillna(fill_values)
    return df

In [None]:
# df = fill_with_statistical(df, num_col=['num_col1', 'num_col2'], strategy="mean")

### 5.4 Categorical Imputation
Use fill_categorical for categorical features.
> Note: Split your dataset before applying **categorical imputation**; computing the mode on the full data leaks information from the validation/test sets into training.

In [None]:
def fill_categorical(df, cat_cols):
    """Fill missing values in each categorical column with that column's mode.

    Args:
        df: DataFrame to impute. It is modified in place and also returned.
        cat_cols: Iterable of column labels to fill.

    Returns:
        The same DataFrame object. Columns that contain no observed value
        at all are left untouched because they have no mode.
    """
    for col in cat_cols:
        modes = df[col].mode()
        if modes.empty:
            # An all-missing column has no mode; indexing it used to raise.
            continue
        # When several values tie, the first mode is used.
        df[col] = df[col].fillna(modes.iloc[0])
    return df

In [None]:
# df = fill_categorical(df, cat_cols=['cat_col1', 'cat_col2'])

### 5.5 Forward and Backward Fill
Use forward/backward fill for time-series or otherwise ordered data, where a neighbouring row is a reasonable stand-in for a missing value.

In [None]:
def fill_with_ffill_bfill(df, columns, method="ffill"):
    """Fill missing values by propagating neighbouring observations.

    Args:
        df: DataFrame to fill. It is modified in place and also returned.
        columns: Column label or list of column labels to fill.
        method: "ffill" (alias "pad") carries the previous valid value
            forward; "bfill" (alias "backfill") carries the next valid
            value backward.

    Returns:
        The same DataFrame object. Leading gaps (ffill) or trailing gaps
        (bfill) have no neighbour to copy from and stay missing.

    Raises:
        ValueError: If ``method`` is not a supported fill direction.
    """
    # fillna(method=...) is deprecated in pandas 2.x; the dedicated
    # ffill()/bfill() methods are the supported replacements.
    if method in ("ffill", "pad"):
        df[columns] = df[columns].ffill()
    elif method in ("bfill", "backfill"):
        df[columns] = df[columns].bfill()
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'ffill' or 'bfill'."
        )
    return df

In [None]:
# df = fill_with_ffill_bfill(df, columns=['num_col1', 'num_col2'], method="ffill")

### 5.6 Imputation Techniques
Use KNN imputation when you expect relationships between features.
> Note: Split your dataset before applying **KNN imputation**; fitting the imputer on the full data leaks information from the validation/test sets into training.

In [None]:
from sklearn.impute import KNNImputer
def fill_with_knn(df, numeric_columns, n_neighbors=3):
    """Impute missing values in ``numeric_columns`` with a KNN imputer.

    A ``KNNImputer`` with ``n_neighbors`` neighbours is fitted on the
    selected columns of ``df`` and its output is written back into those
    columns. ``df`` is modified in place and also returned.
    """
    knn = KNNImputer(n_neighbors=n_neighbors)
    imputed = knn.fit_transform(df[numeric_columns])
    df[numeric_columns] = imputed
    return df

In [None]:
# df = fill_with_knn(df, numeric_columns=['num_col1', 'num_col2'])

## 6. Outlier Detection & Treatment

Implement IQR-based and Z-score methods.

### 6.1 IQR Outlier Removal

In [None]:
def remove_outliers_iqr(df, num_cols, k=1.5, verbose=True):
    """Drop rows whose value lies outside the IQR fences of any given column.

    Columns are processed one after another, so the quartiles of a later
    column are computed on the rows that survived the earlier columns.

    Args:
        df: Input DataFrame; it is copied and never modified.
        num_cols: Column label or iterable of column labels to check.
        k: Fence multiplier; a row is kept when its value lies in
            ``[Q1 - k * IQR, Q3 + k * IQR]``.
        verbose: When true, print how many rows each column removed.

    Returns:
        A filtered copy of ``df``. Rows whose value is missing are kept:
        a missing value is not an outlier and is left to the
        missing-value handling step.
    """
    df = df.copy()
    # A bare column label would otherwise be iterated character by character.
    columns = [num_cols] if np.isscalar(num_cols) else num_cols
    for c in columns:
        q1 = df[c].quantile(0.25)
        q3 = df[c].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - k * iqr
        upper = q3 + k * iqr
        before = len(df)
        within_fences = (df[c] >= lower) & (df[c] <= upper)
        # NaN fails both comparisons; without this, rows with a missing
        # value would be dropped and reported as outliers.
        df = df[within_fences | df[c].isna()]
        after = len(df)
        if verbose:
            print(f"Column {c}: removed {before-after} rows using IQR (k={k})")
    return df

In [None]:
# remove_outliers_iqr(df, numeric_columns)

### 6.2 Z-Score Outlier Removal

In [None]:
import scipy.stats as stats
def remove_outliers_zscore(df, num_col, z_thresh=3.0, verbose=True):
    """Drop rows whose absolute z-score reaches ``z_thresh`` in any given column.

    Args:
        df: Input DataFrame; it is copied and never modified.
        num_col: A single column label or an iterable of column labels.
        z_thresh: A row is removed when ``|z| >= z_thresh`` in at least one
            of the columns.
        verbose: When true, print how many rows were removed.

    Returns:
        A filtered copy of ``df``. Rows with a missing value and columns
        with zero variance never count as outliers, because their z-score
        is undefined.
    """
    df = df.copy()
    # Accept one label as well as a list/Index of labels.
    columns = [num_col] if np.isscalar(num_col) else list(num_col)
    values = df[columns]
    # Population standard deviation (ddof=0), the same convention as
    # scipy.stats.zscore. Missing values are skipped when computing the
    # statistics and give a NaN z-score, as does a constant column (0 / 0).
    z_scores = ((values - values.mean()) / values.std(ddof=0)).abs()
    # Comparisons with NaN are False, so undefined z-scores are not flagged.
    is_outlier = (z_scores >= z_thresh).any(axis=1)
    before = len(df)
    df = df.loc[~is_outlier]
    after = len(df)
    if verbose:
        print(f"Removed {before-after} rows by z-score threshold {z_thresh}")
    return df

In [None]:
# remove_outliers_zscore(df, numeric_columns)

## 7. Feature Engineering

Examples: ratio features, date extraction, interaction terms.


### 7.1 Ratio Feature

In [None]:
def add_ratio_feature(df, numerator, denominator, new_name=None):
    """Add a column holding ``df[numerator] / df[denominator]``.

    Args:
        df: Input DataFrame; it is copied and never modified.
        numerator: Label of the numerator column.
        denominator: Label of the denominator column.
        new_name: Name of the new column. Defaults to
            ``"<numerator>_over_<denominator>"``.

    Returns:
        A copy of ``df`` with the ratio column added. Rows whose
        denominator is 0 get NaN instead of an infinite ratio.
    """
    df = df.copy()
    new_name = new_name or f"{numerator}_over_{denominator}"
    # Zeros are turned into NaN, so no epsilon is needed; adding one to the
    # denominator would only skew every ratio slightly.
    safe_denominator = df[denominator].replace(0, np.nan)
    df[new_name] = df[numerator] / safe_denominator
    return df

In [None]:
# df = add_ratio_feature(df, 'feature1', 'feature2', new_name='feature_ratio')

### 7.2 Date Extraction

In [None]:
def extract_date_parts(df, date_col):
    """Return a copy of ``df`` with year/month/day/weekday columns for ``date_col``.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so
    values that cannot be parsed become missing in the new columns. The new
    columns are named ``<date_col>_year``, ``_month``, ``_day`` and
    ``_weekday``; the original column is kept.
    """
    result = df.copy()
    parsed = pd.to_datetime(result[date_col], errors='coerce')
    for part in ("year", "month", "day", "weekday"):
        result[f"{date_col}_{part}"] = getattr(parsed.dt, part)
    return result

In [None]:
# df = extract_date_parts(df, 'date_column')

## 8. Feature Selection

Selecting the most important features improves model performance and reduces overfitting.

In [None]:
# change target column
# y is the label column; X is every other column of df.
y = df["target"]
X = df.drop(columns=["target"])

### 8.1 SelectKBest (Univariate Selection)

Selects top K features based on statistical tests.

- chi2 → for non-negative features (e.g., counts, frequencies).
- f_classif → for continuous numerical features in classification problems.
- Useful for quick filtering before model training.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
def select_features_statistical(X, y, method, k):
    """Pick the ``k`` best features of ``X`` according to a univariate test.

    Args:
        X: Feature DataFrame; its column labels name the selected features.
        y: Target values aligned with ``X``.
        method: Scoring function handed to ``SelectKBest`` (for example
            ``chi2`` or ``f_classif``).
        k: Number of features to keep.

    Returns:
        The labels of the selected columns, so the selection can be reused
        (for example ``X[selected]``). They are also printed.
    """
    selector = SelectKBest(score_func=method, k=k)
    # Only the fitted support mask is needed, so fit() is enough; the
    # transformed array used to be computed and thrown away.
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]
    print(f"Selected Top {k} Features:")
    print(selected_features)
    return selected_features

In [None]:
# Score every column of X against y with f_classif and report the 10 best.
select_features_statistical(X, y, method=f_classif, k=10)

### 8.2 Recursive Feature Elimination (RFE)

Iteratively trains a model and removes the least important features.

- More computationally expensive.
- Works best when you have a moderate number of features (< 100).
- Can be used with any estimator that exposes a coef_ or feature_importances_ attribute

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Define model
# NOTE(review): max_iter=1000 is presumably set to give the solver room to converge — confirm it is enough for your data.
model = LogisticRegression(max_iter=1000)

# Apply RFE
# Keep 10 features; RFE is fitted on the full X / y defined earlier in the notebook.
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X, y)

# rfe.support_ is used as a mask over X.columns to recover the names of the kept features.
selected_features_rfe = X.columns[rfe.support_]
print("Selected Features using RFE:")
print(selected_features_rfe)

### 8.3 Feature Importance (Tree-based Models)

Uses built-in feature importance scores from tree-based models (e.g., Random Forest, XGBoost).

- Works only with tree-based models
- Captures non-linear relationships.
- Provides insights into feature relationships and importance.


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Train a Random Forest model
# random_state is fixed so repeated runs use the same seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Get feature importances
importances = rf.feature_importances_
# argsort is ascending, so reverse it to rank features from most to least important.
indices = np.argsort(importances)[::-1]
selected_features_rf = X.columns[indices[:10]]

print("Top 10 Important Features (Random Forest):")
print(selected_features_rf)

# Plot feature importances
plt.figure(figsize=(8, 5))
plt.barh(X.columns[indices[:10]], importances[indices[:10]])
# Flip the y-axis so the first (most important) feature is drawn at the top.
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances (Random Forest)")
plt.xlabel("Importance Score")
plt.show()


## 9. Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 20% of the rows as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ... then take 20% of the remaining 80% as validation: roughly 64% train / 16% validation / 20% test.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [None]:
# Shapes of the three feature splits.
print("X Train set size:", X_train.shape)
print("X Validation set size:", X_val.shape)
print("X Test set size:", X_test.shape)

# Shapes of the matching target splits.
print("y Train set size:", y_train.shape)
print("y Validation set size:", y_val.shape)
print("y Test set size:", y_test.shape)


## 10. Encoding Categorical Variables

### 10.1 One-Hot Encoding
Best for categorical features without ordinal relationship (e.g. color, city).

In [None]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense array; it replaces the
# old `sparse` keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
# drop='first' leaves out the first category's column.
encoder = OneHotEncoder(sparse_output=False, drop='first')
# Fit on the training split only, then reuse the fitted encoder for val/test.
color_encoded_train = encoder.fit_transform(X_train[['color']])
color_encoded_val = encoder.transform(X_val[['color']])
color_encoded_test = encoder.transform(X_test[['color']])

# drop original categorical columns
X_train = X_train.drop(columns=['color'])
X_val = X_val.drop(columns=['color'])
X_test = X_test.drop(columns=['color'])

# concatenate the encoded features with the original dataframe
# join() aligns on the index, and the splits keep their shuffled original row
# labels. The encoded frames must carry the same index, otherwise rows are
# matched against a fresh 0..n-1 range and end up misaligned or NaN.
encoded_columns = encoder.get_feature_names_out(['color'])
X_train = X_train.join(pd.DataFrame(color_encoded_train, columns=encoded_columns, index=X_train.index))
X_val = X_val.join(pd.DataFrame(color_encoded_val, columns=encoded_columns, index=X_val.index))
X_test = X_test.join(pd.DataFrame(color_encoded_test, columns=encoded_columns, index=X_test.index))

### 10.2 Label Encoding
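Best for binary categorical features (e.g. gender). For nominal features with more than two categories prefer one-hot encoding, because integer labels imply an order that does not exist.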

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the label -> integer mapping from the training split only.
X_train['gender_encoded'] = le.fit_transform(X_train['gender'])
X_train.drop('gender', axis=1, inplace=True)

# Reuse the fitted mapping for validation and test.
# NOTE(review): transform() is expected to fail on labels not seen during fit — confirm val/test contain no new values.
X_val['gender_encoded'] = le.transform(X_val['gender'])
X_val.drop('gender', axis=1, inplace=True)

X_test['gender_encoded'] = le.transform(X_test['gender'])
X_test.drop('gender', axis=1, inplace=True)


### 10.3 Ordinal Encoding
Best for ordinal categorical features (e.g. education level).

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# The category list is given explicitly, lowest to highest level, so the order
# is fixed by hand rather than inferred from the data.
# Note: this rebinds the name `encoder` used by the one-hot cell above.
encoder = OrdinalEncoder(categories=[['High School','Bachelor','Master','PhD']])
# Fit on the training split only.
X_train['education_encoded'] = encoder.fit_transform(X_train[['education']])
X_train.drop('education', axis=1, inplace=True)

# Apply the same fitted encoder to validation and test.
X_val['education_encoded'] = encoder.transform(X_val[['education']])
X_val.drop('education', axis=1, inplace=True)

X_test['education_encoded'] = encoder.transform(X_test[['education']])
X_test.drop('education', axis=1, inplace=True)

### 10.4 Frequency Encoding

Used for categorical columns with many unique values (high cardinality).

In [None]:
# Relative frequency of each category, learned from the training split only.
freq = X_train['category'].value_counts(normalize=True)
# fillna(0.0) on every split: a missing or never-seen category has no entry in
# `freq` and would map to NaN; a frequency of 0 is used for it instead.
X_train['category_freq_enc'] = X_train['category'].map(freq).fillna(0.0)
X_train = X_train.drop(columns=['category'])

X_val['category_freq_enc'] = X_val['category'].map(freq).fillna(0.0)
X_val = X_val.drop(columns=['category'])

X_test['category_freq_enc'] = X_test['category'].map(freq).fillna(0.0)
X_test = X_test.drop(columns=['category'])

## 11. Numerical Feature Scaling

Choose scaler depending on data distribution:


### 11.1 StandardScaler

Useful when features follow a **Gaussian distribution**.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# numeric_columns was computed on the full `df`, so it can still list the
# target or columns dropped since then. Scale only the ones still in X_train.
scale_columns = [col for col in numeric_columns if col in X_train.columns]
# Fit on the training split only, then apply the same statistics to val/test.
X_train[scale_columns] = scaler.fit_transform(X_train[scale_columns])
X_val[scale_columns] = scaler.transform(X_val[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

### 11.2 MinMaxScaler

Useful when features have **different scales** but known **min/max ranges**.

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# scikit-learn scalers need 2-D input, so select the column with a list
# (one-column DataFrame) rather than as a 1-D Series.
# Fit on the training split only, then reuse the fitted range for val/test.
X_train[["name_of_num_col"]] = scaler.fit_transform(X_train[["name_of_num_col"]])
X_val[["name_of_num_col"]] = scaler.transform(X_val[["name_of_num_col"]])
X_test[["name_of_num_col"]] = scaler.transform(X_test[["name_of_num_col"]])

### 11.3 RobustScaler

Useful for data with **outliers**.

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# scikit-learn scalers need 2-D input, so select the column with a list
# (one-column DataFrame) rather than as a 1-D Series.
# Fit on the training split only, then reuse the fitted statistics for val/test.
X_train[['name_of_num_col']] = scaler.fit_transform(X_train[['name_of_num_col']])
X_val[['name_of_num_col']] = scaler.transform(X_val[['name_of_num_col']])
X_test[['name_of_num_col']] = scaler.transform(X_test[['name_of_num_col']])

### 11.4 Log Transformation

For **skewed data** to make it more normal.

In [None]:
import numpy as np
# log1p computes log(1 + x), so zeros stay finite. Nothing is fitted here, so
# each split is transformed independently.
# NOTE(review): assumes the column holds values greater than -1 — confirm before use.
X_train['name_of_num_col'] = np.log1p(X_train['name_of_num_col'])
X_val['name_of_num_col'] = np.log1p(X_val['name_of_num_col'])
X_test['name_of_num_col'] = np.log1p(X_test['name_of_num_col'])

## 12. Model Training

### Common models — short description and usage

- **Logistic Regression**  
    Description: Linear model for binary (or multiclass via one-vs-rest) classification; fast, interpretable.  
    Usage:
    ```python
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)
    ```

- **Decision Tree**  
    Description: Tree-based model that captures non-linear relationships and interactions; easy to visualize, prone to overfitting.  
    Usage:
    ```python
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42, max_depth=5)
    model.fit(X_train, y_train)
    ```

- **Random Forest**  
    Description: Ensemble of decision trees (bagging); robust, less overfitting than single trees, good out-of-the-box.  
    Usage:
    ```python
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=42, n_estimators=100)
    model.fit(X_train, y_train)
    ```

- **Gradient Boosting (sklearn / XGBoost / LightGBM / CatBoost)**  
    Description: Sequential tree boosting; often higher accuracy than random forest, requires tuning, handles complex patterns.  
    Usage (sklearn):
    ```python
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(random_state=42, n_estimators=100)
    model.fit(X_train, y_train)
    ```
    For speed/large data prefer XGBoost/LightGBM/CatBoost APIs.

- **Support Vector Machine (SVM)**  
    Description: Effective in high-dimensional spaces, with kernel trick for non-linear decision boundaries; sensitive to feature scaling.  
    Usage:
    ```python
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', C=1.0, probability=True)
    model.fit(X_train, y_train)
    ```

- **K-Nearest Neighbors (KNN)**  
    Description: Instance-based, non-parametric; simple, no training time but expensive at predict time, sensitive to scaling and noise.  
    Usage:
    ```python
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    ```

- **Naive Bayes (Gaussian/Bernoulli/Multinomial)**  
    Description: Probabilistic classifiers assuming feature independence; very fast and works well on text/low-sample problems.  
    Usage:
    ```python
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X_train, y_train)
    ```

- **Multilayer Perceptron (Neural Network)**  
    Description: Feedforward neural network for complex non-linear mappings; needs tuning and normalization, can be slow on large data.  
    Usage:
    ```python
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=(100,), random_state=42, max_iter=300)
    model.fit(X_train, y_train)
    ```

- **Linear Regression / Ridge / Lasso** (regression)  
    Description: Baseline linear models for continuous targets; Ridge/Lasso add regularization to control overfitting.  
    Usage:
    ```python
    from sklearn.linear_model import LinearRegression, Ridge, Lasso
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    ```

- **Support Vector Regressor (SVR) / KNN Regressor** (regression)  
    Description: SVR for robust non-linear regression (with kernels); KNNRegressor for simple non-parametric regression.  
    Usage (SVR):
    ```python
    from sklearn.svm import SVR
    model = SVR(kernel='rbf', C=1.0)
    model.fit(X_train, y_train)
    ```

- **K-Means (clustering)**  
    Description: Unsupervised clustering by centroid assignment; fast but assumes spherical clusters and requires k.  
    Usage:
    ```python
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=3, random_state=42).fit(X)
    labels = km.labels_
    ```

- **PCA (dimensionality reduction)**  
    Description: Linear projection to principal components for compression, noise reduction, or visualization.  
    Usage:
    ```python
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2).fit(X)
    X_reduced = pca.transform(X)
    ```

Notes: choose models based on problem type (classification/regression), data size, feature scaling, interpretability needs, and compute budget. Always validate with cross-validation and tune hyperparameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline model: fit on the training split and predict the validation split
# (the test split is not touched here).
# Note: this rebinds the name `model` used by the RFE cell earlier in the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Report accuracy, the confusion matrix and the classification report on validation data.
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
