#  Step 5: Data Wrangling – Bosch QA Failure Prediction


## 1. Load and Preview Data
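Load the first 100,000 rows of the numeric, categorical, and date training files, then preview the numeric table with `.head()`. Run `.info()` on any of the tables as well if you want to inspect dtypes and non-null counts.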

In [None]:
import pandas as pd

# Read only the first 100,000 rows of each training file so the notebook
# stays light enough to iterate on.
numeric, categorical, date = (
    pd.read_csv(path, nrows=100000)
    for path in ('train_numeric.csv', 'train_categorical.csv', 'train_date.csv')
)

# Last expression of the cell: the notebook renders the first rows.
numeric.head()

## 2. Data Cleaning
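Clean the numeric table in three steps: drop constant columns, drop columns that are more than 90% missing, and fill the remaining missing values with each column's median.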

### 2.1 Drop Constant Columns

In [None]:
# Count the distinct values in every column (pandas' nunique skips NaN by
# default), then collect the columns holding exactly one distinct value.
nunique = numeric.nunique()
const_cols = [name for name, distinct in nunique.items() if distinct == 1]

# Remove them in place; a single-valued feature cannot separate the classes.
numeric.drop(columns=const_cols, inplace=True)

### 2.2 Drop High-Missing Columns

In [None]:
# Fraction of missing entries per column (the mean of a boolean mask).
missing_pct = numeric.isna().mean()

# Columns that are more than 90% empty are removed in place.
high_missing = missing_pct.loc[missing_pct.gt(0.9)].index
numeric.drop(columns=high_missing, inplace=True)

### 2.3 Fill Missing Values

In [None]:
# Impute every remaining gap with the median of its own column, in place.
column_medians = numeric.median()
numeric.fillna(value=column_medians, inplace=True)

## 3. Outlier Detection (Optional)
Detect and handle outliers here if applicable to your project.

## 4. Subset Data for Prototyping

In [None]:
# Draw a reproducible (fixed-seed) random sample of at most 100,000 rows.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap n at the table's length.
# NOTE: the load step reads at most 100,000 rows, so this is normally a
# shuffled copy of every loaded row rather than a smaller subset.
sample_size = min(100000, len(numeric))
sample = numeric.sample(n=sample_size, random_state=42)

## 5. Export Cleaned Subset

In [None]:
# Persist the cleaned sample; index=False keeps the row index out of the file.
sample.to_csv(path_or_buf='bosch_clean_subset.csv', index=False)

# Optional: Step 6 – Exploratory Data Analysis (EDA)

## 6. Feature Correlation with Response

In [None]:
from scipy.stats import pointbiserialr

# Point-biserial correlation of every feature with the binary Response.
# The target is looked up once, so a missing 'Response' column fails loudly
# here instead of being silently swallowed once per feature.
response = numeric['Response']

correlations = {}
for col in numeric.columns:
    if col in ('Id', 'Response'):
        continue
    try:
        corr, _ = pointbiserialr(numeric[col], response)
    except (TypeError, ValueError):
        # Best effort: skip columns SciPy cannot correlate (e.g. non-numeric
        # data) rather than hiding every possible error behind a bare except.
        continue
    # SciPy can return NaN (e.g. for a constant column); NaN sort keys would
    # make the ranking below ill-defined, so such features are left out.
    if pd.notna(corr):
        correlations[col] = corr

# Five features with the largest absolute correlation, strongest first.
top_features = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
top_features

## 7. Visualize Top Features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Name of the feature ranked first in top_features.
top_col = top_features[0][0]

# Histogram of that feature, split by Response, with 50 bins; the title is
# set on the Axes object that histplot returns.
ax = sns.histplot(x=top_col, hue="Response", data=numeric, bins=50)
ax.set_title(f'Distribution of {top_col} by QA Response')
plt.show()