## Data Exploration

### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CSV (lines pandas cannot parse are skipped), then drop the
# 'vin' and 'saledate' columns in the same chain.
data = (
    pd.read_csv("<path>", on_bad_lines="skip")
    .drop(columns=['vin', 'saledate'])
)

### Info

In [None]:
# Column labels of the DataFrame
data.columns

In [None]:
# Count missing (NaN) values in each column
data.isna().sum()

In [20]:
# Drop NaN: remove every row that has a missing value in any column
data = data.dropna()

In [None]:
# Count rows that are exact duplicates of an earlier row
data.duplicated().sum()

In [22]:
# Column groups: numeric features first, then categorical features.
numeric_columns = [
    'year',
    'condition',
    'odometer',
    'mmr',
    'sellingprice',
]
categorical_columns = [
    'make',
    'model',
    'trim',
    'body',
    'transmission',
    'state',
    'color',
    'interior',
    'seller',
]

### Numerical values

In [None]:
# Correlation: pairwise correlation matrix of the numeric columns
# (pandas' default method), rendered as a heatmap with the coefficient
# written in each cell.
corr_num = data.loc[:, numeric_columns].corr()
sns.heatmap(data=corr_num, annot=True)
plt.show()

In [None]:
# Summary statistics (as reported by DataFrame.describe) for the numeric columns

data[numeric_columns].describe()

In [None]:
# Distribution: one filled KDE plot per numeric column on a 3-column grid

n_cols = 3
# Ceiling division gives the number of rows needed; keep at least one row
# so plt.subplots() still receives a valid grid for an empty column list.
n_rows = max(1, -(-len(numeric_columns) // n_cols))

# squeeze=False always returns a 2-D array of Axes, so flattening is safe
# whatever the grid shape turns out to be.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
axes = axes.flatten()

# zip stops at the shorter sequence, pairing each column with its own Axes.
for ax, col in zip(axes, numeric_columns):
    sns.kdeplot(data[col], fill=True, ax=ax)
    ax.set_title(f'{col} Distribution')

# Remove the unused trailing Axes. Slicing by the number of columns does
# not depend on a loop index that would be undefined for an empty list.
for ax in axes[len(numeric_columns):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Price vs Condition: selling price against condition as a single line.
# errorbar=None disables the error band around the line.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(
    x='condition',
    y='sellingprice',
    data=data,
    errorbar=None,
    linewidth=2.5,
    alpha=0.9,
)
# Work on the Axes returned by seaborn rather than the implicit current Axes.
ax.set_xticks([1, 2, 3, 4, 5])
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

### Categorical values

In [None]:
# Preview the first rows of the categorical columns
data[categorical_columns].head()

In [None]:
# Unique values: print the number of distinct values per categorical column

# dropna=False makes NaN count as a distinct value, the same way
# len(Series.unique()) would.
for col, n_unique in data[categorical_columns].nunique(dropna=False).items():
    print(f"{col} values :", n_unique)