## Q1: Do cars with higher prices tend to have a more complete service history?

### 🔧 Data Pre-processing / Cleaning (Q1)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

# Load dataset
df = pd.read_csv('used_car_price_dataset_extended.csv')

# Preprocessing
# pandas' CSV reader (2.0 and later) treats the literal text "None" as a
# missing value by default. Without restoring the label here, every car with
# no service history would be removed by dropna() below and the 'None' group
# used in the ANOVA would be empty, producing a NaN test result.
# NOTE(review): this also maps genuinely blank cells to 'None' — confirm that
# a blank service_history means "no history" in the data source.
df['service_history'] = (
    df['service_history']
    .fillna('None')
    .str.strip()
    .str.title()
)
df_q1 = df[['price_usd', 'service_history']].dropna()

# Bar plot of average price by service history (highest average first)
avg_price_by_history = df_q1.groupby('service_history')['price_usd'].mean().sort_values(ascending=False)
plt.figure(figsize=(8, 5))
sns.barplot(x=avg_price_by_history.index, y=avg_price_by_history.values)
plt.title("Average Car Price by Service History")
plt.ylabel("Average Price (USD)")
plt.xlabel("Service History")
plt.tight_layout()
plt.show()

# Box plot of price distribution
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_q1, x='service_history', y='price_usd')
plt.title("Price Distribution by Service History")
plt.ylabel("Price (USD)")
plt.xlabel("Service History")
plt.tight_layout()
plt.show()

# ANOVA test: one-way comparison of mean price across the three
# service-history levels (f_oneway is imported at the top of this cell).
group_full = df_q1[df_q1['service_history'] == 'Full']['price_usd']
group_partial = df_q1[df_q1['service_history'] == 'Partial']['price_usd']
group_none = df_q1[df_q1['service_history'] == 'None']['price_usd']
anova_result = f_oneway(group_full, group_partial, group_none)
anova_result


### 📊 Data Analysis / Statistical Study (Q1)

## Q2: How has engine capacity changed over time?

### 🚫 Modelling (Q1) - Not Applicable

### 🔧 Data Pre-processing / Cleaning (Q2)

### 🧠 Evaluation (Q1)

### 📊 Data Analysis / Statistical Study (Q2)

In [None]:

from scipy.stats import linregress

# Preprocessing: keep rows where both fields are present and store the
# make year as an integer.
df_q2 = df.loc[:, ['make_year', 'engine_cc']].dropna()
df_q2 = df_q2.assign(make_year=df_q2['make_year'].astype(int))

# Mean engine capacity for each make year
avg_engine_cc_by_year = df_q2['engine_cc'].groupby(df_q2['make_year']).mean()

# Line plot of the yearly averages
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x=avg_engine_cc_by_year.index,
    y=avg_engine_cc_by_year.values,
    marker='o',
    ax=ax,
)
ax.set_title("Average Engine Capacity Over Years")
ax.set_xlabel("Make Year")
ax.set_ylabel("Average Engine Capacity (cc)")
ax.grid(True)
fig.tight_layout()
plt.show()

# Linear regression of the yearly average on the make year.
# The fit uses one point per year (the averages), not the individual cars.
trend = linregress(avg_engine_cc_by_year.index, avg_engine_cc_by_year.values)
slope, intercept, r_value, p_value, std_err = trend
{
    "Slope": round(slope, 4),
    "Intercept": round(intercept, 2),
    "R-squared": round(r_value ** 2, 4),
    "p-value": round(p_value, 4),
}


### 🔧 Data Pre-processing / Cleaning (Q3)

### 🚫 Modelling (Q2) - Not Applicable

### 📊 Data Analysis / Statistical Study (Q3)

### 🧠 Evaluation (Q2)

### 🔧 Data Pre-processing / Cleaning (Q4)

### 🚫 Modelling (Q3) - See Bonus Section

### 📊 Data Analysis / Statistical Study (Q4)

### 🧠 Evaluation (Q3)

### 🔧 Data Pre-processing / Cleaning (Bonus Modeling)

### 🚫 Modelling (Q4) - Not Applicable

### 📊 Data Analysis / Feature Engineering (Bonus Modeling)

### 🧠 Evaluation (Q4)

### 🤖 Modelling (Bonus Modeling)

## Q3: Are certain car colors more frequently associated with accident reports?

### 🧠 Evaluation (Bonus Modeling)

In [None]:

from scipy.stats import chi2_contingency

# Preprocessing: drop incomplete rows, then normalise the colour labels
# (trim whitespace, title-case) so spelling variants share one category.
df_q3 = df.loc[:, ['color', 'accidents_reported']].dropna()
df_q3 = df_q3.assign(color=df_q3['color'].str.strip().str.title())

# Mean number of reported accidents per colour, highest first
accident_by_color = (
    df_q3['accidents_reported']
    .groupby(df_q3['color'])
    .mean()
    .sort_values(ascending=False)
)

# Horizontal bar plot of the per-colour averages
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=accident_by_color.values, y=accident_by_color.index, ax=ax)
ax.set_title("Average Number of Accidents Reported by Car Color")
ax.set_xlabel("Average Accidents Reported")
ax.set_ylabel("Car Color")
fig.tight_layout()
plt.show()

# Chi-square test of independence between colour and accident status.
# The accident count is collapsed to two levels: zero vs. anything else.
no_accident = df_q3['accidents_reported'].eq(0)
df_q3['accident_flag'] = no_accident.map({True: 'None', False: 'Has Accident'})
contingency_table = pd.crosstab(df_q3['color'], df_q3['accident_flag'])
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
{
    "Chi2 Statistic": round(chi2_stat, 4),
    "Degrees of Freedom": dof,
    "p-value": round(p_val, 4),
}


## Q4: Do older economy cars tend to have more owners than high-end cars?

In [None]:

# Brand categorization
economy_brands = ['Toyota', 'Honda', 'Hyundai', 'Nissan', 'Kia']
highend_brands = ['BMW', 'Mercedes-Benz', 'Audi', 'Lexus', 'Porsche']

# Brands are matched case-insensitively. The 'brand' column is title-cased
# below, which turns 'BMW' into 'Bmw'; an exact comparison against the lists
# above would therefore never classify BMW as high-end.
_economy_keys = {name.casefold() for name in economy_brands}
_highend_keys = {name.casefold() for name in highend_brands}


def _brand_category(brand):
    """Return 'Economy', 'High-End' or 'Other' for a brand name, ignoring case."""
    key = brand.strip().casefold()
    if key in _economy_keys:
        return 'Economy'
    if key in _highend_keys:
        return 'High-End'
    return 'Other'


df_q4 = df[['make_year', 'owner_count', 'brand']].dropna()
df_q4['brand'] = df_q4['brand'].str.strip().str.title()
df_q4['brand_category'] = df_q4['brand'].apply(_brand_category)
# Only the two categories being compared are plotted; 'Other' is discarded.
df_q4_filtered = df_q4[df_q4['brand_category'].isin(['Economy', 'High-End'])]

# Line plot of average owner count over years, one line per brand category
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_q4_filtered, x='make_year', y='owner_count', hue='brand_category', estimator='mean')
plt.title("Average Owner Count by Car Year and Brand Category")
plt.xlabel("Make Year")
plt.ylabel("Average Owner Count")
plt.tight_layout()
plt.show()


## Bonus Modeling: Predicting Accidents Using Logistic Regression

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Modeling preparation
features = ['color', 'brand', 'make_year', 'transmission']
target = 'accidents_reported'

df_model = df[features + [target]].dropna()
# Binary target: 0 when no accident was reported, 1 otherwise
df_model['has_accident'] = (df_model[target] != 0).astype(int)

# One-hot encoding
categorical_cols = ['color', 'brand', 'transmission']
# The encoder's default sparse output is converted with .toarray() instead of
# passing `sparse=False`: that keyword was removed in scikit-learn 1.4 and its
# replacement (`sparse_output`) does not exist before 1.2.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df_model[categorical_cols]).toarray()
# Reuse df_model's index so the column assignment below lines up row-for-row.
# With a fresh 0..n-1 index, pandas would align on labels and, once dropna()
# has removed rows, attach the wrong year (or NaN) to each encoded row.
X = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df_model.index,
)
X['make_year'] = df_model['make_year'].astype(int)
y = df_model['has_accident']

# Split and model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation: per-class metrics as a dict plus a plotted confusion matrix
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No Accident", "Has Accident"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - Accident Prediction")
plt.tight_layout()
plt.show()
report
