## Data reading and imports

In [None]:
import pandas as pd
import numpy as np

# Load the pre-owned cars dataset from the working directory.
df = pd.read_csv("pre-owned cars.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

## Data Cleaning and Preprocessing

In [None]:
# Report the number of missing values per column before any cleaning.
print(df.isnull().sum())

# Fill gaps in engine capacity with the column's median.
capacity_col = "engine_capacity(CC)"
df[capacity_col] = df[capacity_col].fillna(df[capacity_col].median())

# Coerce numeric columns; values that cannot be parsed become NaN.
for numeric_col in ("price", "km_driven"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Parse date columns with their expected formats; values that do not match
# the format become NaT.
for date_col, date_format in (("make_year", "%Y"), ("reg_year", "%d-%m-%Y")):
    df[date_col] = pd.to_datetime(df[date_col], format=date_format, errors="coerce")

print(df.dtypes)

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _render_figure(size, title, draw):
    """Open a figure of the given size, run ``draw``, then title and show it."""
    plt.figure(figsize=size)
    draw()
    plt.title(title)
    plt.show()


print(df.describe())

# Histogram of prices.
_render_figure(
    (10, 6),
    "Distribution of Car Prices",
    lambda: sns.histplot(df["price"], bins=50),
)


def _draw_brand_boxplot():
    """Box plot of price per brand with vertical tick labels."""
    sns.boxplot(x="brand", y="price", data=df)
    plt.xticks(rotation=90)


_render_figure((12, 8), "Price by Brand", _draw_brand_boxplot)

# Pairwise correlations between the float/int columns.
numeric_cols = df.select_dtypes(include=[float, int]).columns
corr = df[numeric_cols].corr()
_render_figure(
    (10, 8),
    "Correlation Matrix",
    lambda: sns.heatmap(corr, annot=True, cmap="coolwarm"),
)

## Machine Learning Model for Price Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Encode each categorical column as integer codes. The fitted encoders are
# kept so the codes can be mapped back to labels with inverse_transform().
le_brand = LabelEncoder()
df["brand_encoded"] = le_brand.fit_transform(df["brand"])

le_fuel = LabelEncoder()
df["fuel_encoded"] = le_fuel.fit_transform(df["fuel_type"])

le_trans = LabelEncoder()
df["trans_encoded"] = le_trans.fit_transform(df["transmission"])

features = ["brand_encoded", "fuel_encoded", "trans_encoded", "engine_capacity(CC)", "km_driven", "make_year"]

# The target may contain NaN (e.g. prices that could not be parsed as numbers).
# A regressor cannot be fit or scored on a missing target, so those rows are
# left out of modelling; df itself is not modified.
model_df = df.dropna(subset=["price"])
X = model_df[features].copy()
# make_year is a datetime column; the model only needs the calendar year.
X["make_year"] = X["make_year"].dt.year
y = model_df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing feature values with medians learned from the training split
# only, so no statistic of the held-out rows leaks into training.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fit the scaler on the training split and reuse it unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Evaluate on the held-out 20% of rows.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}, R2: {r2}")

## JSON File Manipulation

In [None]:
import json

# Collect the held-out targets and the model's predictions as plain lists so
# they can be serialised.
predictions = {
    "actual": y_test.tolist(),
    "predicted": y_pred.tolist(),
}

# Write the predictions to disk as JSON.
with open("predictions.json", "w") as f:
    f.write(json.dumps(predictions))

# Read the file back and show the first five entries of each list.
with open("predictions.json", "r") as f:
    data = json.loads(f.read())
print("Sample predictions loaded from JSON:")
for key in ("actual", "predicted"):
    print(data[key][:5])

## Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_car_brands"

# Fetch the page with a timeout so a stalled connection cannot hang the cell,
# and treat HTTP error statuses as failures instead of parsing an error page.
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    page_html = response.text
except requests.RequestException as exc:
    # Network or HTTP failure: report it and continue with an empty document,
    # which yields an empty brand list below.
    print(f"Could not fetch {url}: {exc}")
    page_html = ""

soup = BeautifulSoup(page_html, "html.parser")

# Collect the text of the first link in every list item of the article body,
# keeping only links whose href contains "/wiki/".
brands = []
content = soup.find("div", {"id": "mw-content-text"})
if content:
    for li in content.find_all("li"):
        a = li.find("a")
        if a and a.get("href") and "/wiki/" in a["href"]:
            brands.append(a.text.strip())

print("Scraped brands:", brands[:10])

# Save the scraped names (an empty list if the fetch failed) as JSON.
with open("scraped_brands.json", "w") as f:
    json.dump(brands, f)

## Multithreading

In [None]:
import threading
import time

def process_chunk(chunk):
    """Return the mean of ``chunk["price"]`` after a 0.1-second pause."""
    time.sleep(0.1)  # pause before computing the mean
    price_column = chunk["price"]
    return price_column.mean()

# Split the frame into consecutive slices of up to 100 rows each.
chunks = [df.iloc[i:i+100] for i in range(0, len(df), 100)]
# Only the first five chunks are processed.
selected_chunks = chunks[:5]
# One slot per processed chunk: each thread writes to its own position, so the
# output order follows chunk order no matter which thread finishes first.
results = [None] * len(selected_chunks)

def worker(chunk, slot=None):
    """Process ``chunk`` and store the result in the shared ``results`` list.

    Args:
        chunk: Data slice passed on to ``process_chunk``.
        slot: Index in ``results`` to write to. When omitted, the result is
            appended instead.
    """
    result = process_chunk(chunk)
    if slot is None:
        results.append(result)
    else:
        results[slot] = result

threads = []
for slot, chunk in enumerate(selected_chunks):
    t = threading.Thread(target=worker, args=(chunk, slot))
    threads.append(t)
    t.start()

# Wait for every thread to finish before reading the results.
for t in threads:
    t.join()

print("Average prices from chunks:", results)

## Conclusion and Discussion

This project demonstrates a comprehensive data science application for analyzing pre-owned car data. We implemented:

- **Data Manipulation with Pandas**: Cleaning, preprocessing, and feature engineering.
- **Scikit-Learn Preprocessing**: Encoding categorical variables, scaling features.
- **Machine Learning**: Random Forest model for price prediction with good performance (R2 ~0.87).
- **JSON File Manipulation**: Saving and loading predictions.
- **Web Scraping**: Attempted scraping of car brands from Wikipedia.
- **Multithreading**: Parallel processing of data chunks.

The model can predict car prices based on brand, fuel type, transmission, engine capacity, km driven, and make year. This can help buyers and sellers in the pre-owned car market.

Future improvements could include adding more features, tuning hyperparameters, or deploying the model as a web service (a Django deployment was not implemented, as requested).