# Data overview and sampling

In [None]:
import pandas as pd

# Read the full vehicle listings from the CSV file
df = pd.read_csv('data/vehicles.csv')

# Draw a 5% random sample. The fixed seed makes the draw reproducible, so a
# fresh Restart & Run All selects the same rows every time.
sample = df.sample(frac=0.05, random_state=42)

## Keep relevant columns; replace NaN with 'missing'

In [None]:
# Columns used by the rest of the notebook (target: price)
relevant_cols = ['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
                 'fuel', 'odometer', 'transmission', 'drive', 'type', 'paint_color']
sample = sample[relevant_cols]
df = df[relevant_cols]

# Alternative: sample.dropna(subset=relevant_cols) would drop incomplete rows instead.

# Use the 'missing' placeholder only in non-numeric columns. Writing a string
# into a numeric column (price, year, odometer) would turn it into object
# dtype and break numeric operations such as quantile(); numeric gaps stay NaN.
categorical_cols = sample.select_dtypes(exclude='number').columns
sample = sample.fillna({col: 'missing' for col in categorical_cols})
# df = df.fillna({col: 'missing' for col in categorical_cols})

## Save a copy

In [None]:
# Persist the prepared sample as CSV, without the index column
sample_csv_path = 'm_5p_vehicles.csv'
sample.to_csv(sample_csv_path, index=False)
# df.to_csv('relcols_full_vehicles.csv', index=False)

# Exploratory Data Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the sample written by the "Save a copy" step. The file name must match
# the one passed to sample.to_csv(), otherwise a fresh run cannot find it.
df = pd.read_csv('m_5p_vehicles.csv')
# df = pd.read_csv('relcols_full_vehicles.csv')

## Observing outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Price

# Show the ten highest prices
prices_descending = df['price'].sort_values(ascending=False)
top_prices = prices_descending.head(10)
print(top_prices)

# Density plot of the price column
price_ax = sns.kdeplot(df['price'])
price_ax.set_title('Density distribution of car prices')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Density')
plt.show()

# Box plot with whiskers given as the [0, 99] percentile pair; fliers hidden
plt.boxplot(df['price'], showfliers=False, whis=[0, 99])
plt.show()


In [None]:
# Year
# Coerce the column to numeric; entries that cannot be parsed become NaN.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

sns.kdeplot(df['year'], fill=True)
plt.title('Year Distribution')
plt.xlabel('Year')
plt.ylabel('Density')
plt.show()

In [None]:
# Manufacturer
# Bar chart of the number of listings per manufacturer
manufacturer_counts = df['manufacturer'].value_counts()
plt.bar(x=manufacturer_counts.index, height=manufacturer_counts.values)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Model
# Count plot restricted to the 30 most frequent models
top_models = df['model'].value_counts().head(30).index

plt.figure(figsize=(12, 8))
sns.countplot(data=df, x='model', order=top_models)
plt.title('Distribution of Car Models')
plt.xlabel('Car Model')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Condition
# Number of listings per condition value
sns.countplot(data=df, x='condition')

In [None]:
# Cylinders
# Number of listings per cylinders value
sns.countplot(data=df, x='cylinders')

In [None]:
# Fuel
# Bar chart of listings per fuel type
fuel_counts = df['fuel'].value_counts()
fuel_ax = fuel_counts.plot.bar()
fuel_ax.set_title('Distribution of Fuel')
fuel_ax.set_xlabel('Fuel Type')
fuel_ax.set_ylabel('Count')
plt.show()


In [None]:
# Odometer
# Coerce to numeric (unparseable entries become NaN), then plot the density
# of the non-null readings.
df['odometer'] = pd.to_numeric(df['odometer'], errors='coerce')
odometer_readings = df['odometer'].dropna()

sns.set_style('whitegrid')
odometer_ax = sns.kdeplot(odometer_readings, fill=True)
odometer_ax.set_title('Odometer Distribution')
odometer_ax.set_xlabel('Odometer')
odometer_ax.set_ylabel('Density')
plt.show()

In [None]:
# Transmission
# Bar chart of listings per transmission type
transmission_counts = df['transmission'].value_counts()
transmission_ax = transmission_counts.plot.bar()
transmission_ax.set_title('Distribution of Transmission')
transmission_ax.set_xlabel('Transmission Type')
transmission_ax.set_ylabel('Count')
plt.show()

In [None]:
# Drive
# Bar chart of listings per drive type
drive_counts = df['drive'].value_counts()
drive_ax = drive_counts.plot.bar()
drive_ax.set_title('Distribution of Drive')
drive_ax.set_xlabel('Drive Type')
drive_ax.set_ylabel('Count')
plt.show()

In [None]:
# Type
# Bar chart of listings per vehicle type
vehicle_type_counts = df['type'].value_counts()
type_ax = vehicle_type_counts.plot.bar()
type_ax.set_title('Distribution of Type')
type_ax.set_xlabel('Vehicle Type')
type_ax.set_ylabel('Count')
plt.show()

In [None]:
# Paint Color
# Bar chart of listings per paint color
color_counts = df['paint_color'].value_counts()
color_ax = color_counts.plot.bar()
color_ax.set_title('Distribution of Paint Color')
color_ax.set_xlabel('Paint Color')
color_ax.set_ylabel('Count')
plt.show()

In [None]:
# save new data types


# Data Cleaning

In [None]:
# Removing price outliers

# 99th percentile of the price column
price_99percentile = df['price'].quantile(0.99)

# Keep only rows priced at or below that percentile
within_price_cap = df['price'].le(price_99percentile)
df = df.loc[within_price_cap]

# Density plot of the remaining prices
price_ax = sns.kdeplot(df['price'])
price_ax.set_title('Density distribution of car prices')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Density')
plt.show()

In [None]:
# Year
# 1st percentile of the year column
year_floor = df['year'].quantile(0.01)

# Keep only rows whose year is strictly above that value
df = df.loc[df['year'].gt(year_floor)]

sns.kdeplot(df['year'], fill=True)

In [None]:
# Keep only manufacturers that have at least 500 rows
rows_per_manufacturer = df['manufacturer'].value_counts()
frequent_manufacturers = rows_per_manufacturer[rows_per_manufacturer >= 500].index
df = df[df['manufacturer'].isin(frequent_manufacturers)]

# Bar chart of the remaining manufacturers
counts = df['manufacturer'].value_counts()
plt.bar(x=counts.index, height=counts.values)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Model
# Count plot of the 30 most frequent models in the current frame
top_models = df['model'].value_counts().head(30).index

plt.figure(figsize=(12, 8))
sns.countplot(data=df, x='model', order=top_models)
plt.title('Distribution of Car Models')
plt.xlabel('Car Model')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Condition
# Drop rows whose condition is 'missing', 'new' or 'salvage'
excluded_conditions = ['missing', 'new', 'salvage']
df = df[~df['condition'].isin(excluded_conditions)]
sns.countplot(data=df, x='condition')

In [None]:
# Cylinders
# Drop rows whose cylinders value is one of the non-numeric labels
df = df[~df['cylinders'].isin(['missing', 'other'])]

# Convert cylinders to integer: keep the text before the first space.
# assign() returns a new frame, so the filtered slice is never written to in
# place (avoids SettingWithCopyWarning); .str[0] also works on an empty frame,
# where split(expand=True)[0] would raise KeyError.
df = df.assign(cylinders=df['cylinders'].str.split(' ').str[0].astype(int))

# Count the number of occurrences for each cylinder count
cyl_counts = df['cylinders'].value_counts()

# Get the cylinder counts with at least 1000 occurrences
valid_cyl_counts = cyl_counts[cyl_counts >= 1000].index

# Keep only rows with valid cylinder counts
df = df[df['cylinders'].isin(valid_cyl_counts)]

sns.countplot(x='cylinders', data=df)

In [None]:
# Fuel
# Keep only rows whose fuel is 'gas', then plot what remains
df = df.loc[df['fuel'].eq('gas')]

fuel_counts = df['fuel'].value_counts()
fuel_ax = fuel_counts.plot.bar()
fuel_ax.set_title('Distribution of Fuel')
fuel_ax.set_xlabel('Fuel Type')
fuel_ax.set_ylabel('Count')
plt.show()

In [None]:
from scipy import stats

# Odometer
# Cap the odometer at its 99th percentile. The <= comparison is False for
# NaN, so rows without a numeric reading are dropped here as well and no
# separate not-null filter is needed before plotting.
odometer_percentile = df['odometer'].quantile(0.99)
df = df[df['odometer'] <= odometer_percentile]

# Observe the highest remaining odometer readings (kept in their own variable
# so the earlier top_prices result is not overwritten)
top_odometers = df['odometer'].sort_values(ascending=False).head(10)
print(top_odometers)

sns.set_style('whitegrid')
sns.kdeplot(df['odometer'], fill=True)
plt.title('Odometer Distribution')
plt.xlabel('Odometer')
plt.ylabel('Density')
plt.show()


In [None]:
# Transmission
# Keep only rows whose transmission is 'automatic', then plot what remains
df = df.loc[df['transmission'].eq('automatic')]

transmission_counts = df['transmission'].value_counts()
transmission_ax = transmission_counts.plot.bar()
transmission_ax.set_title('Distribution of Transmission')
transmission_ax.set_xlabel('Transmission Type')
transmission_ax.set_ylabel('Count')
plt.show()

In [None]:
# Drive
# No rows are removed here; the cell only re-plots the drive distribution
drive_counts = df['drive'].value_counts()
drive_ax = drive_counts.plot.bar()
drive_ax.set_title('Distribution of Drive')
drive_ax.set_xlabel('Drive Type')
drive_ax.set_ylabel('Count')
plt.show()

In [None]:
# Type
# Drop rows whose type is the 'missing' placeholder
df = df.loc[df['type'].ne('missing')]

# Occurrences of each remaining type
type_counts = df['type'].value_counts()

# Types that appear fewer than 400 times
low_count_types = type_counts.loc[type_counts.lt(400)].index

# Drop every row whose type is one of those rare types
is_rare_type = df['type'].isin(low_count_types)
df = df.loc[~is_rare_type]

type_ax = df['type'].value_counts().plot.bar()
type_ax.set_title('Distribution of Type')
type_ax.set_xlabel('Vehicle Type')
type_ax.set_ylabel('Count')
plt.show()

In [None]:
# Paint Color
# Get the count of each paint color
paint_color_counts = df['paint_color'].value_counts()

# Get a list of paint colors with count below 500
other_paint_colors = paint_color_counts[paint_color_counts < 500].index.tolist()

# Replace those paint colors with 'Other'. assign() builds a new frame instead
# of writing into the filtered slice (avoids SettingWithCopyWarning), and
# mask() behaves the same when the list of rare colors is empty.
is_rare_color = df['paint_color'].isin(other_paint_colors)
df = df.assign(paint_color=df['paint_color'].mask(is_rare_color, 'Other'))

df['paint_color'].value_counts().plot(kind='bar')
plt.title('Distribution of Paint Color')
plt.xlabel('Paint Color')
plt.ylabel('Count')
plt.show()

# Model Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Use the cleaned frame produced by the Data Cleaning section
data = df

# Split the data into features and target variable
features = data.drop('price', axis=1)
target = data['price']

# One-hot encode the categorical features
features_encoded = pd.get_dummies(features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.3, random_state=42)

# Normalize the data. The scaler is fitted on the training set only and then
# reused for the test set: re-fitting on the test set would scale the two
# sets with different statistics and leak test information into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Replace NaN with the per-column median learned from the training set; the
# same fitted imputer is applied to the test set for the same reason.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_test_imputed = imputer.transform(X_test_scaled)


# Model Performance

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score

# price is a continuous target, so fit a regressor: a classifier would treat
# every distinct price as its own class, and classification_report is not
# meaningful for such a target.
# The variable keeps its original name so any cell that refers to `rfc` still works.
rfc = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rfc.fit(X_train_imputed, y_train)

# Make predictions on the test set
y_pred = rfc.predict(X_test_imputed)

# Evaluate the model with regression metrics
rf_mse = mean_squared_error(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)
print('Random Forest MSE:', rf_mse)
print('Random Forest R^2:', rf_r2)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between two sequences.

    Parameters:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The square root of the mean of (log1p(y_true) - log1p(y_pred)) ** 2.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred)
    # Work on plain float arrays so the subtraction is positional; pandas
    # inputs would otherwise be aligned on their index labels.
    actual = np.asarray(y_true, dtype=float)
    # log1p is undefined for values <= -1. Clip negative predictions to zero
    # so they are penalised, instead of turning into NaN and being zeroed out
    # below (which would count them as perfect predictions).
    predicted = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    log_diff = np.log1p(actual) - np.log1p(predicted)
    # Best effort for any remaining non-finite terms (e.g. NaN inputs)
    log_diff[~np.isfinite(log_diff)] = 0
    return np.sqrt(np.mean(log_diff**2))

# Linear Regression
# Fit on the imputed training matrix, predict on the imputed test matrix
lr = LinearRegression().fit(X_train_imputed, y_train)
lr_predictions = lr.predict(X_test_imputed)

# Report mean squared error, then RMSLE, one value per line
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_rmsle = rmsle(y_test, lr_predictions)
print(lr_mse, lr_rmsle, sep='\n')

## Feed forward neural network

In [None]:
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.metrics import precision_score, recall_score, f1_score

# # Build the neural network
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train_scaled.shape[1]]),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1)
# ])

# # Compile the model
# model.compile(loss='mse', optimizer='adam')

# # Train the model
# model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

# # Evaluate the performance of the model
# model.evaluate(X_test_scaled, y_test)

# # get predicted probabilities for each class
# y_pred_prob = model.predict(X_test_scaled)

# # get predicted class labels
# y_pred = np.argmax(y_pred_prob, axis=1)

# # calculate accuracy score
# accuracy = accuracy_score(y_test, y_pred)

# # generate classification report
# report = classification_report(y_test, y_pred, zero_division=1)

# precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
# recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
# f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)


# print(report)
# print("Precision:", precision)
# print("Recall:", recall)
# print("f1-score:", f1)
# print("Accuracy:", accuracy)