# Data overview and sampling

In [1]:
import pandas as pd

# Read the raw vehicle listings from the CSV file
df = pd.read_csv('data/vehicles.csv')

# Draw a 10% sample of the rows. The fixed seed makes the draw repeatable, so a
# Restart Kernel -> Run All selects the same rows every time.
sample = df.sample(frac=0.1, random_state=42)

In [2]:
# Keep only the columns relevant to the analysis (excludes urls, ids, etc.)
relevant_cols = ['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 
                 'fuel', 'odometer', 'transmission', 'drive', 'type', 'paint_color']
sample = sample[relevant_cols]
df = df[relevant_cols]

# Rows with missing values are kept rather than dropped. Only the non-numeric
# (categorical/text) columns receive the 'missing' placeholder: writing that string
# into a numeric column turns it into an object column, which breaks numeric
# summaries and forces a later re-coercion (e.g. pd.to_numeric on 'odometer').
# Numeric columns keep NaN for their missing entries.
categorical_cols = df.select_dtypes(exclude='number').columns
placeholder_by_col = dict.fromkeys(categorical_cols, 'missing')
sample = sample.fillna(placeholder_by_col)
df = df.fillna(placeholder_by_col)

In [3]:
# Persist the column-reduced full dataframe as CSV, omitting the row index
# sample.to_csv('m_sample_vehicles.csv', index=False)
df.to_csv(path_or_buf='relcols_full_vehicles.csv', index=False)

# Exploratory Data Analysis

In [5]:
# Read the column-reduced dataset back from disk for the exploratory analysis
df = pd.read_csv(filepath_or_buffer='relcols_full_vehicles.csv')

## price

In [6]:
# Summary statistics of the listing prices (displayed as the cell output)
df['price'].describe()

count    4.268800e+05
mean     7.519903e+04
std      1.218228e+07
min      0.000000e+00
25%      5.900000e+03
50%      1.395000e+04
75%      2.648575e+04
max      3.736929e+09
Name: price, dtype: float64

### Observing outliers

In [7]:
# Sort prices from highest to lowest and keep the first ten to expose extreme values
prices_descending = df.price.sort_values(ascending=False)
top_prices = prices_descending.iloc[:10]
print(top_prices)

318592    3736928711
356716    3736928711
257840    3024942282
91576     3024942282
37410     3009548743
184704    1410065407
153082    1234567890
29386     1111111111
37409     1111111111
122470     987654321
Name: price, dtype: int64


### Limit price to the 99th percentile to exclude outliers

In [None]:
# Compute the 99th percentile of the 'price' column; it serves as the outlier cutoff
price_99percentile = df['price'].quantile(0.99)

# Keep only the rows priced at or below the cutoff. The explicit .copy() makes the
# result an independent frame, so assigning a column to `df` afterwards (as the
# odometer cell does) neither raises SettingWithCopyWarning nor risks writing into
# a view of the unfiltered data.
df = df.loc[df['price'] <= price_99percentile].copy()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Kernel density estimate of the listing prices, labelled through the returned Axes
price_ax = sns.kdeplot(df['price'])
price_ax.set_title('Density distribution of car prices')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Density')
plt.show()


In [None]:
# Box plot of price with whiskers at the 0th and 99th percentiles; fliers are hidden
price_box_ax = plt.gca()
price_box_ax.boxplot(df['price'], whis=[0, 99], showfliers=False)

plt.show()

## year

In [None]:
# Reference list of the columns kept for the analysis (displayed as the cell output)
[
    'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
    'fuel', 'odometer', 'transmission', 'drive', 'type', 'paint_color',
]

In [None]:
# Summary statistics of the 'year' column (displayed as the cell output)
df['year'].describe()

## manufacturer

In [None]:
# Bar chart of the number of listings per manufacturer
counts = df['manufacturer'].value_counts()
manufacturer_ax = plt.gca()
manufacturer_ax.bar(counts.index, counts.values)
# Manufacturer names are long, so the tick labels are drawn vertically
manufacturer_ax.tick_params(axis='x', labelrotation=90)
plt.show()

## model

In [None]:
# Restrict the count plot to the 30 most frequent model names
top_models = df['model'].value_counts().index[:30]

plt.figure(figsize=(12, 8))
model_ax = sns.countplot(data=df, x='model', order=top_models)
model_ax.set_title('Distribution of Car Models')
model_ax.set_xlabel('Car Model')
model_ax.set_ylabel('Count')
# Slanted, right-aligned labels keep the model names readable
plt.xticks(rotation=45, ha='right')
plt.show()

## condition

In [None]:
import seaborn as sns

# Number of listings for each value of 'condition'
sns.countplot(data=df, x='condition')

In [None]:
import seaborn as sns

# Number of listings for each value of 'cylinders'
sns.countplot(data=df, x='cylinders')

In [None]:
# Fuel: bar chart of the number of listings per fuel type
fuel_counts = df['fuel'].value_counts()
fuel_ax = fuel_counts.plot.bar()
fuel_ax.set_title('Distribution of Fuel')
fuel_ax.set_xlabel('Fuel Type')
fuel_ax.set_ylabel('Count')
plt.show()


In [None]:
import seaborn as sns


# Coerce odometer readings to numbers; entries that cannot be parsed (such as the
# 'missing' placeholder string) become NaN. The original line was missing its
# closing quote and parenthesis, so the cell could not run at all.
df['odometer'] = pd.to_numeric(df['odometer'], errors='coerce')

sns.set_style('whitegrid')
# Plot the density of the readings that are present (NaN entries are excluded)
sns.kdeplot(df['odometer'].dropna(), fill=True)
plt.title('Odometer Distribution')
plt.xlabel('Odometer')
plt.ylabel('Density')
plt.show()

In [None]:
# Transmission: bar chart of the number of listings per transmission type
transmission_ax = df['transmission'].value_counts().plot.bar()
transmission_ax.set_title('Distribution of Transmission')
transmission_ax.set_xlabel('Transmission Type')
transmission_ax.set_ylabel('Count')
plt.show()

In [None]:
# Drive: bar chart of the number of listings per drive type
drive_counts = df['drive'].value_counts()
drive_ax = drive_counts.plot.bar()
drive_ax.set_title('Distribution of Drive')
drive_ax.set_xlabel('Drive Type')
drive_ax.set_ylabel('Count')
plt.show()

In [None]:
# Type: bar chart of the number of listings per vehicle type
type_counts = df['type'].value_counts()
type_ax = type_counts.plot.bar()
type_ax.set_title('Distribution of Type')
type_ax.set_xlabel('Vehicle Type')
type_ax.set_ylabel('Count')
plt.show()

In [None]:
# Paint Color: bar chart of the number of listings per paint color
paint_ax = df['paint_color'].value_counts().plot.bar()
paint_ax.set_title('Distribution of Paint Color')
paint_ax.set_xlabel('Paint Color')
paint_ax.set_ylabel('Count')
plt.show()

# Data Cleaning

In [None]:
import pandas as pd
from text_preprocessing import clean_text

# Read the sampled listings into a DataFrame
train_data = pd.read_csv('sample_vehicles.csv')


# Text columns of interest; only 'model' is cleaned in this cell
text_cols = ['manufacturer', 'model', 'condition']
# Run clean_text over every entry of the 'model' column and store the result
# in a new 'model_clean' column
cleaned_models = train_data['model'].apply(clean_text)
train_data['model_clean'] = cleaned_models

# Show the cleaned values
print(train_data['model_clean'])


# Model Training

In [None]:
# # Separate numerical and categorical features
# numerical_features = [
#     'year',
#     'odometer'
# ]
# categorical_features = [
#     'manufacturer',
#     'model',
#     'condition',
#     'cylinders',
#     'fuel',
#     'transmission',
#     'drive',
#     'type',
#     'paint_color'
# ]



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Read the sampled listings used for modelling
data = pd.read_csv('sample_vehicles.csv')

# The target is the listing price; every other column is a predictor
target = data['price']
features = data.drop(columns='price')

# One-hot encode the non-numeric predictors
features_encoded = pd.get_dummies(features)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.3,
    random_state=42,
)

# Standardise the features: scaling statistics are learned from the training
# split only and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Model Performance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# `price` is a continuous target, so this is a regression task. A classifier would
# treat every distinct price as its own class and classification_report would be
# meaningless, so the random-forest regressor and regression metrics are used.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Instantiate the model (100 trees, fixed seed for repeatable results)
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rfr.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rfr.predict(X_test_scaled)

# Evaluate the model on the held-out test set
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear Regression: fit on the scaled training split (fit returns the estimator)
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out prices and report the mean squared error
lr_predictions = lr.predict(X_test_scaled)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_predictions)
print(lr_mse)

In [None]:
# Quick look at the first five scaled training rows, followed by the training target
print(X_train_scaled[:5], y_train, sep='\n')

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Build the neural network: two hidden ReLU layers and a single linear output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train_scaled.shape[1]]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model with a mean-squared-error loss
model.compile(loss='mse', optimizer='adam')

# Train the model, holding out 20% of the training rows for validation
model.fit(X_train_scaled, y_train, epochs=62, batch_size=32, validation_split=0.2)

# Evaluate the performance of the model (returns the MSE loss on the test set)
test_loss = model.evaluate(X_test_scaled, y_test)

# The network outputs one continuous value per row, i.e. a predicted price.
# Taking argmax over that single column always yields 0, and accuracy / precision /
# recall / F1 are classification metrics that do not apply to a continuous target,
# so the predictions are flattened and scored with regression metrics instead.
y_pred = model.predict(X_test_scaled).ravel()

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Test loss (MSE):", test_loss)
print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R^2:", r2)