# Data Exploration

In [None]:
# import pandas as pd

# # Read data from CSV file
# df = pd.read_csv('vehicles.csv')

# # Get a sample of 10%
# sample = df.sample(frac=0.2)

In [None]:
# print(sample.columns)

In [None]:
# # We only need to keep relevant columns, excluding data like urls, id, etc.

relevant_cols = ['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 
                 'fuel', 'odometer', 'transmission', 'drive', 'type', 'paint_color']
sample = sample[relevant_cols]

# Drop rows with missing data in relevant columns
sample.dropna(subset=['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 
                    'fuel', 'odometer', 'transmission', 'drive', 'type', 'paint_color'], 
            inplace=True)

In [None]:
# # # Write the resulting dataframe to a new CSV file
# sample.to_csv('20p_sample_vehicles.csv', index=False)

# Model Training

In [None]:
# # Separate numerical and categorical features
# numerical_features = [
#     'year',
#     'odometer'
# ]
# categorical_features = [
#     'manufacturer',
#     'model',
#     'condition',
#     'cylinders',
#     'fuel',
#     'transmission',
#     'drive',
#     'type',
#     'paint_color'
# ]



In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('sample_vehicles.csv')

# Split the data into features and target variable
features = data.drop('price', axis=1)
target = data['price']

# Encode categorical features
features_encoded = pd.get_dummies(features)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.3, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Model Performance

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Instantiate the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfc.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rfc.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.20      0.38      0.26       138
           1       0.48      0.58      0.53        26
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          85       0.00      0.00      0.00         0
          93       0.00      0.00      0.00         1
         100       0.20      0.50      0.29         2
         123       0.00      0.00      0.00         0
         149       0.00      0.00      0.00         1
         166       0.00      0.00      0.00         0
         170       0.00      0.00      0.00         1
         189       0.00      0.00      0.00         1
         190       0.00      0.00      0.00         1
         199       0.00      0.00      0.00         1
         200       0.00      0.00      0.00         0
         202       0.00      0.00      0.00         1
         232       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr_predictions = lr.predict(X_test_scaled)
lr_mse = mean_squared_error(y_test, lr_predictions)
print(lr_mse)

4.419770249666925e+34


In [10]:
print(X_train_scaled[:5])
print(y_train)

[[ 0.76993058 -0.37636152 -0.1055378  ... -0.40871978 -0.56403335
  -0.08793795]
 [-0.57711449 -0.009587   -0.1055378  ... -0.40871978 -0.56403335
  -0.08793795]
 [ 0.04459862 -0.28427793 -0.1055378  ... -0.40871978 -0.56403335
  -0.08793795]
 [ 0.35545518  1.19626363 -0.1055378  ... -0.40871978  1.77294482
  -0.08793795]
 [ 0.66631173 -0.38590275 -0.1055378  ... -0.40871978 -0.56403335
  -0.08793795]]
10933    32000
3459     12500
11551    15590
9710     12995
2134     19900
         ...  
11964     3150
21575     5200
5390      8950
860       1800
15795        0
Name: price, Length: 16159, dtype: int64


In [13]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Build the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train_scaled.shape[1]]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(loss='mse', optimizer='adam')

# Train the model
model.fit(X_train_scaled, y_train, epochs=62, batch_size=32, validation_split=0.2)

# Evaluate the performance of the model
model.evaluate(X_test_scaled, y_test)

# get predicted probabilities for each class
y_pred_prob = model.predict(X_test_scaled)

# get predicted class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# generate classification report
report = classification_report(y_test, y_pred, zero_division=1)

precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)


print(report)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Accuracy:", accuracy)

Epoch 1/62


2023-04-29 21:48:43.186387: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/62
Epoch 3/62
Epoch 4/62
Epoch 5/62
Epoch 6/62
Epoch 7/62
Epoch 8/62
Epoch 9/62
Epoch 10/62
Epoch 11/62
Epoch 12/62
Epoch 13/62
Epoch 14/62
Epoch 15/62
Epoch 16/62
Epoch 17/62
Epoch 18/62
Epoch 19/62
Epoch 20/62
Epoch 21/62
Epoch 22/62
Epoch 23/62
Epoch 24/62
Epoch 25/62
Epoch 26/62
Epoch 27/62
Epoch 28/62
Epoch 29/62
Epoch 30/62
Epoch 31/62
Epoch 32/62
Epoch 33/62
Epoch 34/62
Epoch 35/62
Epoch 36/62
Epoch 37/62
Epoch 38/62
Epoch 39/62
Epoch 40/62
Epoch 41/62
Epoch 42/62
Epoch 43/62
Epoch 44/62
Epoch 45/62
Epoch 46/62
Epoch 47/62
Epoch 48/62
Epoch 49/62
Epoch 50/62
Epoch 51/62
Epoch 52/62
Epoch 53/62
Epoch 54/62
Epoch 55/62
Epoch 56/62
Epoch 57/62
Epoch 58/62
Epoch 59/62
Epoch 60/62
Epoch 61/62
Epoch 62/62
              precision    recall  f1-score   support

           0       0.04      1.00      0.08       138
           1       1.00      0.00      0.00        26
           9       1.00      0.00      0.00         1
          79       1.00      0.00      0.00         1
     