In [7]:
import pandas as pd

# Load the data
data = pd.read_csv('car-price_data.csv')

# Prepare the data as mentioned
selected_columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP'
]
data = data[selected_columns]

# Transform column names
data.columns = data.columns.str.replace(' ', '_').str.lower()

# Fill missing values with 0
data = data.fillna(0)

# Rename MSRP to price
data = data.rename(columns={'msrp': 'price'})

# Calculate mode of transmission_type
transmission_mode = data['transmission_type'].mode()[0]
transmission_mode



'AUTOMATIC'

In [26]:
# Create correlation matrix only for numeric features
numeric_data = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()

# Replace the diagonal values with 0s (to exclude self-correlation)
for i in range(correlation_matrix.shape[0]):
    correlation_matrix.iloc[i, i] = 0
    

# Find the features with the biggest absolute correlation
max_corr_value = correlation_matrix.abs().max().max()
features_with_max_corr = correlation_matrix.stack().idxmax()

max_corr_value, features_with_max_corr


(0.8868294962591425, ('highway_mpg', 'city_mpg'))

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# Create binary target variable
data['above_average'] = (data['price'] > data['price'].mean()).astype(int)

# Split the data into train, validation, and test sets
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Make sure that the target value (above_average) is not in your dataframe
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

df_train = df_train.drop(columns=['price', 'above_average'])
df_val = df_val.drop(columns=['price', 'above_average'])
df_test = df_test.drop(columns=['price', 'above_average'])

# Compute mutual information
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
mi = df_train[categorical_features].apply(lambda x: mutual_info_score(x, y_train)).sort_values(ascending=False)

mi


model                0.462344
make                 0.239769
vehicle_style        0.084143
transmission_type    0.020958
dtype: float64

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encode the categorical variables
X_train = pd.get_dummies(df_train, columns=categorical_features, drop_first=True)
X_val = pd.get_dummies(df_val, columns=categorical_features, drop_first=True)

# Making sure both training and validation sets have the same columns
missing_cols = set(X_train.columns) - set(X_val.columns)
for c in missing_cols:
    X_val[c] = 0
X_val = X_val[X_train.columns]

# Train a logistic regression model
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_val = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
accuracy

  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c] = 0
  X_val[c]

0.9441879983214435

In [13]:
# List of features to check
features_to_check = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

# Dictionary to store differences in accuracy for each feature
accuracy_differences = {}

# Loop over each feature, exclude it, train a model and calculate accuracy difference
for feature in features_to_check:
    # Drop the feature
    if feature in categorical_features:
        X_train_dropped = X_train.drop(columns=X_train.columns[X_train.columns.str.startswith(feature + "_")])
        X_val_dropped = X_val.drop(columns=X_val.columns[X_val.columns.str.startswith(feature + "_")])
    else:
        X_train_dropped = X_train.drop(columns=feature)
        X_val_dropped = X_val.drop(columns=feature)

    # Train a logistic regression model
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train_dropped, y_train)

    # Evaluate the model on the validation set
    y_pred_val_dropped = model.predict(X_val_dropped)
    accuracy_dropped = accuracy_score(y_val, y_pred_val_dropped)
    
    # Calculate the difference and store it
    accuracy_differences[feature] = accuracy - accuracy_dropped

accuracy_differences


{'year': -0.0025178346621905767,
 'engine_hp': 0.014687368862777994,
 'transmission_type': -0.0008392782207302663,
 'city_mpg': 0.012169534200587417}

In [21]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Extract log_price from the original full train data before dropping 'price'
df_full_train['log_price'] = np.log1p(df_full_train['price'].values)

# Split the data again
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Extract log_price as target for Ridge regression
y_train_log = df_train['log_price'].values
y_val_log = df_val['log_price'].values

# Compute Ridge regression models for different alpha values using the "sag" solver
rmse_scores = {}

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha, solver='sag', random_state=42)
    ridge_model.fit(X_train, y_train_log)
    
    y_pred_val = ridge_model.predict(X_val)
    rmse = mean_squared_error(y_val_log, y_pred_val, squared=False)
    rmse_scores[alpha] = round(rmse, 3)

# Identify the alpha with the lowest RMSE
best_alpha = min(rmse_scores, key=rmse_scores.get)
best_alpha




0