In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
# NOTE(review): filterwarnings('ignore') with no category suppresses every
# warning for the rest of the session; that presumably also hides library
# deprecation / convergence warnings raised by the modelling cells below — confirm
# this is intended before relying on a silent run.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw car-price table; the path is relative to the notebook's working directory.
DATA_PATH = 'car_price_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the raw frame to check that the CSV parsed as expected.
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
# List the raw column names; the feature selection in the next cell uses these exact spellings.
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [5]:
# Columns kept for the analysis. "MSRP" is the price column; it is renamed to
# `price` further down and is what the target is derived from.
selected_features = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP",
]

# Restrict the raw frame to those columns (in the order listed above).
df_selected = df[selected_features]

In [6]:
# Preview the reduced frame; column names are still in their original spelling here.
df_selected.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


## Data Preparation

In [7]:
# Normalise the headers to snake_case: lower-case everything, then turn spaces
# into underscores (the two steps are independent, so their order does not matter).
normalized_columns = df_selected.columns.str.lower().str.replace(' ', '_')
df_selected.columns = normalized_columns

In [8]:
# Confirm the renamed headers (expected: all lower-case, underscores instead of spaces).
df_selected.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [9]:
# Preview the frame again now that the headers have been normalised.
df_selected.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [11]:
# Replace every missing value with 0. `fillna` returns a new frame, so the
# result is detached from the slice of `df` it was taken from.
# NOTE(review): the fill is applied to all columns, not only the numeric ones —
# confirm no text column contains NaN, otherwise it would receive an integer 0.
df_selected = df_selected.fillna(0)

In [12]:
# Give the target column a descriptive name: `msrp` becomes `price`.
df_selected = df_selected.rename(columns={'msrp': 'price'})
df_selected.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


## Question 1

In [13]:
# `mode()` returns a Series holding every most-frequent value; take its first
# entry as the answer.
transmission_modes = df_selected['transmission_type'].mode()
transmission_type_mode = transmission_modes.iloc[0]
transmission_type_mode

'AUTOMATIC'

**Answer:** AUTOMATIC

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
  
What are the two features that have the biggest correlation in this dataset?

In [15]:
# Pairwise correlation between the float64 / int64 columns only; text columns
# are excluded by the dtype filter.
numeric_columns = df_selected.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

In [16]:
# Zero the diagonal so that a feature's correlation with itself cannot be picked
# as the strongest pair. The values are copied before being modified: writing
# through `correlation_matrix.values` only works while pandas hands back a
# writable view of the frame's data, which is not the case under Copy-on-Write.
corr_values = correlation_matrix.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
correlation_matrix = pd.DataFrame(
    corr_values,
    index=correlation_matrix.index,
    columns=correlation_matrix.columns,
)

In [17]:
# Locate the most strongly correlated pair of features. The search runs on the
# absolute values so that a strong negative correlation is not overlooked. It
# relies on the diagonal having been zeroed by the earlier cell; otherwise a
# feature would be paired with itself.
abs_corr_values = np.abs(correlation_matrix.to_numpy())
max_corr_index = np.unravel_index(np.argmax(abs_corr_values, axis=None), abs_corr_values.shape)
max_corr_features = correlation_matrix.columns[max_corr_index[0]], correlation_matrix.columns[max_corr_index[1]]

# Report the signed coefficient of that pair, not its absolute value.
max_corr_value = correlation_matrix.iloc[max_corr_index[0], max_corr_index[1]]

correlation_matrix, max_corr_features, max_corr_value

(                      year  engine_hp  engine_cylinders  highway_mpg  \
 year              0.000000   0.338714         -0.040708     0.258240   
 engine_hp         0.338714   0.000000          0.774851    -0.415707   
 engine_cylinders -0.040708   0.774851          0.000000    -0.614541   
 highway_mpg       0.258240  -0.415707         -0.614541     0.000000   
 city_mpg          0.198171  -0.424918         -0.587306     0.886829   
 price             0.227590   0.650095          0.526274    -0.160043   
 
                   city_mpg     price  
 year              0.198171  0.227590  
 engine_hp        -0.424918  0.650095  
 engine_cylinders -0.587306  0.526274  
 highway_mpg       0.886829 -0.160043  
 city_mpg          0.000000 -0.157676  
 price            -0.157676  0.000000  ,
 ('highway_mpg', 'city_mpg'),
 0.8868294962591425)

**Answer:** `highway_mpg` and `city_mpg`

In [18]:
# Work on a copy so that adding the classification target below leaves `df_selected` untouched.
df_above_avg = df_selected.copy()

In [19]:
# Binary target: 1 when the car's price is strictly above the mean price of the
# whole dataset, 0 otherwise.
mean_price = df_above_avg['price'].mean()
is_above_mean = df_above_avg['price'] > mean_price
df_above_avg['above_average'] = is_above_mean.astype(int)

In [20]:
# Preview the frame with the new `above_average` column appended.
df_above_avg.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [21]:
# Target vector: the binary above-average flag.
target_column = 'above_average'
y = df_above_avg[target_column]

# Feature matrix: everything except the target and `price`. `price` is removed
# because the target was computed directly from it.
X = df_above_avg.drop(columns=[target_column, 'price'])

In [24]:
# One fixed seed for both splits so the partition is reproducible.
seed = 42

# First cut: 60% of the rows for training, the remaining 40% held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=seed,
)

# Second cut: halve the held-back rows into validation and test
# (each about 20% of the full dataset).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=seed,
)

# Show the shape of every piece to verify the proportions.
tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))

((7148, 9), (2383, 9), (2383, 9), (7148,), (2383,), (2383,))

## Question 4
Now let's train a logistic regression.

In [29]:
# One-hot encoder producing a dense array. The first category of each column is
# dropped, and categories not seen during fitting are ignored instead of raising.
# NOTE(review): the encoder is later fitted on every column of X_train, so the
# numeric columns (year, engine_hp, ...) are treated as categories too — confirm
# that this is intended.
_encoder_options = dict(drop='first', handle_unknown='ignore')
try:
    # Current scikit-learn releases call the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output=False, **_encoder_options)
except TypeError:
    # Older releases only know the legacy `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, **_encoder_options)

In [30]:
# Learn the categories from the training split only, then apply that same
# fitted mapping to both the training and the validation split.
encoder.fit(X_train)
X_train_encoded = encoder.transform(X_train)
X_val_encoded = encoder.transform(X_val)



In [31]:
# Hyper-parameters of the baseline classifier, gathered in one place.
logreg_params = dict(solver='liblinear', C=10, max_iter=1000, random_state=42)

# Build the classifier and train it on the encoded training split.
model = LogisticRegression(**logreg_params)
model.fit(X_train_encoded, y_train)

In [32]:
# Score the trained classifier on the validation split.
y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# `accuracy` keeps full precision (it is reused later); only the displayed
# value is rounded to two decimals.
accuracy_rounded = round(accuracy, 2)
accuracy_rounded

0.95

**Answer:** 0.95

## Question 5

In [37]:
# Feature elimination: for each feature, retrain the classifier without it and
# record how much the validation accuracy changes relative to the full model
# (`accuracy`, computed in the earlier cell).
accuracy_diffs = {}

for feature in X_train.columns:

    # Remove the current feature from both splits.
    X_train_dropped = X_train.drop(columns=[feature])
    X_val_dropped = X_val.drop(columns=[feature])

    # Encode with a fresh encoder that has the same settings as `encoder`.
    # Refitting the shared `encoder` here would leave it fitted on a reduced
    # column set after the loop, out of step with `X_train_encoded` and `model`.
    encoder_dropped = OneHotEncoder(**encoder.get_params())
    X_train_dropped_encoded = encoder_dropped.fit_transform(X_train_dropped)
    X_val_dropped_encoded = encoder_dropped.transform(X_val_dropped)

    # Same hyper-parameters as the full model so only the feature set differs.
    model_dropped = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_dropped.fit(X_train_dropped_encoded, y_train)

    # Validation accuracy without the feature.
    y_val_dropped_pred = model_dropped.predict(X_val_dropped_encoded)
    accuracy_dropped = accuracy_score(y_val, y_val_dropped_pred)

    # Positive: removing the feature hurt accuracy; negative: removing it helped.
    accuracy_diffs[feature] = accuracy - accuracy_dropped

# Feature with the smallest difference. The comparison is on the signed values,
# so a negative difference ranks below any positive one.
min_diff_feature = min(accuracy_diffs, key=accuracy_diffs.get)
min_diff_feature, accuracy_diffs


('engine_cylinders',
 {'make': 0.0008392782207301552,
  'model': 0.005035669324381042,
  'year': 0.003357112882920621,
  'engine_hp': 0.008812421317666796,
  'engine_cylinders': -0.0008392782207302663,
  'transmission_type': 0.0012589173310951773,
  'vehicle_style': 0.006714225765841353,
  'highway_mpg': 0.0020981955518254436,
  'city_mpg': 0.0004196391103650221})

**Answer:** year

## Question 6

In [None]:
# Regression target: log(1 + price), looked up by the row labels of each split
# so that it lines up with the already-encoded feature matrices.
log_price = np.log1p(df_above_avg['price'])
y_train_log_original = log_price.loc[X_train.index]
y_val_log_original = log_price.loc[X_val.index]

# Regularisation strengths to compare, and the validation RMSE found for each.
alphas = [0, 0.01, 0.1, 1, 10]
rmse_scores_original = {}

for alpha in alphas:

    # NOTE(review): `model` is rebound to a Ridge regressor here, replacing the
    # logistic-regression classifier that an earlier cell stored under that name.
    model = Ridge(alpha=alpha, solver='sag', random_state=42).fit(
        X_train_encoded, y_train_log_original
    )

    # RMSE on the validation split, measured in log-price units.
    y_val_log_pred_original = model.predict(X_val_encoded)
    rmse_original = np.sqrt(
        mean_squared_error(y_val_log_original, y_val_log_pred_original)
    )

    # Scores are compared after rounding to three decimals.
    rmse_scores_original[alpha] = round(rmse_original, 3)

# Alpha with the lowest rounded RMSE; on a tie `min` keeps the first key, i.e.
# the smallest alpha, because the dictionary preserves the order of `alphas`.
best_alpha_original = min(rmse_scores_original, key=rmse_scores_original.get)
best_alpha_original, rmse_scores_original


**Answer:** 0