In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error

In [2]:
raw_data = pd.read_csv('data.csv')
print(raw_data.shape)

(11914, 16)


### Data preparation

In [3]:
req_cols = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']
df = raw_data[req_cols].copy()
df.fillna(0, inplace=True)
df.rename(columns={'MSRP': 'price'}, inplace=True) 
print(df.shape)

(11914, 10)


In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

### Question 1
- What is the most frequent observation (mode) for the column transmission_type?
  - AUTOMATIC
  - MANUAL
  - AUTOMATED_MANUAL
  - DIRECT_DRIVE

In [5]:
df['transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

### Question 2
- Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?
  - engine_hp and year
  - engine_hp and engine_cylinders
  - highway_mpg and engine_cylinders
  - highway_mpg and city_mpg

In [6]:
df.select_dtypes(include=np.number).corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


### train-test-split

In [7]:
avg_price = df['price'].mean()
cond = df['price'] > avg_price
df['above_average'] = 0
df.loc[cond, 'above_average'] = 1

In [8]:
X = df[[c for c in df.columns if c not in ['price', 'above_average']]].copy()
Y = df[['above_average']].copy()
print(X.shape, Y.shape)

(11914, 9) (11914, 1)


In [9]:
x_train_all, x_test, y_train_all, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_all, y_train_all, test_size=0.24, random_state=42)

### Question 3
- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only. Round the scores to 2 decimals using round(score, 2).
- Which of these variables has the lowest mutual information score?
  - make
  - model
  - transmission_type
  - vehicle_style

In [10]:
x_train.select_dtypes(include='object').apply(
    lambda x: round(mutual_info_score(x, y_train['above_average']), 2), axis=0)\
    .to_frame(name='MI').sort_values('MI', ascending=False)

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


### Question 4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
        `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
  - Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
- What accuracy did you get?
  - 0.60
  - 0.72
  - 0.84
  - 0.95

In [11]:
onehot_encoder = OneHotEncoder(
    drop='first', dtype=int, handle_unknown='ignore')

In [12]:
train_encoded = onehot_encoder.fit_transform(x_train)
val_encoded = onehot_encoder.transform(x_val)



In [13]:
lr = LogisticRegression(
    solver='liblinear', C=10, max_iter=1000, random_state=42)

In [14]:
lr = lr.fit(train_encoded, np.ravel(y_train))
ypred_val = lr.predict(val_encoded)

In [15]:
acc_score = round(accuracy_score(y_pred=ypred_val, y_true=y_val), 2)
print('Accuracy score: {}'.format(acc_score))

Accuracy score: 0.95


### Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference? Note: the difference doesn't have to be positive
  - year
  - engine_hp
  - transmission_type
  - city_mpg

In [None]:
result_df = pd.DataFrame()
for c in x_train.columns:
    onehot_encoder = OneHotEncoder(
        drop='first', dtype=int, handle_unknown='ignore')
    train_encoded = onehot_encoder.fit_transform(x_train.drop(columns=[c]))
    val_encoded = onehot_encoder.transform(x_val.drop(columns=[c]))
    lr = LogisticRegression(
        solver='liblinear', C=10, max_iter=1000, random_state=42) 
    lr = lr.fit(train_encoded, y_train)
    ypred = lr.predict(val_encoded)
    score = accuracy_score(y_pred=ypred, y_true=y_val)
    if result_df.shape[0] == 0:
        res_dict = {'dropped_feature': c, 'score': score}
        result_df = pd.DataFrame(res_dict, index=[0])
    else:
        result_df = pd.concat([
            result_df, pd.DataFrame(
                {'dropped_feature': c, 'score': score}, index=[0])
        ], ignore_index=True)

In [18]:
result_df['diff'] = result_df['score'] - acc_score
result_df['abs_diff'] = result_df['diff'].apply(lambda x: abs(x))

In [19]:
result_df.sort_values('abs_diff', ascending=True)

Unnamed: 0,dropped_feature,score,diff,abs_diff
3,engine_hp,0.949738,-0.000262,0.000262
2,year,0.949301,-0.000699,0.000699
4,engine_cylinders,0.951486,0.001486,0.001486
8,city_mpg,0.951923,0.001923,0.001923
7,highway_mpg,0.94799,-0.00201,0.00201
6,vehicle_style,0.947552,-0.002448,0.002448
5,transmission_type,0.952797,0.002797,0.002797
0,make,0.953671,0.003671,0.003671
1,model,0.941871,-0.008129,0.008129


### Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.
- Which of these alphas leads to the best RMSE on the validation set? Note: If there are multiple options, select the smallest alpha.
  - 0
  - 0.01
  - 0.1
  - 1
  - 10

In [21]:
df_calc = df.copy()
df_calc['price'] = np.log1p(df_calc['price'])

In [22]:
X = df_calc[[c for c in df_calc.columns if c not in ['price', 'above_average']]].copy()
Y = df_calc[['price']].copy()
print(X.shape, Y.shape)

(11914, 9) (11914, 1)


In [23]:
x_train_all, x_test, y_train_all, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_all, y_train_all, test_size=0.24, random_state=42)

In [None]:
reg_ls = [0, 0.01, 0.1, 1, 10]
result_df = pd.DataFrame()
for a in reg_ls: 
    onehot_encoder = OneHotEncoder(
        drop='first', dtype=int, handle_unknown='ignore')
    train_encoded = onehot_encoder.fit_transform(x_train)
    val_encoded = onehot_encoder.transform(x_val)
    
    ridge_lr = Ridge(alpha=a, solver='sag', random_state=42)
    ridge_lr = ridge_lr.fit(train_encoded, y_train)
    ypred = np.ravel(ridge_lr.predict(val_encoded))
    rmse = np.sqrt(mean_squared_error(y_true=np.ravel(y_val), y_pred=ypred))
    result_dict = pd.DataFrame({'alpha': a, 'rmse': rmse},  index=[0])
    if result_df.shape[0] == 0:
        result_df = result_dict.copy()
    else:
        result_df = pd.concat([result_df, result_dict], ignore_index=True)

In [26]:
result_df.sort_values('rmse', ascending=True)

Unnamed: 0,alpha,rmse
1,0.01,0.118818
0,0.0,0.119123
2,0.1,0.12029
3,1.0,0.141592
4,10.0,0.228786


In [31]:
min_alpha = result_df.sort_values('rmse', ascending=True).iloc[0]['alpha']
print('Alpha value that gives the best RMSE: {}'.format(min_alpha))

Alpha value that gives the best RMSE: 0.01
