In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

import warnings
warnings.filterwarnings('ignore')

### Data Ingestion

In [2]:
# Reading the csv file
df = pd.read_csv('homework_data.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


### Data Preperation

In [3]:
# Looking at columns name
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [4]:
# List of columns suggested for homework
columns = ['Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Size', 'highway MPG',  'city mpg', 'MSRP']

In [5]:
# Filtering suggested columns
df = df[columns]
df.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Size,highway MPG,city mpg,MSRP
0,2011,335.0,6.0,MANUAL,Compact,26,19,46135
1,2011,300.0,6.0,MANUAL,Compact,28,19,40650
2,2011,300.0,6.0,MANUAL,Compact,28,20,36350
3,2011,230.0,6.0,MANUAL,Compact,28,18,29450
4,2011,230.0,6.0,MANUAL,Compact,28,18,34500


In [6]:
# Converting column names into snake case form
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.head()

Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,vehicle_size,highway_mpg,city_mpg,msrp
0,2011,335.0,6.0,MANUAL,Compact,26,19,46135
1,2011,300.0,6.0,MANUAL,Compact,28,19,40650
2,2011,300.0,6.0,MANUAL,Compact,28,20,36350
3,2011,230.0,6.0,MANUAL,Compact,28,18,29450
4,2011,230.0,6.0,MANUAL,Compact,28,18,34500


In [7]:
# Renaming msrp to price
df = df.rename(columns={'msrp': 'price'})
df.head()

Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,vehicle_size,highway_mpg,city_mpg,price
0,2011,335.0,6.0,MANUAL,Compact,26,19,46135
1,2011,300.0,6.0,MANUAL,Compact,28,19,40650
2,2011,300.0,6.0,MANUAL,Compact,28,20,36350
3,2011,230.0,6.0,MANUAL,Compact,28,18,29450
4,2011,230.0,6.0,MANUAL,Compact,28,18,34500


In [8]:
# Checking data types of columns
df.dtypes

year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_size          object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [9]:
# Checking null values
df.isnull().sum()

year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_size          0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [10]:
# Filling null values with 0
df = df.fillna(0)
df.isnull().sum()

year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_size         0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

### Question 1:
What is the most frequent observation (mode) for the column `transmission_type`?

In [11]:
# Count of each unique value
df.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [12]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

**Observation:** `AUTOMATIC` is most frequent observation.

### Question 2:
Create the `correlation matrix` for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

In [13]:
df[['year', 'engine_cylinders']].corrwith(df.engine_hp)

year                0.338714
engine_cylinders    0.774851
dtype: float64

In [14]:
df[['city_mpg', 'engine_cylinders']].corrwith(df.highway_mpg)

city_mpg            0.886829
engine_cylinders   -0.614541
dtype: float64

**Observation:** `highway_mpg` and `city_mpg` have the biggest correlation.

### Make `price` binary
- Now we need to turn the `price` variable from numeric into a binary format.
- Let's create a variable `above_average` which is `1` if the price is above its mean value and `0` otherwise..

In [15]:
mean_price = df.price.mean()
df['above_average'] = (df['price'] > mean_price).astype(int)
df.head()

Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,vehicle_size,highway_mpg,city_mpg,price,above_average
0,2011,335.0,6.0,MANUAL,Compact,26,19,46135,1
1,2011,300.0,6.0,MANUAL,Compact,28,19,40650,1
2,2011,300.0,6.0,MANUAL,Compact,28,20,36350,0
3,2011,230.0,6.0,MANUAL,Compact,28,18,29450,0
4,2011,230.0,6.0,MANUAL,Compact,28,18,34500,0


### Split the data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
- Make sure that the target value (`above_average`) is not in your dataframe.e.

In [16]:
# Splitting into train_full and test dataset
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Splitting into train and val dataset
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

df_train.shape, df_val.shape, df_test.shape

((7148, 9), (2383, 9), (2383, 9))

In [17]:
# Restting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
# Target variable
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [19]:
# Deleting target from X
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

### Question 3:
- Calculate the mutual information score between `above_average` and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

In [20]:
df_train_full = df_train_full.reset_index(drop=True)
df_train_full

Unnamed: 0,year,engine_hp,engine_cylinders,transmission_type,vehicle_size,highway_mpg,city_mpg,price,above_average
0,2016,265.0,4.0,AUTOMATIC,Large,31,22,53495,1
1,2017,449.0,8.0,AUTOMATIC,Large,18,14,93850,1
2,2016,173.0,4.0,AUTOMATIC,Compact,34,25,19890,0
3,1993,180.0,6.0,MANUAL,Large,16,11,2000,0
4,2008,172.0,6.0,AUTOMATIC,Compact,24,17,21270,0
...,...,...,...,...,...,...,...,...,...
9526,2014,181.0,4.0,AUTOMATIC,Midsize,26,20,27950,0
9527,2009,219.0,6.0,AUTOMATIC,Midsize,26,17,24710,0
9528,2016,220.0,4.0,AUTOMATED_MANUAL,Compact,33,25,27590,0
9529,2009,260.0,4.0,AUTOMATIC,Midsize,27,17,43270,1


In [21]:
df_train_full.dtypes

year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_size          object
highway_mpg            int64
city_mpg               int64
price                  int64
above_average          int32
dtype: object

In [22]:
categorical = ['transmission_type', 'vehicle_size']
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [23]:
# Function to calculate the mutual info score
def mutual_info_price_score(series):
    return mutual_info_score(series, df_train_full.above_average)

In [24]:
df_train_full[categorical].apply(mutual_info_price_score).sort_values(ascending=False)

vehicle_size         0.042072
transmission_type    0.020884
dtype: float64

**Observation:**  `transmission_type` have lowest mutual information score.

### Question 4:
- Now let's train a logistic regression.- 
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.- 
Fit the model on the training datas:
   - .
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these paramete
   - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.gits.

In [25]:
# Defining DictVectorizer
dv = DictVectorizer(sparse=False)

# Traning dataset
train_dicts = df_train[numerical + categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation dataset
val_dicts = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)

X_train.shape, X_val.shape

((7148, 13), (2383, 13))

In [26]:
y_train.shape, y_val.shape

((7148,), (2383,))

In [27]:
# Training the model
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(X_train, y_train)

In [28]:
# Soft prediction
y_pred = model.predict_proba(X_val)[:, 1]
above_average_prediction = (y_pred >= 0.5)
(above_average_prediction == y_val).mean()

0.8887956357532522

### Question 5: 

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.ure.

In [29]:
def train_and_get_accuracy(features):
    # Defining DictVectorizer
    dv = DictVectorizer(sparse=False)
    
    # Traning dataset
    train_dicts = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    
    # Validation dataset
    val_dicts = df_val[features].to_dict(orient='records')
    X_val = dv.fit_transform(val_dicts)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    above_average_prediction = (y_pred >= 0.5)
    return (above_average_prediction == y_val).mean()

In [30]:
list_of_features = numerical + categorical

original_accuracy = train_and_get_accuracy(list_of_features)

differences = {}


for feature in list_of_features:
    # Drop the feature of list of feature
    feature_copy = list_of_features.copy()
    feature_copy.remove(feature)
    
    accuracy_without_feature = train_and_get_accuracy(feature_copy)
    
    difference = original_accuracy - accuracy_without_feature
    differences[feature] = difference

# Find the least useful feature
min(differences)

'city_mpg'

**Observation:**: `city_mpg` is least useful feature.

### Question 6:

- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver `sag`. Set the seed to `42`.
- This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
- Round your `RMSE` scores to 3 decimal digits. ts. 

In [31]:
# Apply logaritmic transformation to price column
df_main = df.copy()
df_main['price'] = np.log1p(df_main['price'])

In [32]:
# Splitting into train_full and test dataset
df_train_full, df_test = train_test_split(df_main, test_size=0.2, random_state=42)

# Splitting into train and val dataset
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Restting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Target variable
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

# Deleting target from X
df_train = df_train.drop(columns = ['price','above_average'])
df_val =  df_val.drop(columns = ['price','above_average'])
df_test = df_test.drop(columns = ['price','above_average'])

In [33]:
# Defining DictVectorizer
dv = DictVectorizer(sparse=False)

# Traning dataset
train_dicts = df_train[numerical + categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation dataset
val_dicts = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)

In [34]:
alphas = [0, 0.01, 0.1, 1, 10]
rmse_scores = {}

for alpha in alphas:
    ridge = Ridge(alpha=alpha, solver='sag', random_state=42)
    ridge.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = ridge.predict(X_val)
    
    # Calculate RMSE and append to the list
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores[alpha] = (round(rmse, 3))

print(rmse_scores)

{0: 0.518, 0.01: 0.518, 0.1: 0.518, 1: 0.518, 10: 0.518}
