In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from math import sqrt

# Read the dataset into a DataFrame
data = pd.read_csv("D:\Machine Learning Zoomcamp\Week3-Homework\data.csv")

# Select the desired features and perform data preparation
selected_features = [
    'Make',
    'Model',
    'Year',
    'Engine HP',
    'Engine Cylinders',
    'Transmission Type',
    'Vehicle Style',
    'highway MPG',
    'city mpg',
    'MSRP'
]

numerical_features = [
    'Year',
    'Engine HP',
    'Engine Cylinders',
    'highway MPG',
    'city mpg',
    'price'
]

data = data[selected_features].copy()

data.fillna(0, inplace=True)
data.rename(columns={'MSRP': 'price'}, inplace=True)

# Question 1: Most frequent observation (mode) for transmission_type
mode_transmission_type = data['Transmission Type'].mode()[0]
print("Question 1: Most frequent observation for transmission_type:", mode_transmission_type)

# Question 2: Correlation between numerical features
# Calculate the correlation matrix
correlation_matrix = data[numerical_features].corr()


# Find the two features with the biggest correlations (excluding self-correlation)
max_corr_pairs = correlation_matrix.unstack().sort_values(ascending=False)
max_corr_pairs = max_corr_pairs[max_corr_pairs < 1.0]  # Exclude self-correlation

# Get the feature names with the biggest correlations
(feature1, feature2), correlation_value = max_corr_pairs.index[0], max_corr_pairs.iloc[0]

print("Question 2: The two features with the biggest correlation are:", feature1, "and", feature2)


# Question 3: Mutual information scores for categorical variables
X = data.drop(columns=['price'])
y = (data['price'] > data['price'].mean()).astype(int)

categorical_features = ['Make', 'Model', 'Transmission Type', 'Vehicle Style']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[categorical_features])

# Calculate the mutual information scores for categorical variables
mi_scores = mutual_info_classif(X_encoded, y, discrete_features='auto', random_state=42)

# Get the feature names for one-hot encoded variables
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Create a DataFrame to store the results
mi_scores_df = pd.DataFrame({'Feature': encoded_feature_names, 'Mutual_Info_Score': mi_scores})

# Find the variable with the lowest mutual information score
lowest_mi_variable = mi_scores_df.loc[mi_scores_df['Mutual_Info_Score'].idxmin()]
print("Question 3: Variable with the lowest mutual information score:", lowest_mi_variable['Feature'])

# Question 4: Logistic regression model
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Question 4: Accuracy on the validation dataset:", round(accuracy, 2))

# Question 5: Feature elimination
original_accuracy = accuracy
feature_differences = []

for feature in categorical_features:
    reduced_features = [f for f in categorical_features if f != feature]
    X_train_reduced = X_train[:, [i for i, f in enumerate(categorical_features) if f != feature]]
    X_val_reduced = X_val[:, [i for i, f in enumerate(categorical_features) if f != feature]]
    model.fit(X_train_reduced, y_train)
    y_pred_reduced = model.predict(X_val_reduced)
    reduced_accuracy = accuracy_score(y_val, y_pred_reduced)
    feature_differences.append((feature, original_accuracy - reduced_accuracy))

min_difference_feature = min(feature_differences, key=lambda x: x[1])
print("Question 5: Feature with the smallest difference:", min_difference_feature[0])

# Question 6: Ridge regression with logarithmic transformation
X = data.drop(columns=['price'])
y = np.log(data['price'])
X_encoded = encoder.transform(X[categorical_features])
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
alphas = [0, 0.01, 0.1, 1, 10]
best_rmse = float('inf')
best_alpha = None

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha, solver='sag', random_state=42)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_val)
    rmse = sqrt(mean_squared_error(y_val, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

print("Question 6: Best alpha for Ridge regression:", best_alpha)




Question 1: Most frequent observation for transmission_type: AUTOMATIC
Question 2: The two features with the biggest correlation are: city mpg and highway MPG
Question 3: Variable with the lowest mutual information score: Make_Alfa Romeo
Question 4: Accuracy on the validation dataset: 0.93
Question 5: Feature with the smallest difference: Make
Question 6: Best alpha for Ridge regression: 0.1
