In [None]:
import pandas as pd
import seaborn
import sklearn
import sklearn.ensemble
import sklearn.compose
from sklearn.preprocessing import *
from matplotlib import pyplot as plt
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import os

In [None]:
# Load the raw automobile table and show it in full.
data = pd.read_csv("data/Automobile_data.csv")
display(data)
# Cell output: the rows whose normalized-losses entry is the '?' placeholder.
data[data["normalized-losses"].eq("?")]

In [None]:
# Turn the '?' placeholder into None. The mapping form is used on purpose:
# with the positional form `replace('?', None)`, pandas releases that treat
# value=None as "not given" fall back to a pad/forward fill instead of
# inserting None, which would silently invent values.
data_with_none = data.replace({'?': None})

def find_meaning(df, grouping, column):
    """Return the mean of ``column`` for every value of ``grouping``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    grouping : str
        Name of the column whose distinct values define the groups.
    column : str
        Name of the column to average. Values are converted to float, so
        numeric strings are accepted; a non-numeric string raises ValueError.

    Returns
    -------
    dict
        Maps each group value to the mean of its non-missing ``column``
        values, in order of first appearance. Groups that have no
        non-missing value are left out.
    """
    # Missing entries (None or NaN) become NaN here and are dropped below, so
    # they neither contribute to a mean nor create an empty group.
    numeric = pd.to_numeric(df[column]).astype(float)
    observed = df.assign(**{column: numeric}).dropna(subset=[column])
    # sort=False keeps the groups in the order they first occur in df.
    group_means = observed.groupby(grouping, sort=False)[column].mean()
    return group_means.to_dict()

# Per-group averages used later for imputation:
#   normalized-losses averaged per symboling value,
#   price averaged per curb-weight value.
mean_symbols, mean_curb = (
    find_meaning(data_with_none, group_column, value_column)
    for group_column, value_column in (
        ("symboling", "normalized-losses"),
        ("curb-weight", "price"),
    )
)
display(mean_symbols, mean_curb)

In [None]:

def replace_with_meaning(row):
    """Fill a row's missing normalized-losses and price from the group means.

    Reads the module-level dicts ``mean_symbols`` (keyed by symboling) and
    ``mean_curb`` (keyed by curb-weight).

    Parameters
    ----------
    row : pandas.Series
        One record with the keys "normalized-losses", "symboling", "price"
        and "curb-weight".

    Returns
    -------
    pandas.Series
        A copy of ``row``. A missing value (None or NaN) is replaced by the
        mean of its group; when the group has no mean the value is set to
        None so the caller can drop the row.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    filled = row.copy()

    if pd.isna(filled["normalized-losses"]):
        filled["normalized-losses"] = mean_symbols.get(filled["symboling"])
    if pd.isna(filled["price"]):
        filled["price"] = mean_curb.get(filled["curb-weight"])
    return filled

# Impute row by row, then discard every row that still has a missing value.
filtered_data = (
    data_with_none
    .apply(replace_with_meaning, axis=1)
    .dropna()
)
filtered_data

In [None]:
# NOTE: plain assignment, not a copy -- changed_data and filtered_data are the
# same object, so every column assignment below also shows up in filtered_data.
changed_data = filtered_data

# Rescale symboling with min-max scaling.
symboling_values = changed_data['symboling'].values.reshape((-1, 1))
changed_data['symboling'] = MinMaxScaler().fit_transform(symboling_values)

# Integer-encode each categorical column with its own LabelEncoder.
for column_name in (
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'fuel-system',
    'num-of-cylinders',
):
    changed_data[column_name] = LabelEncoder().fit_transform(changed_data[column_name].values)

# Convert the two imputed columns to floats element by element.
for column_name in ('price', 'normalized-losses'):
    changed_data[column_name] = changed_data[column_name].apply(float)

display(changed_data)
changed_data["engine-location"].unique()

In [None]:
# Keep only the rows priced below the 90th percentile.
# The mask is computed from the frame being filtered (changed_data), whose
# price column was converted to float above, rather than from a second name
# that is numeric only because it aliases the same object.
price_cutoff = changed_data["price"].quantile(0.9)
cleaned_auto_data = changed_data[changed_data["price"] < price_cutoff]
cleaned_auto_data

In [None]:

# Still defined for any other cell that reads it.
columns_array = changed_data.columns

# Feature matrix: every column except the target. Feature names are taken from
# this frame rather than from a positional slice of columns_array, so the
# result no longer depends on "price" being the last column.
variance_features = cleaned_auto_data.drop(columns=["price"])

selection = VarianceThreshold(threshold=(.8 * (1 - 0.8)))
selected_data = selection.fit_transform(variance_features, cleaned_auto_data["price"])
selected_features = selection.get_feature_names_out(variance_features.columns)
selected_data = pd.DataFrame(selected_data, columns=selected_features)

# get_support() is the boolean keep-mask of the fitted selector; its inverse
# picks out the features that were removed.
deleted_features = np.array(variance_features.columns[~selection.get_support()], dtype=str)

display(pd.DataFrame(selected_features))
display(pd.DataFrame(deleted_features))
display(selected_data)

In [None]:
# Feature matrix: every column except the target. Names come from this frame,
# so the cell does not depend on a variable from another cell or on "price"
# being the last column.
kbest_features = cleaned_auto_data.drop(columns=["price"])

# NOTE(review): chi2 is documented for class-label targets, while price is a
# float column here -- a regression score such as f_regression may be the
# better fit. TODO confirm the intended scoring function.
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2, k=5)
changed_data_kbest = kbest.fit_transform(kbest_features, cleaned_auto_data["price"])
selected_features = kbest.get_feature_names_out(kbest_features.columns)
changed_data_kbest = pd.DataFrame(changed_data_kbest, columns=selected_features)
changed_data_kbest

In [None]:
# Feature matrix (everything except the target), built once and reused for
# both fitting and transforming.
forest_features = cleaned_auto_data.drop(["price"], axis=1)

# random_state is fixed so the fitted forest, the selected features and the
# importance values are the same on every run of the notebook.
tree_classifier = sklearn.ensemble.RandomForestRegressor(n_estimators=50, random_state=42)
tree_classifier.fit(forest_features, cleaned_auto_data["price"])

# prefit=True: the selector wraps the already-fitted forest instead of
# refitting it.
model = sklearn.feature_selection.SelectFromModel(tree_classifier, prefit=True)
forest_selected_data = model.transform(forest_features)
forest_selected_data

In [None]:
# Horizontal bar chart of the forest's feature importances, smallest at the
# bottom (argsort is ascending and bar 0 is drawn lowest).
feature_importances = tree_classifier.feature_importances_
indices = np.argsort(feature_importances)
feature_names = np.array(cleaned_auto_data.drop(["price"], axis=1).columns)

# Explicit Axes instead of the implicit pyplot state, plus a title and an axis
# label so the figure can be read on its own.
fig, ax = plt.subplots()
ax.barh(range(len(indices)), feature_importances[indices])
ax.set_yticks(range(len(indices)))
ax.set_yticklabels(feature_names[indices])
ax.set_xlabel("Feature importance")
ax.set_title("Random forest feature importances")
plt.show()

In [None]:
# Pairwise Pearson correlation (the pandas default) between the columns of the
# filtered data, shown as a table and then as a heatmap.
correlation = cleaned_auto_data.corr(method="pearson")
display(correlation)
seaborn.heatmap(data=correlation)