In [None]:
import pandas as pd
import plotly.express as px
from google.colab import drive
from pprint import pprint, pformat
# Mount Google Drive at /content/modules.
# NOTE(review): force_remount=True presumably re-mounts an already mounted drive — confirm against the Colab docs.
drive.mount('/content/modules', force_remount=True)
# Read the dataset from the mounted drive into the `cars` DataFrame that the cells below work on.
cars = pd.read_csv("/content/modules/My Drive/cars.csv")
# IPython shell escape (not Python syntax): installs the package that a later cell imports as `category_encoders`.
!pip install category_encoders

In [None]:
# Preliminary checkup: dump the frame's overall structure first, then the
# distinct values found in each column.
for attribute in ('shape', 'columns', 'dtypes'):
    pprint([attribute, getattr(cars, attribute)])

# The printed labels are not uniform (some end in "()"), so each one is spelled
# out next to the column it describes.
for label, column in (
    ('RequestedPrice.unique()', 'RequestedPrice'),
    ('MaintenancePrice.unique', 'MaintenancePrice'),
    ('Doors.unique()', 'Doors'),
    ('Capacity.unique', 'Capacity'),
    ('TrunkSize.unique', 'TrunkSize'),
    ('Safety.unique', 'Safety'),
    ('Buy.unique', 'Buy'),
):
    pprint([label, cars[column].unique()])

In [None]:
# 1.
# The answer is the number of rows in the frame.
print("How many cars does this dataset have?")
print(f"{len(cars)} cars total.")

In [None]:
# 2.
# Count rows by exact comparison on the Buy column. Equality states the intent
# directly (the previous regex match was only emulating it), agrees with the
# `== "yes"` filters used by the plotting cells, and treats a missing value as
# "not equal" instead of producing a NaN entry that cannot be used as a mask.
print("How many cars in this dataset were bought?")
print(f"{(cars.Buy == 'yes').sum()} cars were bought.")
print(f"{(cars.Buy == 'no').sum()} cars were not bought.")

In [None]:
# 3.
# Exact comparison instead of a regex match: same count on clean data, but a
# missing RequestedPrice is simply not counted rather than breaking the mask.
print("How many cars in this dataset have a very high price?")
print(f"{(cars.RequestedPrice == 'vhigh').sum()} cars have a very high price.")

In [None]:
# 4.
# A car is counted when both conditions hold on the same row. Exact comparisons
# are used instead of regex matches so that missing values evaluate to False
# rather than to NaN inside the combined mask.
print("How many of the very high-priced cars were purchased?")
print(f"{((cars.RequestedPrice == 'vhigh') & (cars.Buy == 'yes')).sum()} very high priced cars were purchased.")

In [None]:
# 5.
print("Using Plotly, visualize the relationship between car prices and whether the car is sold or not.")
# The grouped frame holds one row per (price category, purchase outcome) pair
# together with the number of cars in that pair.
px.bar(
    cars.groupby(['RequestedPrice', 'Buy']).size().reset_index(name='Count'),
    x="RequestedPrice", y="Count", color="Buy", barmode="group", title="Car Prices vs. Sale Status",
    # Label keys have to be column names of the plotted frame. The colour column
    # is "Buy" (there is no "Bought" column), so that is the key to rename.
    labels={"RequestedPrice": "Price Category", "Count": "Number of Cars", "Buy": "Sold Status"}
).show()

In [None]:
# 6.
print("Using Plotly, and from the cars that were only sold, use a bar diagram to visualize the relationship between the requested price and maintenance price.")
# Keep only purchased cars, then count them per (requested price, maintenance price) pair.
purchased_cars = cars[cars["Buy"] == "yes"]
price_pair_counts = (
    purchased_cars
    .groupby(["RequestedPrice", "MaintenancePrice"])
    .size()
    .reset_index(name="Count")
)
# One bar group per requested price, one bar per maintenance price inside it.
price_pair_fig = px.bar(
    price_pair_counts,
    x="RequestedPrice",
    y="Count",
    color="MaintenancePrice",
    barmode="group",
    title="Relationship Between Requested Price and Maintenance Price (Sold Cars)",
    labels={"RequestedPrice": "Requested Price", "MaintenancePrice": "Maintenance Price", "Count": "Number of Cars"},
)
price_pair_fig.show()

In [None]:
# 7.
print("Using Plotly, and from the cars that were only sold, visualize the relationship between car safety and maintenance price.")
# Cross-tabulate the purchased cars: rows are safety levels, columns are
# maintenance prices, cells are car counts.
purchased = cars[cars.Buy == "yes"]
safety_by_maintenance = pd.crosstab(purchased.Safety, purchased.MaintenancePrice)
heatmap_axis_titles = {"x": "Maintenance Price", "y": "Car Safety", "color": "Count"}
safety_heatmap = px.imshow(
    safety_by_maintenance,
    text_auto=True,
    color_continuous_scale="Blues",
    labels=heatmap_axis_titles,
    title="Car Safety vs. Maintenance Price for Sold Cars (Heatmap)",
)
safety_heatmap.show()

In [None]:
# 8.
print("Find any other interesting relationship in the data, and visualize it using Plotly.")
# Numeric copy of the Buy column (yes -> 1, no -> 0); it is stored on `cars`
# and drives the continuous colour scale of the diagram below.
cars['Buy_num'] = cars['Buy'].map({"yes": 1, "no": 0})
# Column name -> axis title. The key order is also the left-to-right order of
# the dimensions in the diagram.
dimension_titles = {
    "RequestedPrice": "Requested Price",
    "MaintenancePrice": "Maintenance Price",
    "Doors": "Doors",
    "Capacity": "Capacity",
    "TrunkSize": "Trunk Size",
    "Safety": "Safety",
    "Buy": "Bought",
}
attributes_fig = px.parallel_categories(
    cars,
    dimensions=list(dimension_titles),
    color="Buy_num",
    color_continuous_scale=px.colors.sequential.Inferno,
    labels=dimension_titles,
    title="Relationships Across Car Attributes and Bought",
)
attributes_fig.show()

In [None]:
# 9.
print("Create two variables called X and y, where y stores all the class labels, while X stores all the class features not including the class label.")
# Features: the first six columns, selected by position.
X = cars.iloc[:, :6]
print(X.head())
print()
# Labels: the Buy column.
y = cars['Buy']
print(y.head())

In [None]:
# 10.
# Next, we want to convert the data in X from string to integer. We are going to use a library called category_encoders
# to do that (it is installed by the first cell of this notebook).
from category_encoders import OrdinalEncoder
# Features: every string category is replaced by an integer code chosen by the encoder.
X = OrdinalEncoder().fit_transform(X)
# Labels: encoded with an explicit mapping instead of OrdinalEncoder. The encoder
# returns a one-column DataFrame whose codes are picked by the encoder, so which
# class ends up as "1" is not under our control, while the binary metrics used
# later score the class labelled 1 and the confusion-matrix unpacking treats the
# largest label as positive. Mapping no -> 0 / yes -> 1 makes "yes" the positive
# class for both and keeps y one-dimensional.
# Reading from cars["Buy"] (what y was built from) keeps this cell re-runnable.
y = cars["Buy"].map({"no": 0, "yes": 1})
if y.isna().any():
    # Fail loudly rather than train on NaN labels.
    raise ValueError(
        f"Unexpected values in Buy column: {sorted(cars['Buy'][y.isna()].astype(str).unique())}"
    )
y = y.astype(int)

In [None]:
# 11.
# Divide the dataset into training and test sets. The training set should be 70% while the test set should be 30%.
# Use 53 as the random state.
print("How many rows does the training set have, and how many rows does the testing set have?")
from sklearn.model_selection import train_test_split
# Features and labels are split together so their rows stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=53,
)
# Feature and label counts are printed side by side as a quick alignment check.
print(f"The training set has {len(X_train)} ({len(y_train)}) rows.")
print(f"The testing set has {len(X_test)} ({len(y_test)}) rows.")

In [None]:
# 12.
# Create a logistic regression classifier, then train and test your classifier.
print("What is the accuracy, precision, recall and f1-score of the model on the test data?")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
model     = LogisticRegression(max_iter=1000, random_state=53)
# fit() expects a 1-D label vector. to_numpy().ravel() provides one whether
# y_train is a one-column DataFrame or a Series, which avoids scikit-learn's
# column-vector conversion warning.
model.fit(X_train, y_train.to_numpy().ravel())
y_pred    = model.predict(X_test)
# precision/recall/F1 are binary metrics: with scikit-learn's defaults they
# score the class whose label is 1.
accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall    = recall_score(y_test, y_pred)
f1        = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# 13.
print("What are the true positives, true negatives, false positives, and false negatives?")
from sklearn.metrics import confusion_matrix
# The precision/recall/F1 calls in this notebook use scikit-learn's default
# pos_label=1, so the matrix must also treat label 1 as the positive class.
# confusion_matrix orders rows/columns by sorted label unless `labels` is given,
# which would put label 1 FIRST (i.e. as "negative") when the labels are {1, 2}.
# Passing labels=[negative, positive] pins the layout so ravel() really yields
# tn, fp, fn, tp.
POSITIVE_LABEL = 1
observed_labels = set(pd.unique(y_test.to_numpy().ravel())) | set(pd.unique(y_pred))
negative_labels = sorted(observed_labels - {POSITIVE_LABEL})
if POSITIVE_LABEL not in observed_labels or len(negative_labels) != 1:
    raise ValueError(
        f"Expected binary labels including {POSITIVE_LABEL}, got {sorted(observed_labels)}"
    )
NEGATIVE_LABEL = negative_labels[0]
cm = confusion_matrix(y_test, y_pred, labels=[NEGATIVE_LABEL, POSITIVE_LABEL])
print("Confusion Matrix:")
print(f"(rows = true label, columns = predicted label, label order = [{NEGATIVE_LABEL}, {POSITIVE_LABEL}])")
print(cm)
true_negatives, false_positives, false_negatives, true_positives = cm.ravel()
print(f"True Negatives: {true_negatives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"True Positives: {true_positives}")

In [None]:
# 14.
# Although we did not learn about Support Vector Machines (SVM), let us use this classifier.
# Unlike logistic regression, which only uses a linear equation,
# SVM supports different types of equations, including a linear one.
# These equations in SVM are called kernels.
# RBF (Radial Basis Function) is one of the kernels that is popular in classification.
# RBF requires one parameter called gamma that we need to specify in Sklearn.
# Gamma should have a value from 0 to 1, where a higher value will cause the SVM to fit (memorize) the training dataset, which we do not want to do.
# A gamma value of 0.1 is commonly used. The value of gamma needs to be manually specified in the learning algorithm.
# Create an SVM model with an RBF kernel. Then, train and test your classifier.
print("What is the accuracy, precision, recall, f1-score and confusion matrix of the model?")
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
svm_model = SVC(kernel='rbf', gamma=0.1, random_state=53)
# fit() expects a 1-D label vector. to_numpy().ravel() provides one whether
# y_train is a one-column DataFrame or a Series.
svm_model.fit(X_train, y_train.to_numpy().ravel())
y_pred = svm_model.predict(X_test)
# Binary metrics: with scikit-learn's defaults they score the class labelled 1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:")
print(classification_report(y_test, y_pred))
# The question also asks for the confusion matrix, which was never produced.
# Rows are true labels, columns are predicted labels, both in sorted label order.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# 15.
# Similar to SVMs, we also did not learn about random forest as a classifier but let us try it out.
# Create a random forest model with 100 trees; the number of trees can be specified as a parameter in Sklearn.
# Then, train and test your classifier.
print("What is the accuracy, precision, recall, f1-score and confusion matrix of the model?")
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
rf_model = RandomForestClassifier(n_estimators=100, random_state=53)
# fit() expects a 1-D label vector. to_numpy().ravel() provides one whether
# y_train is a one-column DataFrame or a Series.
rf_model.fit(X_train, y_train.to_numpy().ravel())
y_pred = rf_model.predict(X_test)
# Binary metrics: with scikit-learn's defaults they score the class labelled 1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:")
print(classification_report(y_test, y_pred))
# The question also asks for the confusion matrix, which was never produced.
# Rows are true labels, columns are predicted labels, both in sorted label order.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))