## Programming Supplement
### GitHub Link: https://github.com/KKJWang
   **Name: Jingkai Wang**
   
   **My GitHub repositories contain many projects from my undergraduate courses, written in Python, SQL, HTML, and other languages. Because of the 2-page limit, only part of my DS4400 projects is copied below.**

### Import Libraries -- Only part of the libraries is shown here

In [7]:
# Import libraries here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score

### Ingest data from csv

In [None]:
# Read the car-price CSV into a DataFrame
data = pd.read_csv('CarPrice_Assignment.csv')
# Preview the first five rows to inspect the columns and their values
data.head(5)

### Manage different data types and wrangle data -- Part of the data cleaning for the DS4400 project

In [None]:
# Remove the 'car_ID' column in place; it is not used in the analysis.
data.drop(columns=['car_ID'], inplace=True)

# Keep only the text before the first space of 'CarName', i.e. the brand.
# Splitting once (n=1) is enough because only the first piece is kept.
data['CarName'] = (
    data['CarName']
    .str.split(' ', n=1, expand=True)[0]
)

# Preview the first five rows of the updated DataFrame.
data.head(5)

In [None]:
# Map misspelled or abbreviated brand names in 'CarName' to one canonical spelling.
# NOTE(review): 'nissan' -> 'Nissan' leaves this brand capitalised while every
# other brand in the mapping is lower-case -- confirm this is intended.
data['CarName'] = data['CarName'].replace({'maxda': 'mazda', 'nissan': 'Nissan', 'porcshce': 'porsche', 'toyouta': 'toyota',
                            'vokswagen': 'volkswagen', 'vw': 'volkswagen'})
# Convert the 'symboling' column to the string data type so it is treated as categorical.
data['symboling'] = data['symboling'].astype('str')

# Show the unique brand names after the corrections. A bare expression in the
# middle of a notebook cell is evaluated and discarded, so print it explicitly.
print(data['CarName'].unique())

# Collect the names of the columns holding categorical (object dtype) data.
categorical_cols = data.select_dtypes(include=['object']).columns

# Display the first 5 rows of the categorical columns (last expression of the cell).
data[categorical_cols].head(5)

### Write your own function -- From the classification part of DS4400

In [None]:
# Implement the function metrics
def metrics(y, ypred):
    """
    Calculate evaluation metrics for a binary classification model.

    Parameters:
        y (pd.Series): Actual labels
        ypred (pd.Series): Predicted labels
    Returns:
        dict: A dictionary with the keys 'Accuracy', 'Sensitivity',
              'Specificity', 'Precision' and 'F1_Score'
    Raises:
        ValueError: If the confusion matrix does not flatten to exactly four
                    cells (i.e. the labels are not binary).
    """
    # For binary labels the flattened confusion matrix is ordered tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y, ypred).ravel()

    # Accuracy is derived from the counts above: the share of all samples that
    # were classified correctly. This avoids relying on accuracy_score, which
    # is not among the names imported from sklearn.metrics in this file.
    accuracy = (tp + tn) / (tn + fp + fn + tp)
    sensitivity = recall_score(y, ypred)
    # Specificity is the true-negative rate
    specificity = tn / (tn + fp)
    precision = precision_score(y, ypred)
    f1 = f1_score(y, ypred)

    return {'Accuracy': accuracy,
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'Precision': precision,
            'F1_Score': f1}

### Visualize data & Use your function for data analysis -- From Recommendations (end of the DS4400 projects)

In [None]:
def plot_decision_boundary(model, X, y, ax, title):
    """
    Draw a fitted SVM's decision regions, boundary and support vectors on `ax`.

    Parameters:
        model: Fitted classifier exposing `decision_function` and `support_vectors_`
        X: 2-D array; columns 0 and 1 are used as the horizontal and vertical axes
        y: Values used to colour the data points
        ax: Axes object to draw on
        title: Title set on the axes
    """
    first_feature, second_feature = X[:, 0], X[:, 1]

    # Extend each feature range by one unit on both sides and sample a 100 x 100 grid
    x_lo, x_hi = first_feature.min() - 1, first_feature.max() + 1
    y_lo, y_hi = second_feature.min() - 1, second_feature.max() + 1
    grid_x, grid_y = np.meshgrid(
        np.linspace(x_lo, x_hi, 100),
        np.linspace(y_lo, y_hi, 100),
    )

    # Evaluate the decision function (not class labels) at every grid point
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    scores = model.decision_function(grid_points).reshape(grid_x.shape)

    # Boundary line where the decision function is zero, then shaded bands around it
    ax.contour(grid_x, grid_y, scores, levels=[0], alpha=0.5, linestyles=['-'])
    ax.contourf(grid_x, grid_y, scores, levels=[-1, 0, 1], alpha=0.2, colors=['blue', 'gray', 'red'])

    # Hollow markers for the support vectors, then the data points coloured by y
    support = model.support_vectors_
    ax.scatter(support[:, 0], support[:, 1], s=100, facecolors='none', edgecolors='k')
    ax.scatter(first_feature, second_feature, c=y, cmap='autumn')
    ax.set_title(title)

# Keep only the first two feature columns so the boundary can be drawn in 2-D
X_2D_train, X_2D_test = X_train[:, :2], X_test[:, :2]

# Stack the training and test rows (features and targets) into one dataset
X_visualize = np.vstack((X_2D_train, X_2D_test))
y_visualize = np.concatenate((y_train, y_test))

# Build one SVC per kernel with the hyperparameters found by the grid searches,
# then fit each of them on the combined 2-D data
svc_linear_optimal = SVC(kernel='linear', C=grid_search_linear.best_params_['C'])
svc_linear_optimal.fit(X_visualize, y_visualize)

svc_poly_optimal = SVC(kernel='poly', degree=2, C=grid_search_poly.best_params_['C'])
svc_poly_optimal.fit(X_visualize, y_visualize)

svc_rbf_optimal = SVC(
    kernel='rbf',
    C=grid_search_rbf.best_params_['C'],
    gamma=grid_search_rbf.best_params_['gamma'],
)
svc_rbf_optimal.fit(X_visualize, y_visualize)

# One panel per kernel, side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

svm_panels = [
    (svc_linear_optimal, "SVM with Linear Kernel"),
    (svc_poly_optimal, "SVM with Polynomial Kernel"),
    (svc_rbf_optimal, "SVM with RBF Kernel"),
]
for panel_ax, (fitted_svc, panel_title) in zip(axes, svm_panels):
    plot_decision_boundary(fitted_svc, X_visualize, y_visualize, panel_ax, panel_title)

plt.tight_layout()
plt.show()
