In [32]:
# #####################
# # sklearn
# #####################

# 'PreProcessing'
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, KFold, StratifiedKFold, GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

# 'Models'
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler





# 'Others'
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Scaling and Normalizing Features 

In [None]:
import numpy as np
# Small synthetic dataset used to demonstrate the three scaling APIs below.
np.random.seed(0) # Seed NumPy's global RNG so the same random numbers are generated on every run.
X_train = np.random.rand(5, 3) * 10  # Training data: 5 samples x 3 features, values in [0, 10)
X_test = np.random.rand(3, 3) * 10   # Test data: 3 samples x 3 features, values in [0, 10)


# 1. Using preprocessing.scale (function)
# Standardize each column to mean 0 and standard deviation 1.
# NOTE: this only shifts and rescales the columns; it does not change the shape of their
# distribution (the result is not made normally distributed).
X_scaled = preprocessing.scale(X_train)  # Standardized training data
print("Scaled Training Data:\n", X_scaled)


# 2. Using StandardScaler (class with Transformer API)
# Standardize features by removing the mean and scaling to unit variance.
# The scaler learns the mean and standard deviation from the training data (fit) and applies
# that same transformation to both the training and the test data (transform).
standardizer = preprocessing.StandardScaler()

# Fit the scaler on training data and transform it; the test data reuses the training statistics.
X_train_standardized = standardizer.fit_transform(X_train)
X_test_standardized = standardizer.transform(X_test)

print("\nStandardized Training Data:\n", X_train_standardized)
print("\nStandardized Test Data:\n", X_test_standardized)

# 3. Using Normalizer (class with Transformer API)
# Normalize vectors to unit norm (l2 norm by default).
# Normalizer is used here without calling fit(); transform() is applied directly.
normalizer = preprocessing.Normalizer()  # No fit step is performed on this object

# Normalizer rescales each ROW it is given. Transposing before and after the call therefore
# rescales each FEATURE COLUMN to unit norm instead of each sample.
# NOTE(review): the test columns are rescaled by their own norms, not the training norms - confirm intended.
X_train_normalized = normalizer.transform(X_train.T).T  # Column-wise unit-norm training data
X_test_normalized = normalizer.transform(X_test.T).T    # Column-wise unit-norm test data



print("\nNormalized Training Data:\n", X_train_normalized)
print("\nNormalized Test Data:\n", X_test_normalized)


In [13]:
import pandas as pd

# Read the cleaned automobile dataset directly from its hosted CSV (requires network access).
path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/module_5_auto.csv'
df = pd.read_csv(path)

In [14]:
# Keep a local copy of the downloaded data. index=False stops pandas from writing the row index
# as an extra unnamed column, which would otherwise reappear as a feature when the file is re-read.
df.to_csv('module_5_auto.csv', index=False)

 First, let's only use numeric data:


In [None]:
# Keep only numeric (and boolean) columns using the public pandas API rather than the
# private DataFrame._get_numeric_data() helper, which may change without notice.
df = df.select_dtypes(include=['number', 'bool'])
df.head()

In [None]:
# Show the column labels that remain in df.
df.columns

<h2 id="ref1"> Training and Testing</h2>


Train/test splitting is a method used to divide the data into training and test sets. You use the training set to train a model and discover possible predictive relationships, and then use the test set to evaluate the model's performance on data it has not seen.

In [None]:
# An important step in testing a model is to hold out data it never saw during training.
# Rows with missing values are dropped before the split.
df = df.dropna()

# The target (price) goes into y_data; every other column stays in x_data as a candidate feature.
y_data = df['price']
x_data = df.drop('price', axis=1)

# Randomly hold out 10% of the rows for testing; random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)

print("number of test samples :", x_test.shape[0])
print("number of training samples:",x_train.shape[0])

# Simple linear regression of price on the single feature "horsepower".
lre = LinearRegression()
lre.fit(x_train[['horsepower']], y_train)

# Print both scores explicitly: a notebook cell only auto-displays its LAST expression,
# so a bare lre.score(...) in the middle of the cell would be computed and silently dropped.
print("R^2 on test data    :", lre.score(x_test[['horsepower']], y_test))
print("R^2 on training data:", lre.score(x_train[['horsepower']], y_train))

Sometimes you do not have sufficient testing data; as a result, you may want to perform cross-validation. Let's go over several methods that you can use for cross-validation. 


## Cross-Validation
## The table below compares several approaches to cross-validation; they differ in implementation, workflow, and output.




| Feature                          | `KFold` (Manual)                         | `StratifiedKFold`                        | `cross_val_score` / `cross_val_predict` | `KNN Regression`                       |
|----------------------------------|------------------------------------------|------------------------------------------|-----------------------------------------|----------------------------------------|
| **Control**                      | Full control over splitting, training, and evaluation. | Preserves class distribution across folds; control over splitting. | Limited control—optimized for simplicity. | Predictions rely on \( k \)-nearest neighbors; less overall control. |
| **Ease of Use**                  | Requires more manual steps and custom logic. | Requires some manual handling like `KFold`. | Single function call for scoring or prediction. | Requires scaling and careful parameter tuning. |
| **Output**                       | Metrics (like MSE) need to be computed manually for each fold. | Metrics computed manually; suitable for classification. | Directly returns scores (e.g., R², MSE) or predictions. | Predicts using the average of neighbors’ target values. |
| **Flexibility**                  | Can include custom logic, transformations, or metrics. | Suitable for imbalanced classification problems. | Limited to the metrics and options provided by `scikit-learn`. | Supports multiple distance metrics; adaptable to non-linear relationships. |
| **Performance Prediction**       | Requires separate prediction logic.      | Requires separate prediction logic.      | `cross_val_predict` generates predictions for the entire dataset. | Provides instance-specific predictions based on nearest neighbors. |
| **Typical Use Cases**            | Advanced or custom workflows.            | Classification with imbalanced datasets. | Standard cross-validation workflows.     | Localized predictions for regression tasks, especially with non-linear data. |



<h2>KFold for Cross-Validation</h2>


In [None]:
# Manual K-fold cross-validation of a linear regression on the training split,
# followed by a single evaluation on the untouched hold-out test split.
K = 5  # Number of folds
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# One estimator object is reused; fit() discards any previous fit each time it is called.
model = LinearRegression()

# Validation MSE of each fold
mse_scores = []

for i, (train_index, val_index) in enumerate(kf.split(x_train)):
    print(f"Fold {i+1}")

    # kf.split yields positional indices, hence .iloc rather than .loc
    x_fold_train, x_fold_val = x_train.iloc[train_index], x_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train on K-1 folds, validate on the remaining fold
    model.fit(x_fold_train, y_fold_train)
    y_val_pred = model.predict(x_fold_val)

    mse = mean_squared_error(y_fold_val, y_val_pred)
    mse_scores.append(mse)

    print(f"MSE for Fold {i+1}: {mse:.4f}\n")

# Report average MSE across all folds
average_mse = np.mean(mse_scores)
print(f"Average Mean Squared Error across {K} folds: {average_mse:.4f}")

# Refit on the FULL training set before the final evaluation. Without this, `model` is whatever
# the last fold left behind, i.e. a model trained on only (K-1)/K of the training data.
model.fit(x_train, y_train)

# Evaluate on the hold-out test set
y_test_pred = model.predict(x_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Mean Squared Error on Hold-Out Test Set: {test_mse:.4f}")

<h2>StratifiedKFold</h2>

In [None]:
# Derive a binary classification target from the continuous price: pd.qcut with q=2 splits
# the values into two quantile bins, labelled 0 (lower prices) and 1 (higher prices).
df['price_category'] = pd.qcut(df['price'], q=2, labels=[0, 1])

# Classification features and target. These are deliberately NOT stored in x_data / y_data:
# those names hold the regression inputs (target = price) that later cells reuse, and
# overwriting them here would make those cells silently regress on the 0/1 label instead.
x_data_clf = df.drop(['price', 'price_category'], axis=1)
y_data_clf = df['price_category']

# One-hot encode categorical features if any (a no-op for purely numerical data)
x_data_clf = pd.get_dummies(x_data_clf, drop_first=True)

# StratifiedKFold keeps the 0/1 class proportions roughly equal in every fold
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Initialize model
model = LogisticRegression(max_iter=1000)

# Accuracy of each fold
accuracy_scores = []

# The target is passed to split() because stratification depends on the class labels
for i, (train_index, test_index) in enumerate(skf.split(x_data_clf, y_data_clf)):
    print(f"Fold {i+1}")

    # Create train and test sets for this fold (positional indices, hence .iloc)
    X_train, X_test = x_data_clf.iloc[train_index], x_data_clf.iloc[test_index]
    Y_train, Y_test = y_data_clf.iloc[train_index], y_data_clf.iloc[test_index]

    # Train on the fold's training part and predict its held-out part
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(Y_test, Y_pred)
    accuracy_scores.append(accuracy)

    print(f"Accuracy for Fold {i+1}: {accuracy:.4f}\n")

# Report average accuracy across all folds
average_accuracy = sum(accuracy_scores) / K
print(f"Average Accuracy across {K} folds: {average_accuracy:.4f}")

<h2>Cross-Validation Score</h2>
 Cross-validation works by splitting the data into folds: some of the folds are used as a training set to train the model, and the remaining fold is used as a test set to evaluate it. You iterate through the folds until every partition has been used for both training and testing. At the end, you average the results to estimate the out-of-sample error.

In [None]:
# cross_val_score takes the estimator, the feature ("horsepower") and the target (y_data).
# The parameter 'cv' sets the number of folds; here it is 4.
Rcross = cross_val_score(lre, x_data[['horsepower']], y_data, cv=4)

# With no 'scoring' argument the estimator's own score method is used (R^2 for LinearRegression).
# The array holds one score per fold. It is printed explicitly because a bare expression in the
# middle of a cell is never displayed.
print("R^2 per fold:", Rcross)

# Average and standard deviation of the estimate:
print("The mean of the folds are", Rcross.mean(), "and the standard deviation is" , Rcross.std())

# scikit-learn scorers follow "higher is better", so MSE is exposed as 'neg_mean_squared_error';
# multiplying by -1 recovers the ordinary (positive) mean squared error of each fold.
mse_per_fold = -1 * cross_val_score(lre,x_data[['horsepower']], y_data,cv=4,scoring='neg_mean_squared_error')
print("MSE per fold:", mse_per_fold)

# cross_val_predict returns, for every row, the prediction made when that row was in the held-out fold.
yhat = cross_val_predict(lre,x_data[['horsepower']], y_data,cv=4)
yhat[0:5]

<h2>K-Nearest Neighbors (KNN)</h2>
KNN classifier is a versatile and straightforward machine learning algorithm used for both classification and regression tasks. The core idea of KNN revolves around identifying the closest data points in the feature space to make predictions.

In [None]:
X = df.drop(columns=['price', 'price_category'], errors='ignore')  # Features
y = df['price_category']  # Target (classification)

# Split data into training and testing sets.
# The split is stored under *_clf names so it does not overwrite y_train / y_test from the
# regression split: later cells pair those with x_train / x_test, and overwriting only the
# targets would leave features and targets with different lengths.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================
# Initialize KNN Classifier
# =====================
k = 5  # Number of neighbors
knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', metric='euclidean')

# Train the model
knn.fit(X_train_clf, y_train_clf)

# Make predictions on the test set
y_pred = knn.predict(X_test_clf)

# =====================
# Evaluate the Model
# =====================
# Accuracy Score
accuracy = accuracy_score(y_test_clf, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_test_clf, y_pred)
print("\nConfusion Matrix:")
print(cm)

# =====================
# Cross-Validation for Performance Estimation
# =====================
# Uses the full X / y; cross_val_score does its own 5-fold splitting.
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")



<h2 id="ref2"> Overfitting, Underfitting and Model Selection</h2>

<p>It turns out that the test data, sometimes referred to as the "out of sample data", is a much better measure of how well your model performs in the real world.  One reason for this is overfitting.

Let's go over some examples. It turns out these differences are more apparent in Multiple Linear Regression and Polynomial Regression so we will explore overfitting in that context.</p>


In [None]:
# Multiple linear regression of price on four features.
# The feature list is named once so the fit and both predictions cannot drift apart.
mlr_features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']

lr = LinearRegression()
lr.fit(x_train[mlr_features], y_train)

# Prediction using training data. Printed explicitly: only the last expression of a cell
# is auto-displayed, so a bare preview in the middle of the cell would be lost.
yhat_train = lr.predict(x_train[mlr_features])
print("First training predictions:", yhat_train[0:5])

# Prediction using test data:
yhat_test = lr.predict(x_test[mlr_features])
yhat_test[0:5]

Let's perform some model evaluation using our training and testing data separately. 


In [33]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel density estimates of two samples on one figure.

    Parameters
    ----------
    RedFunction : values plotted as the red density curve.
    BlueFunction : values plotted as the blue density curve.
    RedName : legend label for the red curve.
    BlueName : legend label for the blue curve.
    Title : figure title.

    The figure is shown and then closed; nothing is returned.
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    # Draw both curves on the same axes so they can be compared directly.
    ax = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    # The label= arguments above only take effect once a legend is drawn.
    plt.legend()
    plt.show()
    plt.close()


# Figure 1: actual vs. predicted price distribution on the training data.
Title = 'Distribution  Plot of  Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(
    RedFunction=y_train,
    BlueFunction=yhat_train,
    RedName="Actual Values (Train)",
    BlueName="Predicted Values (Train)",
    Title=Title,
)

Figure 1: Plot of the predicted values using the training data compared to the actual values of the training data. So far, the model seems to be doing well in learning from the training dataset. But what happens when the model encounters new data from the testing dataset? When the model generates new values from the test data, we see that the distribution of the predicted values is much different from the actual target values.


In [None]:
# Figure 2: actual vs. predicted price distribution on the test data.
Title = 'Distribution  Plot of  Predicted Value Using Test Data vs Data Distribution of Test Data'
DistributionPlot(
    RedFunction=y_test,
    BlueFunction=yhat_test,
    RedName="Actual Values (Test)",
    BlueName="Predicted Values (Test)",
    Title=Title,
)

<p>Comparing Figure 1 (training data) and Figure 2 (test data), it is evident that the predicted distribution in Figure 1 fits the actual data much better. The difference in Figure 2 is most apparent in the range of 5,000 to 15,000, where the shapes of the two distributions are extremely different. Let's see if polynomial regression also exhibits a drop in prediction accuracy when analysing the test dataset.</p>


## Overfitting

Overfitting occurs when the model fits the noise, but not the underlying process. Therefore, when testing your model using the test set, your model does not perform as well since it is modelling noise, not the underlying process that generated the relationship. Let's create a degree 5 polynomial model.


In [None]:
# Use 55 percent of the data for training and the remaining 45 percent for testing.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)

# Degree-5 polynomial expansion of the single feature 'horsepower'.
# The transformer is fitted on the training data only; the test data is passed through
# transform() so that no preprocessing step is ever refit on held-out data.
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train[['horsepower']])
x_test_pr = pr.transform(x_test[['horsepower']])

# Linear regression on the expanded features = polynomial regression in horsepower.
poly = LinearRegression()
poly.fit(x_train_pr, y_train)

# Predictions for the test set.
yhat = poly.predict(x_test_pr)

# Compare the first four predicted values with the actual targets.
print("Predicted values:", yhat[0:4])
print("True values:", y_test[0:4].values)

We will use the function "PollyPlot", defined in the next cell, to display the training data, testing data, and the predicted function.


In [42]:
def PollyPlot(xtrain, xtest, y_train, y_test, lr,poly_transform):
    """Plot training points, test points and the fitted polynomial curve.

    Parameters
    ----------
    xtrain, xtest : feature values for the training and test samples; both must expose ``.values``.
    y_train, y_test : the corresponding targets.
    lr : fitted regression object whose ``predict`` is evaluated on the transformed grid.
    poly_transform : polynomial transformation object; ``fit_transform`` is called on the
        plotting grid, so the object is refit as a side effect.
    """
    plt.figure(figsize=(12, 10))

    # Evaluate the curve on an evenly spaced grid spanning both samples (step 0.1).
    upper = max([xtrain.values.max(), xtest.values.max()])
    lower = min([xtrain.values.min(), xtest.values.min()])
    grid = np.arange(lower, upper, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')

    # Expand the grid into polynomial features (as a single-column 2-D array) and predict.
    grid_features = poly_transform.fit_transform(grid.reshape(-1, 1))
    plt.plot(grid, lr.predict(grid_features), label='Predicted Function')

    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()

In [None]:
# Figure 3: degree-5 polynomial fit of price against horsepower.
PollyPlot(
    xtrain=x_train['horsepower'],
    xtest=x_test['horsepower'],
    y_train=y_train,
    y_test=y_test,
    lr=poly,
    poly_transform=pr,
)

Figure 3: A polynomial regression model where red dots represent training data, green dots represent test data, and the blue line represents the model prediction. 


We see that the estimated function appears to track the data but around 200 horsepower, the function begins to diverge from the data points. 


In [None]:
# R^2 of the degree-5 polynomial model on the training and on the test data.
# Both are printed: a cell only auto-displays its last expression, so the training
# score would otherwise be computed and never shown.
print("R^2 on training data:", poly.score(x_train_pr, y_train))
print("R^2 on test data    :", poly.score(x_test_pr, y_test))

We see the R^2 for the training data is 0.5567 while the R^2 on the test data was -29.87.  The lower the R^2, the worse the model. A negative R^2 is a sign of overfitting.


Let's see how the R^2 changes on the test data for different order polynomials and then plot the results:


In [None]:
# Test-set R^2 of polynomial regressions of increasing order on 'horsepower'.
Rsqu_test = []

order = [1, 2, 3, 4]
for n in order:
    # Loop-local names are used so this sweep does not overwrite the degree-5 objects
    # (pr, x_train_pr, x_test_pr) or the multiple-regression model lr from earlier cells.
    poly_n = PolynomialFeatures(degree=n)

    # Fit the transformer on the training data only; the test data is just transformed.
    x_train_poly = poly_n.fit_transform(x_train[['horsepower']])
    x_test_poly = poly_n.transform(x_test[['horsepower']])

    lr_n = LinearRegression()
    lr_n.fit(x_train_poly, y_train)

    Rsqu_test.append(lr_n.score(x_test_poly, y_test))

plt.plot(order, Rsqu_test)
plt.xlabel('order')
plt.ylabel('R^2')
plt.title('R^2 Using Test Data')
# NOTE(review): the annotation position (3, 0.75) is hard-coded - confirm it still marks the peak.
plt.text(3, 0.75, 'Maximum R^2 ')



We see the R^2 gradually increases until an order three polynomial is used. Then, the R^2 dramatically decreases at an order four polynomial.


## Ridge Regression
You should use ridge regression when there is a strong relationship among the independent variables.  
Ridge regression prevents overfitting.
Ridge regression controls the magnitude of polynomial coefficients by introducing a hyperparameter, alpha. 


To determine alpha, you divide your data into training and validation data. Starting with a small value for alpha, you train the model, make a prediction using the validation data, then calculate the R-squared and store the value. You repeat the process for progressively larger values of alpha, training the model and making a prediction each time. You select the value of alpha that maximizes R-squared.

The highest alpha value is usually the model with the most underfitting. 
OR
The lowest alpha value is usually the model with the most overfitting.

 Let's perform a degree two polynomial transformation on our data. 


In [None]:
# Degree-2 polynomial expansion of six features, used as input for the Ridge models below.
ridge_features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']

pr = PolynomialFeatures(degree=2)
# Fit the transformer on the training data only; the test data is passed through transform()
# so that no preprocessing step is refit on held-out data.
x_train_pr = pr.fit_transform(x_train[ridge_features])
x_test_pr = pr.transform(x_test[ridge_features])

Let's create a Ridge regression object, setting the regularization parameter (alpha) to 1 


In [48]:
# Ridge regressor with alpha=1. The spelling "RigeModel" is kept because later cells use this name.
RigeModel=Ridge(alpha=1)

Like regular regression, you can fit the model using the method <b>fit</b>.


In [None]:
# Fit the Ridge model on the polynomial training features x_train_pr.
RigeModel.fit(x_train_pr, y_train)

 Similarly, you can obtain a prediction: 


In [50]:
# Predict targets for the polynomial test features; note this overwrites the earlier yhat.
yhat = RigeModel.predict(x_test_pr)

Let's compare the first four predicted samples to our test set: 


In [None]:
# Side-by-side look at the first four Ridge predictions and the corresponding true targets.
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)

We select the value of alpha that minimizes the test error. To do so, we can use a for loop. We have also created a progress bar to see how many iterations we have completed so far.


In [None]:
from tqdm import tqdm

# Sweep alpha over 0, 10, 20, ..., 9990 and record R^2 on the test and training features.
Rsqu_test, Rsqu_train, dummy1 = [], [], []
Alpha = 10 * np.array(range(1000))
pbar = tqdm(Alpha)

for alpha in pbar:
    # A fresh model per alpha; after the loop RigeModel is the one fitted with the last alpha.
    RigeModel = Ridge(alpha=alpha)
    RigeModel.fit(x_train_pr, y_train)

    test_score = RigeModel.score(x_test_pr, y_test)
    train_score = RigeModel.score(x_train_pr, y_train)

    # Show the current scores next to the progress bar.
    pbar.set_postfix({"Test Score": test_score, "Train Score": train_score})

    Rsqu_test.append(test_score)
    Rsqu_train.append(train_score)

We can plot out the value of R^2 for different alphas: 


In [None]:
# R^2 as a function of alpha for the validation (default colour) and training (red) data.
width, height = 12, 10
plt.figure(figsize=(width, height))

plt.plot(Alpha, Rsqu_test, label='validation data  ')
plt.plot(Alpha, Rsqu_train, 'r', label='training Data ')

plt.xlabel('alpha')
plt.ylabel('R^2')
plt.legend()

**Figure 4**: The blue line represents the R^2 of the validation data, and the red line represents the R^2 of the training data. The x-axis represents the different values of Alpha. 


Here the model is built and tested on the same data, so the training and test data are the same.

The red line in Figure 4 represents the R^2 of the training data. As alpha increases the R^2 decreases. Therefore, as alpha increases, the model performs worse on the training data

The blue line represents the R^2 on the validation data. As the value for alpha increases, the R^2 increases and converges at a point.


<h2 id="ref4"> Grid Search</h2>

## method1: Ridge Regression 


In [None]:
# alpha is a hyperparameter; GridSearchCV automates the search for its best value
# by cross-validating every candidate.
grid_features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']

# Candidate values (each listed once; a duplicate only costs an extra, identical fit).
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]

# Create a Ridge regression object:
RR = Ridge()

# Create a ridge grid search object with 4-fold cross-validation:
Grid1 = GridSearchCV(RR, parameters1, cv=4)

# Fit on the TRAINING split only. Fitting on all of x_data would let the search see the rows
# in x_test, so the test score below would no longer measure performance on unseen data.
Grid1.fit(x_train[grid_features], y_train)

# The estimator refit with the best alpha found during cross-validation:
BestRR = Grid1.best_estimator_

# Score on the held-out test data (printed, since a mid-cell expression is not displayed):
print("R^2 of best Ridge model on test data:", BestRR.score(x_test[grid_features], y_test))

# Train a final model with the selected alpha on all available data:
best_alpha = Grid1.best_params_['alpha']
best_ridge_model = Ridge(alpha=best_alpha)
best_ridge_model.fit(x_data[grid_features], y_data)

## method2: Decision Tree 

In [None]:
# param_grid = {'min_samples_leaf': range(1,50)} #Tuning min_samples_leaf for controlling tree growth.
# cv_grid = GridSearchCV(estimator = dtree, param_grid = param_grid, cv = 5, verbose=2, n_jobs=-1)
# """
# grid search model

# cv_grid.fit(X, y)
# cv_grid.best_estimator_
# cv_grid.cv_results_
# cv_grid.cv_results_['mean_test_score']
# cv_grid.cv_results_['std_test_score']

### **Comparison of LDA and PCA**:

| **Feature**                   | **Linear Discriminant Analysis (LDA)** | **Principal Component Analysis (PCA)** |
|-------------------------------|----------------------------------------|----------------------------------------|
| **Purpose**                    | Supervised dimensionality reduction for classification | Unsupervised dimensionality reduction for capturing variance |
| **Input Data**                 | Requires class labels (supervised)     | No need for class labels (unsupervised) |
| **Assumptions**                | Assumes normal distribution and equal covariance for each class | Assumes no specific distribution, focuses on variance |
| **Goal**                        | Maximizes class separability by finding linear combinations of features | Maximizes the variance of data without considering class labels |
| **Dimensionality Reduction**   | Reduces dimensions while preserving class separability | Reduces dimensions by finding directions with the highest variance |
| **Use Case**                   | Classification tasks, especially when classes are well-separated | Feature reduction, noise removal, and visualization |
| **Output**                     | Projects data into a lower-dimensional space for classification | Projects data into a new feature space based on variance |
| **Interpretability**           | The components have direct class-related meanings | Components are based on variance and may not have direct class interpretations |
| **Data Requirement**           | Requires labeled data for training | Works with unlabeled data |
| **Performance**                | Better performance for classification tasks with well-separated classes | Better for exploratory data analysis and when labels are not available |



### **Example for Linear Discriminant Analysis (LDA)**:
LDA is used when you have labeled data and want to reduce dimensionality while keeping the class separability intact.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# `df_clean` is not defined by any earlier cell, so it is built here from the numeric,
# NaN-free frame `df`. The derived 'price_category' label is dropped if present.
# NOTE(review): assumes `df` is the intended source for df_clean - confirm.
df_clean = df.drop(columns=['price_category'], errors='ignore')

X = df_clean.drop(['symboling'], axis=1)  # Features
y = df_clean['symboling']  # Target variable: 'symboling' for classification

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics learned from the training set only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply Linear Discriminant Analysis (LDA) for dimensionality reduction
lda = LinearDiscriminantAnalysis(n_components=2)  # We use 2 components for 2D visualization
X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

# Create a DataFrame for visualization
tmp_Df = pd.DataFrame(X_train_lda, columns=['LDA Component 1', 'LDA Component 2'])  # Two components
# y_train still carries the original row labels, while tmp_Df has a fresh 0..n-1 index.
# Assigning the Series directly would align on those labels and give NaN or wrong classes,
# so the raw values are assigned positionally instead.
tmp_Df['Class'] = y_train.to_numpy()

# Visualize the LDA components with Seaborn's FacetGrid
sns.set(style="white", palette="muted")
g = sns.FacetGrid(tmp_Df, hue="Class", height=6)
g.map(plt.scatter, 'LDA Component 1', 'LDA Component 2', edgecolor="w", s=100)  # Plot in 2D
g.add_legend()

# Set plot titles and labels
plt.title('LDA: Training Data in 2D', fontsize=16)
plt.xlabel('LDA Component 1')
plt.ylabel('LDA Component 2')

# Show the plot
plt.show()


### **Example for Principal Component Analysis (PCA)**:

PCA is used for dimensionality reduction when you do not have class labels and simply want to reduce the number of features based on variance.


In [None]:

from sklearn.decomposition import PCA

# Standardize the features (statistics learned from the training set only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA for dimensionality reduction (2 components for 2D visualization).
# PCA is unsupervised: the class labels are not passed to fit_transform.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create a DataFrame for visualization
tmp_Df = pd.DataFrame(X_train_pca, columns=['PCA Component 1', 'PCA Component 2'])
# y_train still carries the original row labels, while tmp_Df has a fresh 0..n-1 index.
# Assigning the Series directly would align on those labels and give NaN or wrong classes,
# so the raw values are assigned positionally instead. The labels are used for colouring only.
tmp_Df['Class'] = y_train.to_numpy()

# Visualize the PCA components with Seaborn's FacetGrid
sns.set(style="white", palette="muted")
g = sns.FacetGrid(tmp_Df, hue="Class", height=6)
g.map(plt.scatter, 'PCA Component 1', 'PCA Component 2', edgecolor="w", s=100)  # Plot in 2D
g.add_legend()

# Set plot titles and labels
plt.title('PCA: Training Data in 2D', fontsize=16)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')

# Show the plot
plt.show()


### TO DO

In [None]:
# reg = linear_model.Lars(n_nonzero_coefs=j, fit_path = False, fit_intercept = False, verbose = True)
# """
# lars

# reg.fit(Xtrain,ytrain)
# beta = reg.coef_.ravel()
# """

# with warnings.catch_warnings(): # done to disable all the convergence warnings from elastic net
#     warnings.simplefilter("ignore")
#     model = linear_model.ElasticNetCV(cv=5, l1_ratio = alpha, alphas=lambdas, normalize=True).fit(X, y)

# """
# elastic

# model.alphas_
# model.mse_path_.mean(axis=-1)
# """

# model = LogisticRegression(penalty = 'l1', C = 1/lambda_, solver='liblinear')
# model = model.fit(X_train, y_train)
# """
# LogisticRegression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# .fit(X_train, y_train)
# .predict(X_test)
# .coef_
# """

# dtree = DecisionTreeRegressor()
# dtree = DecisionTreeClassifier()
# dtree = DecisionTreeClassifier(min_samples_leaf=min_sample_leaf_opt)
# dtree=DecisionTreeClassifier(ccp_alpha=0.02040816326530612, criterion='gini')
# """ 
# create a decisiontreeregressor/classifier 
# See week 05: 
# for how to tune the parameter, MinLeaf value, using cross validation.
# or to find the tree size through cost complexity pruning of the best estimator

# dtree.fit(x, y)
# """

# bagging = BaggingClassifier(DecisionTreeClassifier(), bootstrap=True, oob_score = True)
# """
# bagged trees
# """

# from sklearn.tree import plot_tree
# plot_tree(dtree,feature_names = feature_names,filled = True)
# """
# to plot tree

# A little description of the information at each plotted node
# 1. row: The condition
# 2. row: The impurity score of the node
# 3. row: The number of observations at this node
# 4. row: The number of samples for each class at this node
# 5. row: The class by majority voting
# """

# sk.metrics.log_loss
# """
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html
# """


# clf = RandomForestClassifier(bootstrap=True, oob_score=True, criterion = 'gini',random_state=0)
# """
# """