# Assignment 2 - B

In [None]:
%pip install numpy pandas matplotlib

## Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Loading Dataset and Preprocessing

In [None]:
# Load the Hitters dataset from 'Hitters.csv' (path is relative to the
# working directory), then show its (rows, columns) shape and first rows.
df = pd.read_csv('Hitters.csv')
print(df.shape)
df.head()

### 59 samples don't have salary

In [None]:
# Report the number of missing values per column, then discard every row
# that contains a missing value and renumber the index from 0.
print(df.isna().sum())
df = df.dropna().reset_index(drop=True)

### Convert Strings to Categorical

In [None]:
df.head()

In [None]:
df['League'].unique()

In [None]:
df['Division'].unique()

In [None]:
df['NewLeague'].unique()

In [None]:
def map_columns(column):
    """Replace the labels of ``df[column]`` with 1-based integer codes.

    Codes are assigned in the order in which the distinct values first
    appear in the column. The global ``df`` is modified in place and the
    label-to-code mapping is printed.
    """
    mapping = {}
    for code, label in enumerate(df[column].unique(), start=1):
        mapping[label] = code
    df[column] = df[column].map(mapping)
    print(f"Mapping for {column}: {mapping}")

# Encode each string-valued column of df in place as integer codes.
columns_to_map = ['League', 'Division', 'NewLeague']
for column in columns_to_map:
    map_columns(column)

In [None]:
df.head()

### EDA

In [None]:
# Number of rows (samples) and columns (features, target included) of df.
samples, features = df.shape
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.corr()

## 2. PCA Analysis

In [None]:
# Separate the regression target (Salary) from the predictor columns.
y = df['Salary']
X = df.drop('Salary', axis=1)

In [None]:
# Standardize the features: zero mean and unit (sample) standard deviation
# per column.
X_standardized = (X - X.mean()) / X.std()

# Build the modelling frame on a COPY. Assigning the target to an alias of
# X_standardized would also add a 'Salary' column to X_standardized itself,
# leaking the target into the covariance matrix computed from it and giving
# the eigenvectors one more row than the feature matrix has columns.
df_pca = X_standardized.copy()
df_pca['Salary'] = y

In [None]:
# Calculate the covariance matrix (columns are variables, rows observations)
covariance_matrix = np.cov(X_standardized, rowvar=False)

# A covariance matrix is symmetric, so use eigh: it is designed for
# symmetric/Hermitian input and always returns real eigenvalues, whereas
# eig may return complex values and guarantees no particular ordering.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Reorder BOTH arrays so that the largest eigenvalue comes first. Later
# cells slice `eigenvectors[:, :k]` and accumulate `eigenvalues` directly,
# so the arrays themselves (not only eig_pairs) must be in descending order.
descending_order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending_order]
eigenvectors = eigenvectors[:, descending_order]

# (eigenvalue, eigenvector) pairs, already in descending eigenvalue order
eig_pairs = list(zip(eigenvalues, eigenvectors.T))

In [None]:
# Share of the total variance carried by each eigenvalue, and its running sum
total_variance = sum(eigenvalues)
explained_variance = list(eigenvalues / total_variance)
cumulative_explained_variance = np.cumsum(explained_variance)

# First component count at which the cumulative share reaches 90%
# (only for representation purposes)
desired_explained_variance = 0.90
num_components = 1 + np.argmax(cumulative_explained_variance >= desired_explained_variance)
num_components

In [None]:
# Plot the cumulative explained variance against the number of components
plt.plot(range(1, len(eigenvalues) + 1), cumulative_explained_variance, marker='o')
# Vertical marker at the component count selected for the variance threshold
plt.axvline(x=num_components, color='r', linestyle='--', label=f'{desired_explained_variance * 100}% Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Number of Components vs Cumulative Explained Variance')
# The axvline label is only rendered if a legend is drawn
plt.legend()
plt.savefig('2B1.png')
plt.show()

## Generate Random Training and Test Sets

In [None]:
# Reproducible random split: 80% of the rows for training, the rest for testing
seed, train_fraction = 420, 0.8
train = df_pca.sample(frac=train_fraction, random_state=seed)
# Every row whose index label was not sampled into the training set
test = df_pca.drop(index=train.index)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# 'Salary' is the prediction target; every other column is a feature.
X_train, X_test = (split.drop(columns='Salary') for split in (train, test))

# Targets as plain NumPy arrays, ready for the hand-written regression code
y_train, y_test = (split['Salary'].to_numpy() for split in (train, test))

## 3. Model Training and RMSE

In [None]:
def fit_linear_regression(x, y, lr=0.00001, e=0.9, max_iter=None):
    """Fit linear-regression weights with batch gradient descent.

    A column of ones (the bias term) is appended to ``x`` and every weight
    starts at 1. Each iteration moves the weights by ``lr`` times the
    unaveraged gradient term ``X.T @ (y - X @ w)``. Iteration stops once the
    L1 norm of that term is no longer greater than ``e``; the update
    computed in that final iteration is still applied.

    Parameters
    ----------
    x : array-like
        Feature matrix with one row per sample.
    y : array-like
        Target values, one per row of ``x``.
    lr : float
        Learning rate (step size).
    e : float
        Convergence threshold on the L1 norm of the gradient term.
    max_iter : int or None
        Optional upper bound on the number of iterations. ``None`` (the
        default) means no bound.

    Returns
    -------
    numpy.ndarray
        Feature weights followed by the bias as the last element. If the
        descent diverges a warning is printed and the weights returned are
        not meaningful.
    """
    # Insert new column with ones (bias)
    regression = np.c_[x, np.ones(len(x))]
    targets = np.asarray(y)
    # One weight per column of the augmented matrix
    weights = np.ones(regression.shape[1])

    # Start from +inf so the first iteration always runs, whatever `e` is
    # (a sentinel of 1 would skip training entirely for e >= 1).
    norma = np.inf
    iterations = 0
    while norma > e and (max_iter is None or iterations < max_iter):
        residual = targets - regression @ weights
        partial = regression.T @ residual
        # L1 norm; abs() avoids the overflow that squaring large values causes
        norma = np.sum(np.abs(partial))

        weights = weights + lr * partial
        iterations += 1

        # Stop as soon as the gradient is inf or NaN instead of iterating on
        if not np.isfinite(norma):
            print('MODEL DIVERGED! USE LOWER LEARNING RATE!')
            break

    return weights

def predict(w, x):
    """Evaluate the linear model ``w`` on ``x``.

    ``w`` holds the feature weights followed by the bias as its last
    element. ``x`` may be a single sample (1-D) or one sample per row (2-D).
    """
    features = np.array(x)
    coefficients, intercept = w[:-1], w[-1]
    return coefficients @ features.T + intercept

def MSE(y, y_pred):
    """Return the sum of squared differences between ``y`` and ``y_pred``
    divided by ``len(y)`` (the mean squared error)."""
    residuals = y - y_pred
    squared_error_total = np.sum(np.square(residuals))
    return squared_error_total / float(len(y))

def MAE(y_true, y_pred):
    """Return the mean of the absolute differences between ``y_true`` and
    ``y_pred`` (the mean absolute error)."""
    absolute_errors = np.abs(y_true - y_pred)
    return np.mean(absolute_errors)

In [None]:
# Fit a linear regression on PCA-projected data and report its test RMSE
def fit_and_evaluate_pca_regression(X_train, y_train, X_test, y_test, num_components):
    """Return the test RMSE of a regression fitted on PCA scores.

    Both feature matrices are projected onto the first ``num_components``
    columns of the global ``eigenvectors`` matrix. The weights are fitted
    on the projected training data with ``fit_linear_regression`` and the
    root of the mean squared error on the projected test data is returned.
    """
    # First `num_components` eigenvector columns form the projection basis
    basis = eigenvectors[:, :num_components]
    train_scores, test_scores = (np.dot(split, basis) for split in (X_train, X_test))

    # Gradient-descent fit on the training scores, then predict the test set
    fitted_weights = fit_linear_regression(train_scores, y_train)
    test_predictions = predict(fitted_weights, test_scores)

    # RMSE is the square root of the MSE
    return np.sqrt(MSE(y_test, test_predictions))

In [None]:
# Try every possible number of principal components
component_numbers = range(1, len(eigenvalues) + 1)

# Test-set RMSE for each number of components. A comprehension keeps the
# loop variable local, so this sweep does not overwrite the global
# `num_components` selected earlier from the explained-variance threshold.
rmse_values = [
    fit_and_evaluate_pca_regression(X_train.to_numpy(), y_train, X_test.to_numpy(), y_test, n_components)
    for n_components in component_numbers
]

## 4. Plotting Number of Components vs RMSE

In [None]:
# Index (0-based) of the component count treated as the "stable" RMSE point.
# NOTE: this is a hard-coded choice, not computed from rmse_values.
min_rmse_index = 6
min_rmse = rmse_values[min_rmse_index]
stable_label = f'Stable RMSE: {min_rmse:.2f} (at {min_rmse_index + 1} components)'

# RMSE obtained for each number of principal components
plt.plot(component_numbers, rmse_values, marker='o')
plt.title('Number of Components vs RMSE')
plt.xlabel('Number of Components')
plt.ylabel('Root Mean Squared Error (RMSE)')

# Dashed vertical line at the chosen number of components
plt.axvline(x=min_rmse_index + 1, color='r', linestyle='--', label=stable_label)

plt.legend()
plt.savefig('2B2.png')
plt.show()

## 5. Testing the Most Efficient Model

In [None]:
# min_rmse_index is 0-based, the number of components is 1-based
optimal_num_components = min_rmse_index + 1

# Project both splits onto the first `optimal_num_components` eigenvector columns
selected_eigenvectors = eigenvectors[:, :optimal_num_components]
X_train_optimal_pca, X_test_optimal_pca = (
    np.dot(split.to_numpy(), selected_eigenvectors) for split in (X_train, X_test)
)

# Fit the regression weights by gradient descent on the projected training data
weights_optimal = fit_linear_regression(X_train_optimal_pca, y_train)

# Predict the target for one example: the first row of the projected test set
specific_point = X_test_optimal_pca[0]
y_pred = predict(weights_optimal, specific_point)

# Print the predicted y value
print("Predicted y value:", y_pred)

## 6. Conclusion and Analysis

**Interpretation of the Graph:**
The graph of the number of components vs RMSE provides valuable insights into the trade-off between model complexity and prediction accuracy. In the plot, we observed how the RMSE changes as the number of principal components increases. The key point of interest is where the RMSE reaches a minimum or starts stabilizing. This stable point represents the optimal number of components for building an efficient predictive model. In the plot, we identified this point and marked it with a red dashed line.

**Significance of Selecting an Appropriate Number of Components:**
Selecting an appropriate number of components is crucial for achieving a balance between model simplicity and predictive accuracy. Too few components may lead to underfitting, where the model fails to capture important patterns in the data. On the other hand, too many components can result in overfitting, where the model fits the training data too closely and fails to generalize well to new, unseen data.

The significance lies in finding the sweet spot where the model captures the essential information in the data while avoiding unnecessary complexity. The optimal number of components identified from the graph represents the model configuration that strikes this balance, offering a good compromise between accuracy and efficiency.

**Analysis of the Predicted Value (y_pred):**
After selecting the optimal model based on the number of components, we tested it by predicting the target for a specific data point. The predicted value (y_pred) represents the model's estimate of the target variable for that particular input. It is essential to analyze the significance of this prediction in the context of the specific application.

**Accuracy Assessment:**
We can compare the predicted value (y_pred) with the actual target value to assess the accuracy of the model. We have also calculated the Mean Absolute Error (MAE) over the test set to provide a more comprehensive evaluation.

In [None]:
# predict() accepts one sample per row, so score the whole projected test
# set in a single vectorised call instead of looping over rows in Python.
mae = MAE(y_test, predict(weights_optimal, X_test_optimal_pca))
print("Mean Absolute Error (MAE):", mae)