In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np

# Sample data
data = {
    'Temperature': [75, 80, 85, 70, 65, 60, 90],
    'Cloud Cover': ['Sunny', 'Partly Cloudy', 'Overcast', 'Sunny', 'Overcast', 'Partly Cloudy', 'Overcast'],
    'Humidity': ['Low', 'High', 'High', 'Medium', 'Medium', 'Low', 'High'],
    'Weather': ['Sunny', 'Sunny', 'Rainy', 'Sunny', 'Stormy', 'Sunny', 'Rainy']
}

# Create a DataFrame
df = pd.DataFrame(data)

# 1. Discretize the feature Temperature
temperature_bins = [0, 70, 80, 100]
temperature_labels = ['warm', 'hot', 'very hot']
df['Temperature_Category'] = pd.cut(df['Temperature'], bins=temperature_bins, labels=temperature_labels)

# 2. Calculate the Entropy of the target class Weather
def calculate_entropy(target_column):
    unique_values, counts = np.unique(target_column, return_counts=True)
    probabilities = counts / len(target_column)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

entropy_weather = calculate_entropy(df['Weather'])

# 3. Calculate Information Gain for each feature
def calculate_information_gain(data, feature, target):
    entropy_total = calculate_entropy(data[target])
    unique_values = data[feature].unique()
    weighted_entropy = 0
    
    for value in unique_values:
        subset = data[data[feature] == value]
        entropy_subset = calculate_entropy(subset[target])
        weight = len(subset) / len(data)
        weighted_entropy += weight * entropy_subset
    
    information_gain = entropy_total - weighted_entropy
    return information_gain

# Calculate Information Gain for each feature
information_gains = {}
features = ['Temperature_Category', 'Cloud Cover', 'Humidity']

for feature in features:
    information_gain = calculate_information_gain(df, feature, 'Weather')
    information_gains[feature] = information_gain

# 4. Order the features according to their IG
sorted_features = sorted(information_gains.items(), key=lambda x: x[1], reverse=True)

# Display results
print("Information Gains:")
for feature, ig in sorted_features:
    print(f"{feature}: {ig}")


Information Gains:
Temperature_Category: 0.9852281360342513
Cloud Cover: 0.9852281360342513
Humidity: 0.6995138503199656


In [3]:
import pandas as pd
import numpy as np

# Provided dataset
data = {
    'x': [12, 17, 12, 6, 17, 4],
    'y': [24, 15.5, 13, 13.5, 21, 20.3],
    'z': [6, -2, 3, -2.5, 7.2, -0.9],
}

# Creating a DataFrame
df = pd.DataFrame(data, index=['R1', 'R2', 'R3', 'R4', 'R5', 'R6'])

print("Step 0: Original Data")
print(df)

# Step 1: Calculate mean and standard deviation for each variable (X, Y, Z)
mean_values = df.mean()
std_dev_values = df.std()

print("\nStep 1: Mean Values")
print(mean_values)
print("\nStep 1: Standard Deviation Values")
print(std_dev_values)

# Step 2: Standardize the data using Z-score
standardized_data = (df - mean_values) / std_dev_values

print("\nStep 2: Standardized Data")
print(standardized_data)

# Step 3: Calculate the covariance matrix
covariance_matrix = np.cov(standardized_data, rowvar=False)

print("\nStep 3: Covariance Matrix")
print(covariance_matrix)

# Step 4: Find eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

print("\nStep 4: Eigenvalues")
print(eigenvalues)
print("\nStep 4: Eigenvectors")
print(eigenvectors)

# Step 5: Sort eigenvalues and corresponding eigenvectors
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nStep 5: Sorted Eigenvalues")
print(eigenvalues)
print("\nStep 5: Sorted Eigenvectors")
print(eigenvectors)

# Step 6: Decide how many principal components to keep
total_variance = np.sum(eigenvalues)
explained_variance_ratio = eigenvalues / total_variance

cumulative_explained_variance = np.cumsum(explained_variance_ratio)
num_components_to_keep = np.argmax(cumulative_explained_variance >= 0.9) + 1

print("\nStep 6: Explained Variance Ratio")
print(explained_variance_ratio)
print("\nStep 6: Cumulative Explained Variance")
print(cumulative_explained_variance)
print("\nStep 6: Number of Components to Keep")
print(num_components_to_keep)

# Step 7: Form the projection matrix V
projection_matrix = eigenvectors[:, :num_components_to_keep]

print("\nStep 7: Projection Matrix")
print(projection_matrix)

# Step 8: Calculate the reduced data matrix R
reduced_data = np.dot(standardized_data, projection_matrix)

print("\nStep 8: Reduced Data Matrix R")
print(reduced_data)


Step 0: Original Data
     x     y    z
R1  12  24.0  6.0
R2  17  15.5 -2.0
R3  12  13.0  3.0
R4   6  13.5 -2.5
R5  17  21.0  7.2
R6   4  20.3 -0.9

Step 1: Mean Values
x    11.333333
y    17.883333
z     1.800000
dtype: float64

Step 1: Standard Deviation Values
x    5.428321
y    4.510174
z    4.206186
dtype: float64

Step 2: Standardized Data
           x         y         z
R1  0.122813  1.356193  0.998529
R2  1.043908 -0.528435 -0.903431
R3  0.122813 -1.082737  0.285294
R4 -0.982502 -0.971877 -1.022304
R5  1.043908  0.691030  1.283823
R6 -1.350940  0.535826 -0.641912

Step 3: Covariance Matrix
[[1.         0.08686417 0.48527262]
 [0.08686417 1.         0.61189326]
 [0.48527262 0.61189326 1.        ]]

Step 4: Eigenvalues
[1.82462432 0.91547483 0.25990085]

Step 4: Eigenvectors
[[-0.46348371 -0.78498896 -0.41106591]
 [-0.55899398  0.61896667 -0.55173   ]
 [-0.68753806  0.0259345   0.72568507]]

Step 5: Sorted Eigenvalues
[1.82462432 0.91547483 0.25990085]

Step 5: Sorted Eigenvecto