In [25]:
import numpy as np
import pandas as pd
import sklearn
import plotly.express as px 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


Task 1

In [26]:
# Load the mammographic-masses CSV.
#
# NOTE: the DataFrame is deliberately still bound to the name `pd`, because the
# cells that follow reference it under that name (`pd.describe()`,
# `pd.loc[...]`, `pd.corr()`, ...). That binding shadows the
# `import pandas as pd` alias, so the read goes through an unaliased `pandas`
# handle: the cell can be re-run even after `pd` has become the DataFrame
# (the original `pd.read_csv` raised AttributeError on a second run).
# TODO: rename the DataFrame across the notebook (e.g. `masses_raw`) so `pd`
# can stay the pandas module everywhere.
import pandas

pd = pandas.read_csv("mammographic_masses_data.csv")

Task 2

In [27]:
# Summary statistics for the numeric columns.
# `pd` is the DataFrame bound in the load cell, not the pandas module.
summary_stats = pd.describe()
summary_stats

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [28]:
# Margin values of the rows whose Severity equals 1.
is_severity_one = pd['Severity'].eq(1)
print(pd.loc[is_severity_one, 'Margin'])

0      5.0
1      1.0
2      5.0
4      5.0
8      5.0
      ... 
951    5.0
952    4.0
955    4.0
957    5.0
959    5.0
Name: Margin, Length: 445, dtype: float64


In [29]:
# Three exploratory views of the loaded frame (`pd` is the DataFrame here).

# 1. Age against Shape, one marker per row, coloured by Severity.
scatter_fig = px.scatter(
    pd,
    x='Age',
    y='Shape',
    color='Severity',
    title='Scatter Plot of Age vs. Shape',
)
scatter_fig.show()

# 2. Pairwise column correlations rendered as a heatmap.
correlation_matrix = pd.corr()
heatmap_fig = px.imshow(
    correlation_matrix,
    title='Correlation Matrix',
    color_continuous_scale='Viridis',
)
heatmap_fig.show()

# 3. Histogram of Age, split by Severity.
distribution_fig = px.histogram(
    pd,
    x='Age',
    color='Severity',
    title='Age Distribution by Severity',
)
distribution_fig.show()

Task 3

In [30]:
# The same Age/Shape scatter drawn twice: once on the frame as loaded, once
# after discarding every row that contains a missing value.
shared_axes = dict(x='Age', y='Shape', color='Severity')

# Plot 1: frame as loaded (missing values still present)
scatter_fig_uncleaned = px.scatter(
    pd, title='Uncleaned Data Scatter Plot', **shared_axes
)
scatter_fig_uncleaned.show()

# Plot 2: rows with any NaN removed
cleaned_pd = pd.dropna()
scatter_fig_cleaned = px.scatter(
    cleaned_pd, title='Cleaned Data Scatter Plot', **shared_axes
)
scatter_fig_cleaned.show()


In [31]:
# Columns that get min-max scaled.
columns_to_normalize = ['Age', 'Shape']


def normalize_column(column):
    """Min-max scale ``column`` so its values fall in the range [0, 1].

    Parameters
    ----------
    column : pandas.Series or array-like supporting ``.min()``, ``.max()``,
        ``.astype()`` and element-wise arithmetic.

    Returns
    -------
    Same kind of object as ``column``, holding
    ``(column - min) / (max - min)``.

    A constant column has ``max == min``; dividing by that zero range would
    turn every entry into NaN, so such a column is mapped to all zeros
    instead (the same convention scikit-learn's ``MinMaxScaler`` uses).
    """
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Zero spread: avoid 0/0 and return zeros as floats, matching the
        # dtype the normal branch would produce.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Build a new frame with the selected columns rescaled; `cleaned_pd` itself
# is left untouched.
normalized_pd = cleaned_pd.assign(
    **{
        name: normalize_column(cleaned_pd[name])
        for name in columns_to_normalize
    }
)

age_shape_axes = dict(x='Age', y='Shape', color='Severity')

# Plot 1: the frame as loaded, for reference
scatter_fig_original = px.scatter(
    pd, title='Uncleaned Data Scatter Plot', **age_shape_axes
)
scatter_fig_original.show()

# Plot 2: the cleaned frame after min-max scaling
scatter_fig_normalized = px.scatter(
    normalized_pd, title='Normalized Data Scatter Plot', **age_shape_axes
)
scatter_fig_normalized.show()

Task 4

In [32]:
import pandas as pd
import plotly.express as px
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Requires `normalized_pd` from the normalization cell.

# "BA" is the target; every other column is a candidate feature.
y = normalized_pd['BA']
X = normalized_pd.drop(columns='BA')

# Keep the two features that score highest under f_classif.
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit_transform(X, y)

# One violin plot per selected feature, with the target on the y-axis.
# The selected columns are added to `df` one at a time, just before plotting.
df = pd.DataFrame({'Target': y})
for number, values in enumerate(X_new.T, start=1):
    feature_label = f'Feature {number}'
    df[feature_label] = values
    fig = px.violin(
        df,
        x=feature_label,
        y='Target',
        box=True,
        points="all",
        title=f'Violin Plot of {feature_label} vs. Target',
    )
    fig.show()


In [33]:
# PCA on the features kept by SelectKBest (`X_new`), using as many components
# as there are selected features.
n_components = X_new.shape[1]

# Fit PCA with the specified number of components
pca = PCA(n_components=n_components)
pca.fit(X_new)

# Project the data into PCA space
X_pca = pca.transform(X_new)

# Frame with one column per principal component, plus the BA label for colour.
df_pca = pd.DataFrame({f'PC{i+1}': X_pca[:, i] for i in range(n_components)})
# Assign BA by position, not by index: `df_pca` has a fresh 0..n-1 index while
# `normalized_pd` keeps the gapped index left behind by dropna(). Index
# alignment would attach labels to the wrong rows and inject NaNs.
df_pca['BA'] = normalized_pd['BA'].to_numpy()
fig = px.scatter(df_pca, x='PC1', y='PC2', color='BA',
                 title=f'Scatter Plot of PCA (n_components={n_components})')

# Overlay one line per row of `pca.components_.T`, drawn from the data mean.
eigenvectors = pca.components_.T  # row j -> (weight in PC1, weight in PC2, ...)
mean_point = df_pca.mean()  # per-column mean; only PC1/PC2 are used below

# Line colours; cycled if there are more lines than colours
colors = ['red', 'green', 'blue']

scale_factor = 3  # Adjust this factor to control the length of the lines

# Loop variables are deliberately NOT called x / y: `y` is the target series
# defined in the feature-selection cell and must not be overwritten here.
for i, (weight_pc1, weight_pc2) in enumerate(zip(eigenvectors[:, 0], eigenvectors[:, 1])):
    # Start and end are both expressed relative to the same (PC1, PC2) mean;
    # the original offset the end-point by mean_point[f'PC{i+1}'] on both axes.
    fig.add_shape(
        type='line',
        x0=mean_point['PC1'],
        y0=mean_point['PC2'],
        x1=mean_point['PC1'] + weight_pc1 * scale_factor,
        y1=mean_point['PC2'] + weight_pc2 * scale_factor,
        line=dict(color=colors[i % len(colors)], width=2),
        name=f'Eigenvector {i+1}'
    )

fig.show()

# Bar chart of the explained variance ratio of each principal component
explained_variance_ratio = pca.explained_variance_ratio_
df_variance = pd.DataFrame({'Principal Component': range(1, n_components + 1),
                                'Explained Variance Ratio': explained_variance_ratio})
fig_variance = px.bar(df_variance, x='Principal Component', y='Explained Variance Ratio',
                           title=f'Explained Variance Ratio for Each Principal Component (n_components={n_components})')
fig_variance.show()

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['SVD1', 'BA'] but received: SVD2