In [9]:
import numpy as np
import pandas as pd
import sklearn
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


Task 1

In [10]:
# Load the dataset from a CSV file given as a relative path, so the file must
# be present in the directory the notebook is run from.
db = pd.read_csv("mammographic_masses_data.csv")

Task 2

In [11]:
# Summary statistics for the numeric columns. `count` is the number of
# non-null entries, so columns with a lower count contain missing values.
db.describe()

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [12]:
# Show the Margin values of the rows whose Severity equals 1.
severity_is_one = db['Severity'] == 1
print(db.loc[severity_is_one, 'Margin'])

0      5.0
1      1.0
2      5.0
4      5.0
8      5.0
      ... 
951    5.0
952    4.0
955    4.0
957    5.0
959    5.0
Name: Margin, Length: 445, dtype: float64


In [13]:
# 1. Scatter Plot -- Age against Shape, coloured by the Severity column
scatter_fig = px.scatter(
    db,
    x='Age',
    y='Shape',
    color='Severity',
    title='Scatter Plot of Age vs. Shape',
)
scatter_fig.show()

# 2. Heatmap/Correlation Matrix -- pairwise correlations between db's columns
correlation_matrix = db.corr()
heatmap_fig = px.imshow(
    correlation_matrix,
    color_continuous_scale='Viridis',
    title='Correlation Matrix',
)
heatmap_fig.show()

# 3. Distribution Plot -- one histogram per column of db, split by Severity
for column in db.columns:
    distribution_fig = px.histogram(
        db,
        x=column,
        color='Severity',
        title=column + ' Distribution by Severity',
    )
    distribution_fig.show()

Task 3

In [14]:
# Drop every row that has at least one missing value. `dropna()` returns a new
# frame, so `db` itself is left untouched and the cell can be re-run safely.
cleaned_db = db.dropna()

# Summary statistics before and after removing the incomplete rows.
print(db.describe())
print()
print(cleaned_db.describe())

# For each column show the raw histogram followed by the cleaned one. The
# label in the title says which frame a figure came from; without it the two
# figures of a pair are indistinguishable.
for column in db.columns:
    for label, frame in (('Raw', db), ('Cleaned', cleaned_db)):
        distribution_fig = px.histogram(
            frame,
            x=column,
            color='Severity',
            title=column + ' Distribution by Severity (' + label + ')',
        )
        distribution_fig.show()

               BA         Age       Shape      Margin     Density    Severity
count  959.000000  956.000000  930.000000  913.000000  885.000000  961.000000
mean     4.300313   55.487448    2.721505    2.796276    2.910734    0.463059
std      0.683469   14.480131    1.242792    1.566546    0.380444    0.498893
min      0.000000   18.000000    1.000000    1.000000    1.000000    0.000000
25%      4.000000   45.000000    2.000000    1.000000    3.000000    0.000000
50%      4.000000   57.000000    3.000000    3.000000    3.000000    0.000000
75%      5.000000   66.000000    4.000000    4.000000    3.000000    1.000000
max      6.000000   96.000000    4.000000    5.000000    4.000000    1.000000

               BA         Age       Shape      Margin     Density    Severity
count  830.000000  830.000000  830.000000  830.000000  830.000000  830.000000
mean     4.338554   55.781928    2.781928    2.813253    2.915663    0.485542
std      0.660689   14.671782    1.242361    1.567175    0.3509

In [19]:
# Columns that get min-max scaled.
columns_to_normalize = ['Age']

# Manual normalization function
def normalize_column(column):
    """Min-max scale ``column`` so its values span [0, 1].

    Parameters
    ----------
    column : pandas.Series (or any array-like offering ``.min()``, ``.max()``
        and ``.astype``) of numeric values.

    Returns
    -------
    Same kind of object as ``column`` with ``(value - min) / (max - min)``
    applied element-wise. When every value is identical (``max == min``) the
    result is all zeros instead of the NaNs a 0/0 division would produce.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: there is no spread to scale, so map everything to 0.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Normalize on a copy so `cleaned_db` keeps the original scale and can be
# plotted next to the normalized data below.
normalized_pd = cleaned_db.copy()
for column in columns_to_normalize:
    normalized_pd[column] = normalize_column(normalized_pd[column])


print(normalized_pd.describe())
for column in columns_to_normalize:
    # Plot 1: cleaned data on its original scale (before normalization)
    distribution_fig = px.histogram(cleaned_db, x=column, color='Severity', nbins=20, title=column + ' Distribution by Severity (Before Normalization)')
    distribution_fig.show()
    # Plot 2: the same column after min-max normalization.
    # It is a histogram; the variable name is kept as it was in case other cells use it.
    scatter_fig_cleaned = px.histogram(normalized_pd, x=column, color='Severity', nbins=20, title=column + ' Distribution by Severity (After Normalization)')
    scatter_fig_cleaned.show()

               BA         Age       Shape      Margin     Density    Severity
count  830.000000  830.000000  830.000000  830.000000  830.000000  830.000000
mean     4.338554    0.484384    2.781928    2.813253    2.915663    0.485542
std      0.660689    0.188100    1.242361    1.567175    0.350936    0.500092
min      0.000000    0.000000    1.000000    1.000000    1.000000    0.000000
25%      4.000000    0.358974    2.000000    1.000000    3.000000    0.000000
50%      4.000000    0.500000    3.000000    3.000000    3.000000    0.000000
75%      5.000000    0.615385    4.000000    4.000000    3.000000    1.000000
max      6.000000    1.000000    4.000000    5.000000    4.000000    1.000000


Task 4

In [20]:
import pandas as pd
import plotly.express as px
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Split the data into X (every column except the target) and y (the target)
X = normalized_pd.loc[:, normalized_pd.columns != "Severity"]
y = normalized_pd['Severity']

# Select the best 3 features, scored with f_classif against the target
selector = SelectKBest(f_classif, k=3)
X_new = selector.fit_transform(X, y)
selected_feature_indices = selector.get_support(indices=True)

# Create a DataFrame with the selected features and target variable.
# Selecting the columns explicitly and copying gives an independent frame, so
# adding 'Severity' below cannot touch X.
selected_features = X.columns[selected_feature_indices]
selected_features_df = X[selected_features].copy()
selected_features_df['Severity'] = y

# Create histogram plots for the selected features
for feature in selected_features:
    distribution_fig = px.histogram(selected_features_df, x=feature, color='Severity', title=feature + ' Distribution by Severity')
    distribution_fig.show()


In [25]:
# Project the selected features onto two principal components.
pca = PCA(n_components=2)
# Cast the target to str so it is passed to plotly as non-numeric labels
# (plotly colours non-numeric values as discrete categories).
selected_features_df['Severity'] = selected_features_df['Severity'].astype(str)
components = pca.fit_transform(selected_features_df[selected_features])
# Loadings: component directions scaled by the square root of the variance
# each component explains; one row per feature, one column per component.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# `components` is a plain ndarray, so it has no 'Component 1'/'Component 2'
# columns for px.scatter to look up. Wrap it in a DataFrame that does.
# Severity is attached positionally (to_numpy) because this frame has a fresh
# 0..n-1 index while selected_features_df keeps its original row labels.
pca_df = pd.DataFrame(components, columns=['Component 1', 'Component 2'])
pca_df['Severity'] = selected_features_df['Severity'].to_numpy()
fig = px.scatter(pca_df, x='Component 1', y='Component 2', color='Severity')

for i, feature in enumerate(selected_features):
    # Arrow from the origin to this feature's loading vector.
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=loadings[i, 0],
        y=loadings[i, 1],
        showarrow=True,
        arrowsize=2,
        arrowhead=2,
        xanchor="right",
        yanchor="top"
    )
    # Feature name placed at the arrow tip.
    fig.add_annotation(
        x=loadings[i, 0],
        y=loadings[i, 1],
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
        yshift=5,
    )
fig.show()



      BA  Shape  Margin Severity
0    5.0    3.0     5.0        1
2    5.0    4.0     5.0        1
3    4.0    1.0     1.0        0
8    5.0    1.0     5.0        1
10   5.0    1.0     4.0        1
..   ...    ...     ...      ...
956  4.0    2.0     1.0        0
957  4.0    4.0     5.0        1
958  4.0    4.0     5.0        0
959  5.0    4.0     5.0        1
960  4.0    3.0     3.0        0

[830 rows x 4 columns]


In [27]:
# Reduce the selected features to two components with truncated SVD.
tsvd = TruncatedSVD(n_components=2, random_state=0)

# Fit and transform the selected features
components = tsvd.fit_transform(selected_features_df[selected_features])

# Create a DataFrame for the components (it gets a fresh 0..n-1 index)
components_df = pd.DataFrame(data=components, columns=['Component 1', 'Component 2'])

# Add the 'Severity' column back to the DataFrame.
# Assign by position, not by index: selected_features_df still carries the
# row labels left after dropna (0, 2, 3, 8, ...), so a label-aligned
# assignment would attach severities to the wrong rows and leave NaNs.
# astype(str) keeps the colouring discrete even when this cell is run without
# the earlier cell that converts the column.
components_df['Severity'] = selected_features_df['Severity'].astype(str).to_numpy()

# Create a scatter plot
fig = px.scatter(components_df, x='Component 1', y='Component 2', color='Severity')

# Show the plot
fig.show()