# Breast Cancer Section

### NOTE: Music section is the second half of the code

<strong>Attribute Information:</strong>

- ID number
- Diagnosis (M = malignant, B = benign)

<strong>Ten real-valued features are computed for each cell nucleus:</strong>

- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry
- fractal dimension ("coastline approximation" - 1)
<br>



# 1.1 Loading Libraries and Utilities

## Importing all the libraries

In [1]:
import warnings 
warnings.filterwarnings("ignore")

# Base libraries
import os
import numpy as np
import pandas as pd
from IPython.display import display_html


## visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns


# stat tools
import statsmodels.api as sm
from scipy.stats import kurtosis, skew

## preprocessing & otherlibraries
from sklearn.model_selection import (train_test_split)


## data sampling and outlier detection libraries

from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

#from umap import UMAP
from sklearn.decomposition import PCA

# modeling
from sklearn.linear_model import (LogisticRegression, LinearRegression) 
from sklearn.svm import SVC

# metrics
from sklearn.metrics import (r2_score, 
                             accuracy_score)

## plot settings

# Global figure styling: white seaborn theme, 12pt tick labels, all four axis
# spines hidden, and a 14pt normal-weight default font.
sns.set_style('white')
mpl.rcParams.update({
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': False,
    'font.size': 14,
    'font.weight': 'normal',
})


## Loading Dataset

In [2]:
# Read the breast-cancer table from the CSV in the working directory.
df = pd.read_csv(
    'breast_cancer.csv',
    sep=',',
    encoding='utf-8',
)

Missing Values and NaN values

In [None]:
# Count missing values per column before any cleaning.
missing_per_column = df.isnull().sum()
print(missing_per_column)



In [None]:
# Drop the 'Unnamed: 32' column (presumably an empty artifact column — confirm
# against the null counts printed above). errors='ignore' keeps this cell safe
# to re-run once the column is already gone.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

# Encode the target as 0 = benign ('B'), 1 = malignant ('M'). Only map while the
# column still holds the raw labels: mapping an already-encoded column would
# turn every value into NaN on a second run of this cell.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

print(df.head(2).T)

In [None]:
# Summarise missing values: the grand total, then whether any column has some.
null_counts = df.isnull().sum()
print('\nTotal Number of Null values: \n' + str(null_counts.sum()))
print('\nAny NUll Value columns: \n' + str(null_counts.any()))

Descriptive Statistics of The Data

In [None]:
# Descriptive statistics for every column except the record id, one row per feature.
stats_df = (
    df.drop(columns=['id'])
      .describe()
      .T
      .reset_index()
      .rename(columns={'index': 'Features'})
)
stats_df['count'] = stats_df['count'].astype(int)

# All formatters go into ONE .format() call. With a dict formatter, columns
# missing from the dict are reset to the default formatter, so a second
# .format({"Features": ...}) call would undo the numeric formatting.
numeric_cols = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
formatters = {col: "{:20,.3f}" for col in numeric_cols}
formatters['Features'] = lambda x: x.upper()

style = (
    stats_df.style
    .set_table_attributes("style='display:inline'")
    .bar(subset=numeric_cols, axis=1, color='#fed766')
    .format(formatters)
    .set_properties(**{'background-color': 'white', 'color': 'black'})
)

# Display the styled DataFrame
display_html(style._repr_html_(), raw=True)

Color Palette for Visualizations

In [None]:

# Five-colour palette shared by the figures in this notebook.
colors = ['#fe4a49', '#2ab7ca', '#fed766', '#e6e6ea', '#f4f4f8']

sns.palplot(colors, size=3)

palette_fig = plt.gcf()
palette_fig.set_size_inches(15, 5)

# Heading and sub-heading above the swatches.
plt.text(-0.75, -0.75, 'Color Palette', {'fontfamily': 'serif', 'size': 24, 'weight': 'bold'})
plt.text(-0.75, -0.68, 'Lets try to stick to these colors throughout presentation.',
         {'fontfamily': 'serif', 'size': 16}, alpha=0.9)

# Write each hex code on top of its own swatch.
swatch_font = {'fontfamily': 'serif', 'size': 16, 'weight': 'bold', 'color': 'black'}
for position, hex_code in enumerate(colors):
    plt.text(position - 0.25, 0, hex_code, swatch_font, alpha=0.8)

palette_fig.set_facecolor('white')
plt.box(None)
plt.axis('off')
plt.show()


In [None]:
# Null accuracy: the accuracy of always predicting the more frequent class.
# diagnosis is 0/1, so its mean is the share of class 1.
malignant_share = df.diagnosis.values.mean()
benign_share = 1 - malignant_share
NUll_acc = round(max(malignant_share, benign_share), 2)

print('\nNull Accuracy Score: ' + str(NUll_acc) + '\n')
print('This is the baseline our model needs to cross.\n')


# 2. Data Exploration and Explanatory Analysis

My strategy for this analysis is as follows:
<ul>
    <li>Target Distribution</li>
    <li>Univariate Analysis</li>
    <li>Binary Feature Analysis</li>
    <li>Multivariate Analysis</li>
    <li>Class Segregation with Dimensionality Reduction </li>
</ul>


## 2.1 Distribution of Targets

In [None]:
# Prepare data
feat_df = df.drop(columns=['id', 'diagnosis'])
tar_df = df['diagnosis']

# value_counts() orders classes by frequency, so pin the order to 0 then 1.
# That keeps each percentage under the right bar label even if the class
# balance changes.
class_labels = ['Healthy', 'Cancerous']   # diagnosis 0, diagnosis 1
class_share = tar_df.value_counts(normalize=True).reindex([0, 1], fill_value=0)
cancer_dist = (class_share * 100).round(0)

# Create a bar plot
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(class_labels, cancer_dist.values, color=[colors[2], colors[0]])

# Add the percentage just below the top of each bar
for bar, percentage in zip(bars, cancer_dist.values):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() - 5,
        f'{percentage}%',
        ha='center',
        va='top',
        fontsize=12,
        fontweight='bold',
        color='white'
    )

# Titles and descriptive text
ax.set_title('How Susceptible Are Women To Breast Cancer?', fontsize=16, fontweight='bold')

# Pass the individual bar patches as handles: a bare list of labels would be
# matched against the single bar container and show only the first label.
ax.legend(bars.patches, class_labels, loc='upper right')

# Remove unnecessary axes for a cleaner look
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Display the plot
plt.show()

## 2.2 Univariate Analysis of Features

In [None]:
# 10 x 3 grid with one violin panel per feature column.
fig, ax = plt.subplots(nrows=10, ncols=3, figsize=(12, 24), dpi=80)
axes = ax.ravel()

for col, panel in zip(feat_df.columns, axes):

    # Skewness above 1 gets the "Skewed" colour, everything else the
    # "Relative Normal" colour used in the header legend below.
    if skew(feat_df[col]) > 1:
        color = colors[0]
    else:
        color = colors[1]

    sns.violinplot(data=feat_df, x=col, ax=panel, color=color, cut=0, inner='box', linewidth=3)

    # Turn e.g. 'radius_mean' into 'Radius Mean' for the axis label.
    xlabel = ' '.join(map(str.capitalize, col.split('_')))
    panel.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    panel.get_yaxis().set_visible(False)

plt.tight_layout(pad=3, h_pad=2.5, w_pad=2.5)

# Title plus a hand-built colour legend above the grid.
legend_font = dict(fontsize=16, fontweight='bold')
fig.suptitle('Overview of Univariate Feature Distribution', fontsize=22, fontweight='bold', x=0.5, y=1.02)
fig.text(0.65, 1, "Skewed", color=colors[0], **legend_font)
fig.text(0.73, 1, '|', **legend_font)
fig.text(0.74, 1, "Relative Normal", color=colors[1], **legend_font)

plt.show()

## 2.3 Univariate Analysis of Features wrt Targets

In [None]:
# 10 x 3 grid with one KDE panel per feature, split by diagnosis.
fig, ax = plt.subplots(nrows=10, ncols=3, figsize=(12, 24), dpi=80)
axes = ax.ravel()

# Loop through each feature column and its panel
for col, panel in zip(feat_df.columns, axes):

    # `fill=True` replaces the deprecated `shade=True`.
    # hue_order=[1, 0] pairs malignant (1) with colors[0] and benign (0) with
    # colors[2], matching the header legend below.
    sns.kdeplot(
        data=df, x=col, ax=panel, fill=True,
        palette=[colors[0], colors[2]],
        alpha=0.95, linewidth=3, ec='black',
        hue='diagnosis', hue_order=[1, 0],
        legend=False
    )

    # Plot settings
    xlabel = ' '.join([value.capitalize() for value in str(col).split('_')])
    panel.get_yaxis().set_visible(False)
    panel.set_xlabel(xlabel, fontsize=14, fontweight='bold')

# Adjust layout
plt.tight_layout(pad=3, h_pad=1.5, w_pad=1.5)

# Titles and additional text
fig.suptitle('Distribution of Cancer Cells on Feature Level', fontsize=22, fontweight='bold', x=0.5, y=1.03)

fig.text(0.615, 1, "Cancerous", fontsize=16, fontweight='bold', color=colors[0], alpha=1)
fig.text(0.73, 1, '|', fontsize=16, fontweight='bold')
fig.text(0.74, 1, "Healthy", fontsize=16, fontweight='bold', color=colors[2], alpha=1)

# Display the plot
plt.show()

## 2.4 Multivariate Analysis of Features In Same Category

Feature segregation based on mean, se, and worst

In [None]:
# Column-name prefixes (the text before the first '_') that mark a feature as a
# "measurement" or as a "characteristic".
measure_keyword = ['radius', 'perimeter', 'area', 'concavity', 'concave points']
character_keyword = ['texture', 'smoothness', 'compactness', 'symmetry', 'fractal']

# Six buckets, each seeded with 'diagnosis' so it can be used directly to
# select the target together with its features.
mean_measure, mean_character = ['diagnosis'], ['diagnosis']
se_measure, se_character = ['diagnosis'], ['diagnosis']
worst_measure, worst_character = ['diagnosis'], ['diagnosis']

# (prefix list, mean bucket, se bucket, fallback bucket) per feature family.
feature_groups = (
    (measure_keyword, mean_measure, se_measure, worst_measure),
    (character_keyword, mean_character, se_character, worst_character),
)

for col in feat_df.columns:
    name_list = str(col).split('_')

    for keywords, mean_bucket, se_bucket, worst_bucket in feature_groups:
        if name_list[0] not in keywords:
            continue
        if 'mean' in name_list:
            mean_bucket.append(col)
        elif 'se' in name_list:
            se_bucket.append(col)
        else:
            # Anything that is neither 'mean' nor 'se' is filed under "worst".
            worst_bucket.append(col)

# Print every bucket without its leading 'diagnosis' entry.
print('\nSeparated Features are stored into lists:\n')
summary_rows = (
    ('Mean of Measurements: ', mean_measure),
    ('Mean of Characteristics: ', mean_character),
    ('Standard Error of Measurements: ', se_measure),
    ('Standard Error of Characteristics: ', se_character),
    ('Worst of Measurements: ', worst_measure),
    ('Worst of Characteristics: ', worst_character),
)
for heading, members in summary_rows:
    print(heading + ', '.join(members[1:]) + '\n')

In [13]:
# The ten base measurements; each one has a `_mean`, `_se` and `_worst` column.
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

# 'diagnosis' followed by the mean columns
m_col = ['diagnosis'] + [f'{name}_mean' for name in base_features]

# 'diagnosis' followed by the standard-error columns
s_col = ['diagnosis'] + [f'{name}_se' for name in base_features]

# 'diagnosis' followed by the worst columns
w_col = ['diagnosis'] + [f'{name}_worst' for name in base_features]

In [None]:
# pairplot for mean columns
# hue_order=[1, 0] gives malignant (1) the first palette colour, colors[0] — the
# colour the other figures in this notebook use for "Cancerous".
sns.pairplot(df[m_col], hue='diagnosis', hue_order=[1, 0], palette=[colors[0], colors[1]])

In [None]:
# pairplot for se columns
# hue_order=[1, 0] gives malignant (1) the first palette colour, colors[0] — the
# colour the other figures in this notebook use for "Cancerous".
sns.pairplot(df[s_col], hue='diagnosis', hue_order=[1, 0], palette=[colors[0], colors[1]])

In [None]:
# pairplot for worst columns
# hue_order=[1, 0] gives malignant (1) the first palette colour, colors[0] — the
# colour the other figures in this notebook use for "Cancerous".
sns.pairplot(df[w_col], hue='diagnosis', hue_order=[1, 0], palette=[colors[0], colors[1]])

## 2.5 Correlation Based Multivariate Analysis

With the analysis of single-category features done, let's make things more interesting by bringing correlation into the picture. Let's find the highly positively correlated and the negatively correlated features and see how they relate to each other. This is crucial for understanding the collinearity of the data.

In [17]:
# Long-form correlation table: one row per ordered (level_0, level_1) column
# pair, with the correlation value in column 0.
temp_df = df.corr().unstack().reset_index()

# A pair is "cross-category" when the two columns differ and their name
# suffixes (the text after the last '_', e.g. mean / se / worst) differ too.
suffix_0 = temp_df['level_0'].astype(str).str.split('_').str[-1]
suffix_1 = temp_df['level_1'].astype(str).str.split('_').str[-1]
is_cross_category = (temp_df['level_0'] != temp_df['level_1']) & (suffix_0 != suffix_1)


def _unique_pairs(pairs):
    """Keep the first row of every unordered (level_0, level_1) pair.

    Returns an independent copy, so later edits never write into a slice of
    the long-form correlation table.
    """
    pair_key = pd.DataFrame(
        np.sort(pairs[['level_0', 'level_1']].astype(str).to_numpy(), axis=1),
        index=pairs.index,
    )
    return pairs.loc[~pair_key.duplicated(keep='first')].copy()


### cross relational positive features (correlation above 0.9)
positive_corr_df = _unique_pairs(temp_df[(temp_df[0] > 0.9) & is_cross_category])

### cross relational negative features (correlation below -0.2)
negative_corr_df = _unique_pairs(temp_df[(temp_df[0] < -0.2) & is_cross_category])

In [None]:
print('\nHelper function to visualize the cross categorical Feature analysis\n')
def plot_cross_scatter(corr_df, data =df,title = None,nrows = 4, ncols = 3, figsize = (12,24), colors = colors):
    """Draw one scatter panel per feature pair listed in ``corr_df``.

    Parameters
    ----------
    corr_df : DataFrame
        Must have ``level_0`` and ``level_1`` columns; each row names the x and
        y feature of one panel.
    data : DataFrame
        Source of the plotted columns; must also contain ``diagnosis``.
    title : str or None
        Heading written at the top-left of the figure.
    nrows, ncols : int
        Shape of the panel grid. Pairs beyond ``nrows * ncols`` are not drawn.
    figsize : tuple
        NOTE(review): accepted but not used — the figure is always created at
        (15, 20) inches, which is the size the header text positions below are
        laid out for. Kept unchanged so existing calls render as before.
    colors : list
        Palette; index 0 is used for diagnosis 1, index 2 for diagnosis 0.

    Returns
    -------
    None
    """
    col1_list = corr_df['level_0'].values.tolist()
    col2_list = corr_df['level_1'].values.tolist()

    # squeeze=False always yields a 2-D array of axes, even for a 1 x 1 grid.
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 20), squeeze=False)
    panels = axes.ravel()

    # Hide every panel that has no feature pair to show (not just the last one).
    for spare in panels[len(col1_list):]:
        spare.set_visible(False)

    for ax, col1, col2 in zip(panels, col1_list, col2_list):

        # `s` sets the marker area; seaborn's `size` is a grouping semantic,
        # not a marker size.
        sns.scatterplot(x=data[col1], y=data[col2], ax=ax, s=100,
                        linewidth=0.5, edgecolor='black',
                        hue=data['diagnosis'], hue_order=[1, 0],
                        palette=[colors[0], colors[2]], legend=False)

        ## plot setting: 'radius_mean' -> 'Radius Mean'
        xlabel = ' '.join([value.capitalize() for value in str(col1).split('_')])
        ylabel = ' '.join([value.capitalize() for value in str(col2).split('_')])

        ax.axes.set_xlabel(xlabel, {'font': 'serif', 'size': 14, 'weight': 'bold'}, alpha=1)
        ax.axes.set_ylabel(ylabel, {'font': 'serif', 'size': 14, 'weight': 'bold'}, alpha=1)

        ax.set_xticklabels('')
        ax.set_yticklabels('')

    ## titles and text
    fig.text(0.05, 0.935, '{}'.format(title), {'font': 'serif', 'size': 22, 'weight': 'bold'}, alpha=1)

    fig.text(0.63, 0.885, "Cancerous", {'font': 'serif', 'size': 16, 'weight': 'bold', 'color': colors[0]}, alpha=1)
    fig.text(0.735, 0.885, '|', {'font': 'serif', 'size': 16, 'weight': 'bold'})
    fig.text(0.745, 0.885, "Healthy", {'font': 'serif', 'size': 16, 'weight': 'bold', 'color': colors[2]}, alpha=1)

    # plt.show() rather than fig.show(): the latter only warns on non-GUI
    # (inline) backends.
    plt.show()

    return None


Positively Correlated CrossCategorical Features

In [None]:
# One panel per positively correlated cross-category pair, on the default grid.
plot_cross_scatter(
    corr_df=positive_corr_df,
    title='CrossCategorical Positively Related Features',
)

Negatively Correlated CrossCategorical Features

In [None]:
# One panel per negatively correlated cross-category pair, on a 4 x 2 grid.
plot_cross_scatter(
    corr_df=negative_corr_df,
    title='CrossCategorical Negitively Correlated Features',
    nrows=4,
    ncols=2,
    figsize=(12, 6),
)

In [None]:
# Select pairs of features for visualization
feature_pairs = [
    ("radius_mean", "texture_mean"),
    ("perimeter_mean", "area_mean"),
    ("concavity_mean", "concave points_mean")
]

# One colour per class, shared by the points AND the predicted regions so the
# two always agree: malignant (1) -> colors[0], benign (0) -> colors[1].
class_colors = {0: colors[1], 1: colors[0]}

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6), dpi=85)

y = df['diagnosis'].values

for ax, (feature_x, feature_y) in zip(axes, feature_pairs):

    # Prepare data for the selected pair of features
    X_pair = df[[feature_x, feature_y]].values

    # Fit logistic regression to the pair of features
    # (use SVC(kernel='linear') for a linear SVM instead).
    # max_iter is raised because the features are unscaled and the default of
    # 100 iterations can stop before convergence.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_pair, y)

    # Grid over the data range, padded by 5% of each feature's span so that
    # small-range features are not dwarfed by a fixed +/-1 margin.
    x_pad = 0.05 * (X_pair[:, 0].max() - X_pair[:, 0].min())
    y_pad = 0.05 * (X_pair[:, 1].max() - X_pair[:, 1].min())
    x_min, x_max = X_pair[:, 0].min() - x_pad, X_pair[:, 0].max() + x_pad
    y_min, y_max = X_pair[:, 1].min() - y_pad, X_pair[:, 1].max() + y_pad
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Predicted regions go in first, underneath the points. The levels bracket
    # the two predicted labels (0 and 1) unambiguously.
    ax.contourf(xx, yy, Z, levels=[-0.5, 0.5, 1.5],
                colors=[class_colors[0], class_colors[1]], alpha=0.3, zorder=0)

    # Scatter plot of data points
    sns.scatterplot(
        x=df[feature_x], y=df[feature_y], hue=df['diagnosis'],
        palette=class_colors, s=50, edgecolor='black', alpha=0.8, ax=ax, legend=False
    )

    # Set labels and titles
    ax.set_xlabel(" ".join(feature_x.split('_')).capitalize(), fontsize=12, fontweight='bold')
    ax.set_ylabel(" ".join(feature_y.split('_')).capitalize(), fontsize=12, fontweight='bold')
    ax.set_title(f"{feature_x} vs {feature_y}", fontsize=14, fontweight='bold')

fig.suptitle("Decision Boundaries for Selected Pairs of Features", fontsize=18, fontweight='bold')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# 3. Data Cleaning Techniques And Feature Engineering

## 3.1 Outliers And Influential Points

Outlier Detection and Helper Functions

In [22]:
def outlier_detect(algo, data):
    """Return ``data`` without the rows that ``algo`` flags as outliers.

    Parameters
    ----------
    algo : object
        Anything exposing ``fit_predict(features)`` that returns one label per
        row, with ``-1`` marking an outlier.
    data : DataFrame
        Must contain a ``diagnosis`` column. An ``id`` column, if present, is
        ignored; every other column is treated as a feature.

    Returns
    -------
    DataFrame
        The kept rows, with ``diagnosis`` as the first column followed by the
        feature columns in their original order. The row index is reset.
    """
    # Name the feature columns explicitly so the output labels cannot drift
    # when 'diagnosis' is not the first column of `data`.
    feature_cols = [c for c in data.columns if c not in ('id', 'diagnosis')]
    feat = data[feature_cols].values
    tar = data['diagnosis'].values

    # Fitting the features to algo; -1 marks an outlier
    yhat = algo.fit_predict(feat)
    mask = yhat != -1
    X, y = feat[mask, :], tar[mask]

    # Target first, then the features — matching the column labels below.
    data_inarray = np.append(y.reshape(-1, 1), X, axis=1)
    return pd.DataFrame(data=data_inarray, columns=['diagnosis'] + feature_cols)

def skew_sum(data):
    """Return the sum of the skewness values that ``skew`` computes for ``data``."""
    skew_values = skew(data)
    return skew_values.sum()

def kurtosis_sum(data):
    """Return the sum of the kurtosis values that ``kurtosis`` computes for ``data``."""
    kurtosis_values = kurtosis(data)
    return kurtosis_values.sum()

def shape(data):
    """Return the ``shape`` attribute of ``data``."""
    dimensions = data.shape
    return dimensions

In [None]:
# Detectors to compare. The two that accept random_state are seeded so the
# comparison is reproducible between runs.
outlier_algos = [
    IsolationForest(contamination=0.05, random_state=42),
    EllipticEnvelope(contamination=0.05, random_state=42),
    LocalOutlierFactor(contamination=0.05),
    DBSCAN(eps=70, min_samples=10)
]

# Baseline entry: the data with only the id column removed.
base_df = df.drop(columns=['id'])
df_list = [base_df]
shapes = [base_df.shape[0]]
skews = [skew(base_df)]
kurts = [kurtosis(base_df)]

# One cleaned copy (plus its row count, skewness and kurtosis) per detector.
for algo in outlier_algos:
    corrected_df = outlier_detect(algo, df)
    df_list.append(corrected_df)
    shapes.append(corrected_df.shape[0])
    skews.append(skew(corrected_df))
    kurts.append(kurtosis(corrected_df))

algorithms = ['Original', 'IsolationForest', 'EllipticEnvelope', 'LocalOutlierFactor', 'DBSCAN']
outliers_info = pd.DataFrame({
    'algorithms': algorithms,
    'df_list': df_list,
    'shapes': shapes,
    'skews': skews,
    'kurts': kurts
})

outliers_info['skews_sum'] = outliers_info['skews'].apply(lambda x: round(x.sum(), 2))
outliers_info['kurts_sum'] = outliers_info['kurts'].apply(lambda x: round(x.sum(), 2))

# Rows are deliberately left in insertion order (row 0 = Original, then the
# detectors in `algorithms` order). Sorting by row count would push the
# original data to the last row and break positional lookups such as
# outliers_info['df_list'][0].

# Score each dataset with a linear regression on a seeded 80/20 split.
for idx, df_ in enumerate(outliers_info['df_list']):
    X = df_.drop(columns=['diagnosis'])
    y = df_['diagnosis']
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    # Linear regression
    preds = LinearRegression().fit(xtrain.values, ytrain.values).predict(xtest.values)

    r2 = round(r2_score(ytest, preds), 3)
    outliers_info.loc[idx, 'r2_score'] = r2

print('\nAll the corrected data is stored in the Outliers_info DataFrame\n')

print(outliers_info.T)

Visualise the results of outlier detection

In [24]:
class OutlierViz:
    """Overlay original and outlier-corrected data in a shared 2-D PCA view.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    orig_feat : array-like
        Original data, rows x columns.
    corrected_feat : array-like
        Data after outlier removal, with the same columns as ``orig_feat``.
    """

    def __init__(self, ax, orig_feat=None, corrected_feat=None):
        self.x_org = orig_feat
        self.x_corr = corrected_feat
        self.ax = ax

    def visualize_data(self, name=None, r2=None, orig_r2=None):
        """Scatter both datasets on ``self.ax`` and annotate the panel.

        ``name``, ``r2`` and ``orig_r2`` are only written onto the panel as
        text; they do not affect what is plotted.
        """
        self.ax.set_facecolor('white')

        # Fit ONE projection on the original data and apply it to both
        # datasets. Two independently fitted PCAs produce different axes, so
        # their point clouds cannot be compared on the same panel.
        projector = PCA(n_components=2).fit(self.x_org)
        pca1 = projector.transform(self.x_org)
        pca2 = projector.transform(self.x_corr)

        # Plot original data points
        self.ax.scatter(
            pca1[:, 0], pca1[:, 1],
            color=colors[0], s=50, alpha=0.8, edgecolor='black', linewidth=0.5, label="Original Data"
        )

        # Plot corrected data points on top
        self.ax.scatter(
            pca2[:, 0], pca2[:, 1],
            color=colors[1], s=50, alpha=0.8, edgecolor='black', linewidth=0.5, label="Corrected Data"
        )

        # Name, R2 score and original R2 score, right-aligned in the panel's top-right corner
        self.ax.text(0.95, 0.9, f'{name}', transform=self.ax.transAxes, ha='right', fontsize=14, fontweight='bold')
        self.ax.text(0.95, 0.85, f'R2 Score: {r2}', transform=self.ax.transAxes, ha='right', fontsize=12)
        self.ax.text(0.95, 0.8, f'Orig R2 Score: {orig_r2}', transform=self.ax.transAxes, ha='right', fontsize=12)


In [None]:
fig, ax = plt.subplots(2, 2, figsize=(13, 9), dpi=70)
axes = ax.ravel()
for panel in axes:
    panel.set_xticklabels('')
    panel.set_yticklabels('')

# Select rows by algorithm name rather than by position, so every panel stays
# correctly labelled whatever order outliers_info is in.
info_by_algo = outliers_info.set_index('algorithms')
orig = info_by_algo.loc['Original', 'df_list']
orig_r2 = info_by_algo.loc['Original', 'r2_score']

# (row key in outliers_info, label written on the panel)
panel_specs = [
    ('IsolationForest', 'Isolation Forest'),
    ('EllipticEnvelope', 'Eclliptic Envelope'),
    ('LocalOutlierFactor', 'Local Outlier Factor'),
    ('DBSCAN', 'DBSCAN'),
]

# plotting
for panel, (algo_key, display_name) in zip(axes, panel_specs):
    viz = OutlierViz(
        ax=panel,
        orig_feat=orig,
        corrected_feat=info_by_algo.loc[algo_key, 'df_list'],
    )
    viz.visualize_data(
        name=display_name,
        r2=info_by_algo.loc[algo_key, 'r2_score'],
        orig_r2=orig_r2,
    )

fig.text(-0.05,1.085,'Outliers and Original Data', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)

# Header legend: the colours match OutlierViz, which draws the original data
# in colors[0] and the corrected data in colors[1].
fig.text(0.59,1, "Original Data",{'font':'serif','size':16, 'weight':'bold', 'color':colors[0]}, alpha = 1)
fig.text(0.73,1, '|',{'font':'serif','size':16, 'weight':'bold'})
fig.text(0.74,1, "Corrected Data",{'font':'serif','size':16, 'weight':'bold','color':colors[1]}, alpha = 1)

fig.tight_layout(pad = 1.5, w_pad = 1.5,h_pad = 1.5)
# plt.show() rather than fig.show(): the latter only warns on non-GUI (inline) backends.
plt.show()

As overall skewness and kurtosis are greatly reduced and the R2 score improved slightly with Isolation Forest, let's move on with this algorithm and explore feature-level skewness and kurtosis.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 12))
axes = ax.ravel()

# Select rows by algorithm name rather than by position, so "Original" and
# "IsolationForest" are the rows actually compared whatever order
# outliers_info is in.
info_by_algo = outliers_info.set_index('algorithms')
orig_skews = info_by_algo.loc['Original', 'skews'].tolist()
iso_skews = info_by_algo.loc['IsolationForest', 'skews'].tolist()
orig_kurts = info_by_algo.loc['Original', 'kurts'].tolist()
iso_kurts = info_by_algo.loc['IsolationForest', 'kurts'].tolist()

# Bar labels: every column except the id (the same columns skew/kurtosis were computed on).
feature_names = df.drop(columns=['id']).columns

# Plotting Skewness (left panel, mirrored so the bars grow towards the left)
axes[0].invert_xaxis()
axes[0].barh(
    y=feature_names,
    width=orig_skews,
    color=colors[0],
    align='center',
    label="Original"
)
axes[0].barh(
    y=feature_names,
    width=iso_skews,
    color=colors[1],
    align='center',
    label="Skewness"
)

# Plotting Kurtosis (right panel)
axes[1].barh(
    y=feature_names,
    width=orig_kurts,
    color=colors[0],
    align='center'
)
axes[1].barh(
    y=feature_names,
    width=iso_kurts,
    color=colors[2],
    align='center'
)

# Customizing labels and ticks: feature names are shown once, between the two panels
axes[0].set_yticklabels([])
axes[1].set_yticklabels(
    feature_names,
    fontdict={'fontfamily': 'serif', 'fontsize': 12, 'fontweight': 'bold'},
    rotation=0,
    ha='center'
)
axes[1].tick_params(axis='y', pad=75)
axes[0].set_xticklabels([])
axes[1].set_xticklabels([])

# Title and annotations
fig.text(0.1, 1.09, 'Feature Level Stats', fontsize=22, fontweight='bold', alpha=1)

# Legend for Skewness and Kurtosis
fig.text(0.27, 0.99, "Skewness", fontsize=18, fontweight='bold', color=colors[1])
fig.text(0.40, 0.99, '|', fontsize=18, fontweight='bold')
fig.text(0.45, 0.99, "Original", fontsize=18, fontweight='bold', color=colors[0])
fig.text(0.60, 0.99, '|', fontsize=18, fontweight='bold')
fig.text(0.62, 0.99, "Kurtosis", fontsize=18, fontweight='bold', color=colors[2])

# Adjust layout
plt.tight_layout(pad=1, h_pad=1, w_pad=1)
plt.show()

## 3.2 Correlation and Multi-Collinearity

By now the outliers are removed and no null values exist in our data. Now let's address collinearity in this section. Collinearity can be problematic in regression-based models, so, keeping accuracy and roc_auc scores aside, let's explore how to tackle this multicollinearity and how to select features.

Custom Correlation Matrix Visualization

In [None]:
# Correlation matrix in long form: one row per (level_0, level_1) column pair,
# with the correlation value in column 0.
corr_df = df.corr()
temp_df = corr_df.stack().reset_index()

# Remove self-pairs by NAME. Filtering on value != 1.0 would also discard any
# pair of distinct columns that happen to be perfectly correlated.
temp_df = temp_df[temp_df['level_0'] != temp_df['level_1']]

# Keep the first occurrence of every unordered pair, so (a, b) and (b, a)
# appear only once.
pair_key = pd.DataFrame(
    np.sort(temp_df[['level_0', 'level_1']].astype(str).to_numpy(), axis=1),
    index=temp_df.index,
)
temp_df = temp_df.loc[~pair_key.duplicated(keep='first')].reset_index(drop=True)


def _corr_color(value):
    """Map a correlation value to its palette colour.

    Below 0.25 -> colors[1]; from 0.25 up to (but excluding) 0.85 ->
    colors[2]; 0.85 and above -> colors[0]. A value of exactly 0.25 counts as
    moderate rather than falling through to the high band.
    """
    if value < 0.25:
        return colors[1]
    if value < 0.85:
        return colors[2]
    return colors[0]


# Assign colors based on correlation value
temp_df['color'] = temp_df[0].apply(_corr_color)

print('Correlation Matrix data ready for custom visualization...\n')
print(temp_df.head(2))

In [None]:
fig, ax = plt.subplots(figsize=(10, 6), dpi=85)

# Flip y-axis
ax.invert_yaxis()

# Scatter plot to represent correlation values: marker area scales with the
# correlation, colour comes from the precomputed 'color' column.
ax.scatter(
    x=temp_df['level_0'], y=temp_df['level_1'],
    s=temp_df[0] * 100, c=temp_df['color'],
    linewidth=1, edgecolor='black'
)

# Matplotlib places string categories along an axis in order of first
# appearance, so the tick labels are taken in that same order. (Ordering by
# value_counts() only matches this when every column has a distinct pair count.)
x_order = temp_df['level_0'].unique()
y_order = temp_df['level_1'].unique()

xticklabels = [' '.join(str(col).capitalize().split('_')) for col in x_order]
yticklabels = [' '.join(str(col).capitalize().split('_')) for col in y_order]

# Pin one tick per category, then apply the labels to the x and y axes
ax.set_yticks(range(len(y_order)))
ax.set_xticks(range(len(x_order)))
ax.set_yticklabels(yticklabels, fontdict={'fontfamily': 'serif', 'fontsize': 10, 'fontweight': 'bold', 'color': 'black'}, alpha=0.75)
ax.set_xticklabels(xticklabels, fontdict={'fontfamily': 'serif', 'fontsize': 10, 'fontweight': 'bold', 'color': 'black'}, rotation=90, alpha=0.75)

# Titles and descriptions
fig.text(0.1, 0.98, 'Correlation Matrix and Multi-colinearity', fontsize=20, fontweight='bold', alpha=1)

# Legend text for correlation levels
fig.text(0.37, 0.75, "High", fontsize=14, fontweight='bold', color=colors[0])
fig.text(0.45, 0.75, '|', fontsize=14, fontweight='bold')
fig.text(0.48, 0.75, "Moderate", fontsize=14, fontweight='bold', color=colors[2])
fig.text(0.62, 0.75, '|', fontsize=14, fontweight='bold')
fig.text(0.65, 0.75, "Least", fontsize=14, fontweight='bold', color=colors[1])

plt.show()

From the above correlation plot, we can see that features can be highly correlated, moderately correlated, or least correlated, based on the color scheme. Blank spaces indicate negative correlations, and multicollinearity exists in the data. There are plenty of highly correlated features, so we can safely say we have multicollinearity in our data. But what is collinearity? If two features are highly correlated, we say they are collinear; if the same occurs among multiple features, it is called multicollinearity. In a general sense, it is like two parallel lines: both have the same slope and never intersect. Likewise, if two features carry the same information up to some constant multiple or similar transformation, they can be called collinear features.

# 4. Model Training

In [None]:
# Number of rows per encoded diagnosis value.
class_counts = df['diagnosis'].value_counts()
print(class_counts)

In [None]:
from sklearn.model_selection import train_test_split

# splitting data
# 'id' is dropped together with the target: it is a record identifier, not a
# measurement, and must not be fed to the models as a feature.
X_train, X_test, y_train, y_test = train_test_split(
                df.drop(columns=['id', 'diagnosis']),
                df['diagnosis'],
                test_size=0.2,
                random_state=42)

print("Shape of training set:", X_train.shape)
print("Shape of test set:", X_test.shape)

In [None]:
from sklearn.impute import SimpleImputer
# Count NaNs before imputing. X_train is a DataFrame on the first run of this
# cell and a NumPy array afterwards (the imputer returns arrays), so both
# cases are handled.
if isinstance(X_train, pd.DataFrame):
    nan_before = X_train.isnull().sum().sum()
else:
    nan_before = np.isnan(X_train).sum()
print("NaN values in X_train before imputation:", nan_before)

# Mean imputation (or use 'median' if preferred): fitted on the training
# split only, then applied to both splits.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Check for NaN values after imputation
for split_name, split_values in (("X_train", X_train), ("X_test", X_test)):
    print(f"NaN values in {split_name} after imputation:", np.isnan(split_values).sum())

StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance. (Unit variance means dividing all the values by the standard deviation.)

## 4.1 Classification Models

Logistic Regression

In [32]:
# Logistic regression on the training split, predictions for the test split.
# max_iter is raised well above the default of 100: on unscaled features the
# solver can otherwise stop before converging, and the resulting warning is
# silenced by the notebook-wide warnings filter.
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)
predictions1 = logreg.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class report for the logistic regression predictions.
logreg_cm = confusion_matrix(y_test, predictions1)
logreg_report = classification_report(y_test, predictions1)

print("Confusion Matrix: \n", logreg_cm)
print('\n')
print(logreg_report)

In [None]:
# Test-set accuracy of the logistic regression model.
logreg_acc = accuracy_score(
    y_test,
    predictions1,
)
print(
    "Accuracy of the Logistic Regression Model is: ",
    logreg_acc,
)

Support Vector Machine

In [35]:
# RBF-kernel support vector classifier: fit on the training split, then
# predict the test split (fit returns the estimator, so the calls chain).
svc_model = SVC(kernel="rbf")
predictions5 = svc_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix and per-class report for the RBF SVM predictions.
svm_rbf_cm = confusion_matrix(y_test, predictions5)
svm_rbf_report = classification_report(y_test, predictions5)

print("Confusion Matrix: \n", svm_rbf_cm)
print("\n")
print(svm_rbf_report)

In [None]:
# Test-set accuracy of the RBF SVM.
svm_acc = accuracy_score(
    y_test,
    predictions5,
)
print(
    "Accuracy of SVM model (RBF) is: ",
    svm_acc,
)

In [38]:
# Polynomial-kernel support vector classifier: fit on the training split,
# then predict the test split (fit returns the estimator, so the calls chain).
svc_model1 = SVC(kernel="poly")
predictions6 = svc_model1.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix and per-class report for the polynomial SVM predictions.
svm_poly_cm = confusion_matrix(y_test, predictions6)
svm_poly_report = classification_report(y_test, predictions6)

print("Confusion Matrix: \n", svm_poly_cm)
print("\n")
print(svm_poly_report)

In [None]:
# Test-set accuracy of the polynomial SVM.
svm_acc1 = accuracy_score(
    y_test,
    predictions6,
)
print(
    "Accuracy of SVM model (Polynomial) is: ",
    svm_acc1,
)

## 4.2 Final Output

In [None]:
# Accuracies in order: logistic regression, RBF SVM, polynomial SVM.
for model_accuracy in (logreg_acc, svm_acc, svm_acc1):
    print(model_accuracy)

In [None]:
# Horizontal bars: one per model, bar length = test accuracy.
model_acc = [logreg_acc, svm_acc, svm_acc1]
model_name = ['LogisticRegression', 'SVM_rbf', 'SVM_poly']

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=model_acc, y=model_name, palette='magma', ax=ax)

# Music Genre Classification

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
import random

In [44]:
# Load the music feature table; the relative path is resolved against the
# notebook's working directory.
music_csv_path = 'music_features.csv'
df = pd.read_csv(music_csv_path)

In [None]:
# Count the missing values in each column of the music table.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Encode the genre strings in `label` as integer class codes for the classifiers.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# LabelEncoder numbers each class by its position in `classes_`, so enumerating
# that array yields the code -> genre lookup directly. The keys are plain Python
# ints, which keeps the printed dict free of NumPy scalar reprs.
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mapping:", label_mapping)

In [None]:
# Pairwise correlations between every column except the file name and the raw
# genre string.
data_for_corr = df.drop(columns=['filename', 'label'])
corr_matrix = data_for_corr.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Music Features')
plt.show()

In [None]:
# Scatter of spectral centroid against spectral bandwidth, one colour per genre.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='spectral_centroid', y='spectral_bandwidth',
                hue='label', ax=ax)
ax.set_title('Spectral Centroid vs Spectral Bandwidth')
plt.show()

In [None]:
# Box plot of the first MFCC for each genre.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='label', y='mfcc1', ax=ax)
ax.set_title('Distribution of MFCC1 by Genre')
# Rotate the genre tick labels to vertical.
plt.xticks(rotation=90)
plt.show()

In [50]:
# Columns to z-score: seven summary descriptors followed by mfcc1..mfcc20.
summary_features = [
    'tempo', 'chroma_stft', 'rmse', 'spectral_centroid',
    'spectral_bandwidth', 'rolloff', 'zero_crossing_rate',
]
mfcc_features = [f'mfcc{i}' for i in range(1, 21)]
features_to_standardize = summary_features + mfcc_features

# Replace each listed column in df with (value - column mean) / column std.
# NOTE(review): the mean and std are taken over every row of df, while the
# train/test split only happens in later cells, so test rows contribute to the
# scaling statistics.
# NOTE(review): 'zero_crossing_rate' is scaled here but is absent from the
# feature lists the modelling cells select -- confirm that is intended.
for feature in features_to_standardize:
    column = df[feature]
    df[feature] = (column - column.mean()) / column.std()

In [None]:
# Logistic regression over the music features, evaluated per genre.
feature_columns = (
    ['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
    + [f'mfcc{i}' for i in range(1, 21)]
)
X = df[feature_columns]
y = df['label_encoded']

# 80/20 split, stratified on the encoded genre, with a fixed seed so the
# partition is reproducible.
X_train, X_test, y_train, y_test_logistic = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `multi_class` is left at its default: with the default lbfgs solver and a
# multiclass target scikit-learn already fits a multinomial model, and the
# explicit argument is deprecated in recent releases.
lm = LogisticRegression(class_weight='balanced', max_iter=1000)
lm.fit(X_train, y_train)

y_pred_logistic = lm.predict(X_test)

print("Logistic Regression Results on Test Set")
# One precision / recall / F-score / support entry per class code, ordered by
# the sorted unique codes so they can be indexed by `label` below.
p, r, f, s = precision_recall_fscore_support(y_test_logistic, y_pred_logistic, labels=np.unique(y))
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

In [None]:
# Linear-kernel SVM over the music features, evaluated per genre.
base_features = ['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
mfcc_columns = [f'mfcc{i}' for i in range(1, 21)]
X = df[base_features + mfcc_columns]
y = df['label_encoded']

# 80/20 split, stratified on the encoded genre, with a fixed seed.
X_train, X_test, y_train, y_test_svm_linear = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `fit` returns the estimator, so construction and training are chained.
clf = svm.SVC(kernel='linear', class_weight='balanced').fit(X_train, y_train)

y_pred_svm_linear = clf.predict(X_test)

print("SVM - Linear Results on Test Set")
# Per-class metrics, ordered by the sorted unique class codes.
p, r, f, s = precision_recall_fscore_support(
    y_test_svm_linear, y_pred_svm_linear, labels=np.unique(y)
)
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

In [None]:
# RBF-kernel SVM with 'balanced' class weights, evaluated per genre.
base_features = ['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
mfcc_columns = [f'mfcc{i}' for i in range(1, 21)]
X = df[base_features + mfcc_columns]
y = df['label_encoded']

# 80/20 split, stratified on the encoded genre, with a fixed seed.
X_train, X_test, y_train, y_test_svm_balanced = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `fit` returns the estimator, so construction and training are chained.
clf = svm.SVC(kernel='rbf', class_weight='balanced').fit(X_train, y_train)

y_pred_svm_balanced = clf.predict(X_test)

print("SVM - RBF Results on Test Set with Balanced Weights:")
# Per-class metrics, ordered by the sorted unique class codes.
p, r, f, s = precision_recall_fscore_support(
    y_test_svm_balanced, y_pred_svm_balanced, labels=np.unique(y)
)
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

In [None]:
# RBF-kernel SVM with hand-picked class weights, evaluated per genre.
feature_columns = (
    ['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
    + [f'mfcc{i}' for i in range(1, 21)]
)
X = df[feature_columns]
y = df['label_encoded']

# Genres that get extra weight; every other genre keeps weight 1. The weights
# are keyed by genre name and translated through the encoder-derived
# `label_mapping`, so they follow the genres rather than fixed integer codes.
upweighted_genres = {'reggae': 2, 'rock': 2}
custom_weights = {
    int(code): upweighted_genres.get(genre, 1)
    for code, genre in label_mapping.items()
}

# 80/20 split, stratified on the encoded genre, with a fixed seed.
X_train, X_test, y_train, y_test_svm_rbf = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = svm.SVC(kernel='rbf', class_weight=custom_weights)
clf.fit(X_train, y_train)

y_pred_svm_rbf = clf.predict(X_test)

print("SVM - RBF Results on Test Set with Custom Weights:")
# Per-class metrics, ordered by the sorted unique class codes. `label_mapping`
# is the one built from the LabelEncoder; it is deliberately not re-declared
# here, so the report cannot drift from the actual encoding.
p, r, f, s = precision_recall_fscore_support(y_test_svm_rbf, y_pred_svm_rbf, labels=np.unique(y))
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

In [None]:
# Display names used as tick labels on both axes of every heatmap.
target_names = ['Blues', 'Classical', 'Country', 'Disco', 'Hiphop', 'Jazz',
                'Metal', 'Pop', 'Reggae', 'Rock']

# One confusion matrix per model, each built from that model's own test labels
# and predictions.
mat_logistic = confusion_matrix(y_test_logistic, y_pred_logistic)
mat_svm_linear = confusion_matrix(y_test_svm_linear, y_pred_svm_linear)
mat_svm_rbf_balanced = confusion_matrix(y_test_svm_balanced, y_pred_svm_balanced)
mat_svm_rbf = confusion_matrix(y_test_svm_rbf, y_pred_svm_rbf)


def plot_confusion_heatmap(matrix, title):
    """Draw one annotated confusion-matrix heatmap in its own figure.

    Args:
        matrix: Confusion matrix as returned by ``confusion_matrix``
            (rows are true labels, columns are predicted labels).
        title: Title shown above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    # The matrix is transposed so that the x-axis carries the true label and
    # the y-axis the predicted label, matching the axis captions below.
    sns.heatmap(matrix.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=target_names, yticklabels=target_names, ax=ax)
    ax.set_xlabel('True Label')
    ax.set_ylabel('Predicted Label')
    ax.set_title(title)
    plt.show()


for matrix, title in [
    (mat_logistic, 'Confusion Matrix - Logistic Regression'),
    (mat_svm_linear, 'Confusion Matrix - SVM (Linear Kernel)'),
    (mat_svm_rbf_balanced, 'Confusion Matrix - SVM (RBF Kernel) Balanced'),
    (mat_svm_rbf, 'Confusion Matrix - SVM (RBF Kernel) Weighted'),
]:
    plot_confusion_heatmap(matrix, title)