In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from the CSV stored relative to the notebook's working
# directory, then display the column labels as a first sanity check.
df = pd.read_csv("../data/data.csv")
df.columns

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artefact of the CSV
# export — TODO confirm against the raw file).
# Reassigning instead of dropping in place, together with errors='ignore',
# keeps this cell re-runnable: the in-place version raised KeyError the second
# time it was executed because the column was already gone.
df = df.drop(columns='Unnamed: 32', errors='ignore')
df.columns

In [None]:
# Print a per-column summary of the frame (dtype and non-null count).
df.info()

In [None]:
# Display descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Number of rows per diagnosis label.
df["diagnosis"].value_counts()

In [None]:
def count_plot(df:pd.DataFrame, column:str, save_path:str='../charts/count_plot.jpg') -> None:
    """Draw, save and show a bar chart of how often each value of ``column`` occurs.

    Args:
        df: Frame holding the data to count.
        column: Name of the column whose values are counted (x axis).
        save_path: File the figure is written to. The default is the location
            this function previously always wrote to, so existing calls behave
            as before; pass a distinct path per column to stop successive
            charts from overwriting each other.

    Returns:
        None. The figure is written to ``save_path`` and displayed.
    """
    plt.figure(figsize=(10, 8))
    sns.countplot(data=df, x=column)
    plt.xlabel(f'{column}', fontsize=17)
    plt.ylabel("Count", fontsize=17)
    plt.title(f'\nDistribution of {column}\n', size=20, fontweight='bold')
    # Write the file before show(): the figure may no longer be available
    # for saving once it has been displayed.
    plt.savefig(save_path)
    plt.show()

In [None]:
# Class balance of the target column.
count_plot(df, column='diagnosis')

In [None]:
def encoding_data(df):
    """Label-encode every non-numeric column of ``df`` in place.

    Columns that already hold numbers are left untouched. Any numeric dtype
    counts (int32, float32, unsigned and nullable numeric types included), not
    just int64/float64; previously a float32 or int32 column was run through
    the label encoder and its values were replaced by rank codes.
    Boolean columns are still encoded, as before.

    Args:
        df: Frame to encode. It is modified in place; pass a copy to keep the
            original values.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    for column in df.columns:
        values = df[column]
        is_bool = pd.api.types.is_bool_dtype(values)
        # is_numeric_dtype also reports True for booleans, so exclude them
        # explicitly to keep their historical treatment (encoded to integers).
        if pd.api.types.is_numeric_dtype(values) and not is_bool:
            continue
        df[column] = LabelEncoder().fit_transform(values)

    return df

In [None]:
# Encode a copy so `df` keeps its original labels; encoding_data returns the
# very frame it was given, so the name is bound to the encoded copy.
encoded_df = encoding_data(df.copy())
encoded_df

In [None]:
def corr_matrix(df,title:str,save_as):
    """Draw an annotated correlation heatmap of ``df``, save it and show it.

    Args:
        df: Frame whose pairwise column correlations (``df.corr()``) are plotted.
        title: Title placed above the heatmap.
        save_as: File name (with extension) written inside ``../charts/``.

    Returns:
        None. The figure is written to disk and displayed.
    """
    plt.figure(figsize=(25, 20))
    res = sns.heatmap(df.corr(), annot=True, fmt='.2f')
    # Re-apply the existing tick labels only to enlarge their font.
    res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 15)
    res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 15)
    plt.title(title, size=18, fontweight='bold')
    plt.savefig(f'../charts/{save_as}')
    # The original referenced `plt.show` without calling it, so the figure was
    # never explicitly shown.
    plt.show()

In [None]:
# Heatmap over every column of the encoded frame.
corr_matrix(
    encoded_df,
    title='Correlation matrix of all the dataset',
    save_as='general_correlation.jpg',
)

In [None]:
def find_high_corr(df, target='diagnosis', threshold=0.5):
    """Return the columns whose absolute correlation with ``target`` is high.

    Args:
        df: Frame with numeric columns, including ``target``.
        target: Column the other columns are compared against. Defaults to
            ``'diagnosis'``, the column this helper was originally tied to.
        threshold: Minimum absolute correlation (inclusive) for a column to be
            kept. Defaults to ``0.5``.

    Returns:
        An Index of column labels, in ``df.corr()`` order. ``target`` itself is
        included whenever its self-correlation meets the threshold.

    Raises:
        KeyError: If ``target`` is not a column of the correlation matrix.
    """
    corr_with_target = df.corr()[target]
    # A NaN correlation (e.g. a constant column) compares False and is dropped.
    keep = corr_with_target.abs() >= threshold
    return corr_with_target.index[keep]

In [None]:
# Keep only the columns strongly correlated with the target, then plot the
# correlation matrix restricted to them.
columns_for_analysis = find_high_corr(encoded_df)
corr_matrix(
    encoded_df[columns_for_analysis],
    title='Correlation matrix for highly related features',
    save_as='higher_correlation.jpg',
)

In [None]:
def plot_ditribution(df,columns):
    """Plot, save and show one stacked KDE per column, coloured by ``diagnosis``.

    Args:
        df: Frame containing a ``diagnosis`` column and every column named in
            ``columns``.
        columns: Column names to plot, one figure each. Each figure is written
            to ``../charts/<column>_distribution.jpg``.

    Returns:
        None.
    """
    class_colours = ["red", "green"]
    for feature in columns:
        sns.displot(
            data=df,
            x=feature,
            hue="diagnosis",
            kind='kde',
            multiple='stack',
            palette=class_colours,
        )
        target_file = ''.join(('../charts/', feature, '_distribution.jpg'))
        plt.savefig(target_file)
        plt.show()

In [None]:
# The "_mean" measurement columns whose distributions are plotted.
mean_col = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

plot_ditribution(df, columns=mean_col)

In [None]:
def feature_vs_target(df,features, fields):
    """Draw a grid of histograms, one per ``<feature>_<field>`` column, split by diagnosis.

    Rows of the grid follow ``features`` and columns follow ``fields``. The
    grid is sized from the inputs; it used to be fixed at 10x3, which raised
    IndexError for longer lists and left blank panels for shorter ones. For
    10 features and 3 fields the figure size is unchanged at (20, 45).

    Args:
        df: Frame containing a ``diagnosis`` column and a column named
            ``f"{feature}_{field}"`` for every feature/field combination.
        features: Base feature names (grid rows).
        fields: Suffixes appended to each feature name (grid columns).

    Returns:
        None. Nothing is drawn when either list is empty.
    """
    n_rows, n_cols = len(features), len(fields)
    if n_rows == 0 or n_cols == 0:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so axs[row][col] is always valid.
    _, axs = plt.subplots(
        n_rows,
        n_cols,
        figsize=(20 * n_cols / 3, 4.5 * n_rows),
        squeeze=False,
    )
    for row, feature in enumerate(features):
        for col, field in enumerate(fields):
            sns.histplot(df,
                        x=feature+"_"+field,
                        hue="diagnosis", element="poly",
                        stat="count",
                        palette=["red", "green"],
                        ax=axs[row][col])

In [None]:
# Base measurement names and the suffix variants combined with each of them.
columns = [
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave points",
    "symmetry",
    "fractal_dimension",
]
fields = ["mean", "se", "worst"]
feature_vs_target(df, features=columns, fields=fields)

In [None]:
def plot_outlier(df,columns, title=None):
    """Show box plots of the given columns, grouped by ``diagnosis``.

    Args:
        df: Frame containing a ``diagnosis`` column and every column named in
            ``columns``.
        columns: Column names to compare side by side.
        title: Figure title. When omitted it is built from ``columns``; for
            ``['texture_mean', 'radius_mean']`` this is the same text the
            function previously hard-coded for every input.

    Returns:
        None. The figure is displayed, not saved.
    """
    if title is None:
        title = 'Outliers in ' + ' and '.join(map(str, columns)) + ' \n'

    # Note: this changes seaborn's global style for all later plots as well.
    sns.set(style="darkgrid")
    # Long format: one row per (diagnosis, variable, value) so the selected
    # columns can share a single categorical axis.
    data_frame = pd.melt(df, id_vars='diagnosis', value_vars=columns)
    plt.figure(figsize=(15, 8))
    res = sns.boxplot(x='variable', y='value', hue='diagnosis',
                      data=data_frame, palette=["blue", "skyblue"])
    plt.title(title, size=18, fontweight='bold')
    # Resize tick labels through tick_params rather than re-setting the label
    # texts, which pinned a fixed formatter on the numeric y axis.
    res.tick_params(axis='both', labelsize=15)
    plt.show()

In [None]:
# Columns inspected for outliers (note: rebinds the `columns` name used earlier).
columns = [
    'texture_mean',
    'radius_mean',
]
plot_outlier(df, columns=columns)