In [None]:
import os
import math
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from PIL import Image

from collections import Counter
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency

In [None]:
# Load the PAD-UFES-20 metadata table; the path is relative to the notebook's working directory.
pad_dataset = pd.read_csv('datasets/PAD_20_Metadata.csv')

In [None]:
# Preview the first rows of the metadata table.
pad_dataset.head()

In [None]:
# Column dtypes and non-null counts (shows which columns contain missing values).
pad_dataset.info()

In [None]:
# Bar chart with the number of rows per diagnostic label.
pad_dataset['diagnostic'].value_counts().plot(kind='bar', rot=0, title="Disease Distribution")

In [None]:
# The 20 patients with the most rows (value_counts sorts descending by default).
pad_dataset['patient_id'].value_counts().head(20)

In [None]:
# Number of patients that appear in more than one row.
pad_dataset['patient_id'].value_counts().gt(1).sum()

In [None]:
# Number of distinct patient ids (a missing id, if any, counts as one value).
pad_dataset['patient_id'].nunique(dropna=False)

In [None]:
# For i = 1..10, print how many patients have exactly i rows in the table.
rows_per_patient = pad_dataset['patient_id'].value_counts()
for i in range(1, 11):
    count = rows_per_patient.eq(i).sum()
    print(f'{i}: {count}')

In [None]:
# Number of distinct lesion ids (a missing id, if any, counts as one value).
pad_dataset['lesion_id'].nunique(dropna=False)

In [None]:
# The 10 lesions with the most rows (value_counts sorts descending by default).
pad_dataset['lesion_id'].value_counts().head(10)

In [None]:
# Number of lesions that appear in more than one row.
pad_dataset['lesion_id'].value_counts().gt(1).sum()

In [None]:
# Share of patients that have exactly i rows, printed as LaTeX-style table rows.
# The per-patient counts are computed once instead of on every iteration.
records_per_patient = pad_dataset['patient_id'].value_counts()

# Denominator is the number of distinct patients. The previous len() of the
# column counted table rows, so the printed shares were not patient shares.
total_patients = records_per_patient.size

for i in range(1, 11):
    count = (records_per_patient == i).sum()
    percentage = count / total_patients * 100
    # '\\%' emits a literal backslash + percent sign; the bare '\%' was an
    # invalid escape sequence that newer Python versions warn about.
    print(f'{i} diseases | {count} cases | & {percentage:.2f}\\%')

In [None]:
# Among rows with one of the listed diagnoses, what fraction belongs to smokers?
# Comparing with == True / == False leaves out rows where 'smoke' is missing.
ill_mask = pad_dataset['diagnostic'].isin(['MEL', 'ACK', 'SCC', 'BCC'])
smoker_mask = pad_dataset['smoke'] == True
non_smoker_mask = pad_dataset['smoke'] == False

smoke_ill = int((smoker_mask & ill_mask).sum())
no_smoke_ill = int((non_smoker_mask & ill_mask).sum())

result = smoke_ill / (smoke_ill + no_smoke_ill)
result

In [None]:
# Rate of the listed diagnoses among smokers and among non-smokers,
# restricted to rows where both 'smoke' and 'diagnostic' are known.
# NOTE(review): 'ACK' is grouped with the cancers here — confirm that is intended.
cancer_types = ['MEL', 'ACK', 'SCC', 'BCC']

df = pad_dataset.dropna(subset=['smoke', 'diagnostic'])

is_smoker = df['smoke'] == True
is_nonsmoker = df['smoke'] == False
has_cancer = df['diagnostic'].isin(cancer_types)

total_smokers = int(is_smoker.sum())
total_nonsmokers = int(is_nonsmoker.sum())

smokers_with_cancer = int((is_smoker & has_cancer).sum())
nonsmokers_with_cancer = int((is_nonsmoker & has_cancer).sum())

p_cancer_smokers = smokers_with_cancer / total_smokers
p_cancer_nonsmokers = nonsmokers_with_cancer / total_nonsmokers

p_cancer_smokers, p_cancer_nonsmokers

In [None]:
# Label groups used to derive the binary target.
# The PAD-UFES-20 dataset description lists BCC, MEL and SCC as the skin
# cancers and ACK, NEV and SEK as the other skin diseases; BCC (basal cell
# carcinoma) was previously listed as benign and ACK as malignant.
# NOTE(review): ACK (actinic keratosis) is often described as pre-malignant —
# confirm whether it should stay in the benign group for this analysis.
malignant_types = ['MEL', 'SCC', 'BCC']
benign_types = ['ACK', 'NEV', 'SEK']

# Work on a copy so columns added later do not modify the loaded table.
df = pad_dataset.copy()

# Number of rows without a diagnosis (these get is_malignant == 0 downstream).
df['diagnostic'].isna().sum()

In [None]:
# Binary target: 1 when the diagnosis is in malignant_types, 0 otherwise (missing diagnoses map to 0).
df['is_malignant'] = df['diagnostic'].isin(malignant_types).astype('int64')

In [None]:
# Absolute class counts for the binary target and for the raw diagnosis labels.
for column in ('is_malignant', 'diagnostic'):
    print(df[column].value_counts())

In [None]:
# Contingency table smoke x is_malignant and the difference in malignancy rate.
tab = pd.crosstab(df['smoke'], df['is_malignant'])
display(tab)

smoker_row = tab.loc[True]
nonsmoker_row = tab.loc[False]

# Share of malignant rows (column label 1) within each smoking group.
p_smokers = smoker_row[1] / smoker_row.sum()
p_nonsmokers = nonsmoker_row[1] / nonsmoker_row.sum()
risk_diff = p_smokers - p_nonsmokers

print(f"p_smokers={p_smokers:.3f}, p_nonsmokers={p_nonsmokers:.3f}, risk_diff={risk_diff:.3f}")


In [None]:
# Bucket age into four groups; pd.cut uses right-closed intervals, so e.g. age 30 lands in '<30'.
age_edges = [0, 30, 45, 60, 120]
age_labels = ['<30', '30-45', '45-60', '60+']

df['age_bin'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)
df['age_bin'].value_counts(ascending=True)

In [None]:
def cramers_v(x, y, correction=False):
    """Cramér's V association between two categorical series.

    Parameters
    ----------
    x, y : array-like / pandas.Series
        Categorical observations; they are cross-tabulated with ``pd.crosstab``.
    correction : bool, default False
        Forwarded to ``chi2_contingency``. Yates' continuity correction only
        affects 2x2 tables and makes V of a binary variable with itself
        smaller than 1, so it is disabled by default.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``, or ``nan`` when the
        table is empty or one of the variables has fewer than two levels.
    """
    ct = pd.crosstab(x, y)
    n = ct.to_numpy().sum()
    k = min(ct.shape) - 1

    # Degenerate table: V is undefined, and the formula would divide by zero.
    if n == 0 or k < 1:
        return float('nan')

    chi2 = chi2_contingency(ct, correction=correction)[0]
    return math.sqrt(chi2 / (n * k))

In [None]:
def cat_assoc_test(x, y='is_malignant', df=df):
    """Display the contingency table of two columns and print chi-square and Cramér's V.

    Parameters
    ----------
    x : str
        Name of the categorical column to test.
    y : str, default 'is_malignant'
        Name of the second (target) column.
    df : pandas.DataFrame
        Source table. The default is bound to the notebook-level ``df`` object
        that exists when this cell is executed; re-run the cell after ``df``
        is rebound to a new object.

    Rows with a missing value in either column, or with the sentinel
    ``'UNK'`` in ``x``, are excluded. Nothing is returned.
    """
    # dropna already returns a new frame, so no explicit copy is needed.
    test_df = df.dropna(subset=[x, y])
    test_df = test_df[test_df[x] != 'UNK']
    ct = pd.crosstab(test_df[x], test_df[y])
    display(ct)

    chi2, p, _dof, _expected = chi2_contingency(ct)

    print("Chi2:", chi2, "p:", p)

    # Use the notebook's cramers_v helper rather than a second copy of the
    # formula; the local is named effect_size so it does not shadow the helper.
    effect_size = cramers_v(test_df[x], test_df[y])
    print(f"Cramer V: {effect_size:.3f}")

In [None]:
# Association between 'smoke' and the default target column ('is_malignant').
cat_assoc_test('smoke')

In [None]:
# Run the association test for every candidate column, with a banner before each result.
tested_columns = ('age_bin', 'smoke', 'drink', 'bleed', 'itch', 'changed')

for col in tested_columns:
    print(f"\n{'='*50}", f"Testing: {col}", f"{'='*50}", sep="\n")
    cat_assoc_test(col)

In [None]:
# Pairwise Cramér's V between the target and the categorical features, drawn as an annotated heatmap.
cats = ['is_malignant', 'smoke', 'drink', 'bleed', 'itch', 'changed']
n = len(cats)

# mat[i, j] holds the association between cats[i] (row) and cats[j] (column).
mat = np.array([
    [cramers_v(df[row_var], df[col_var]) for col_var in cats]
    for row_var in cats
])

fig, ax = plt.subplots(figsize=(8,6))
im = ax.imshow(mat, interpolation='nearest')

tick_positions = range(n)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(cats, rotation=45, ha='right')
ax.set_yticklabels(cats)

# Write each value into its cell; imshow puts columns on x and rows on y.
for (row, col), value in np.ndenumerate(mat):
    ax.text(col, row, f"{value:.2f}", ha='center', va='center', fontsize=8)

plt.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()

In [None]:
def entropy(series):
    """Shannon entropy (in bits) of the empirical distribution of ``series``.

    Missing values are ignored (``value_counts`` drops them). Returns a
    numpy float; the result is zero for a constant or empty series.
    """
    probs = series.value_counts(normalize=True)

    # Categorical dtypes report unused categories with frequency 0; drop them
    # so log2 is never evaluated at 0 (0 * log 0 is taken as 0 by convention).
    probs = probs[probs > 0]

    return -(probs * np.log2(probs)).sum()

def conditional_entropy(x, y):
    """Conditional entropy H(x | y) in bits.

    Parameters
    ----------
    x, y : pandas.Series (or array-likes of equal length)
        Paired categorical observations; Series are aligned on their index.

    Returns
    -------
    float
        Sum over the values of ``y`` of P(y) * H(x | y), using ``entropy``
        for the within-group term.

    Notes
    -----
    Rows whose ``y`` is missing belong to no group (groupby drops NaN keys)
    but are still counted in the denominator, so pass data without missing
    values to get properly normalised weights.
    """
    # Fixed internal column names: the inputs may be unnamed or may share the
    # same name, either of which breaks grouping by ``y.name``.
    joint = pd.DataFrame({'x': x, 'y': y})
    total = len(joint)

    ce = 0.0
    for _, group in joint.groupby('y'):
        weight = len(group) / total
        ce += weight * entropy(group['x'])
    return ce

def theils_u(x, y):
    """Theil's U (uncertainty coefficient) of ``x`` given ``y``.

    U(x | y) = (H(x) - H(x | y)) / H(x): the fraction of the uncertainty in
    ``x`` that is removed by knowing ``y``. The measure is asymmetric.

    Returns 1.0 when H(x) is zero: a constant ``x`` carries no uncertainty,
    so it is treated as fully determined instead of dividing by zero.
    """
    hx = entropy(x)
    if hx == 0:
        return 1.0

    h_x_given_y = conditional_entropy(x, y)
    return (hx - h_x_given_y) / hx

# Association of each symptom column with the malignancy target:
# symmetric Cramér's V next to the asymmetric Theil's U.
symptom_columns = ['bleed', 'itch', 'grew', 'hurt', 'changed', 'elevation']
results = []

for s in symptom_columns:
    # Keep only rows where the symptom is known: not missing and not the 'UNK' sentinel.
    test_df = df.dropna(subset=[s, 'is_malignant'])
    test_df = test_df.loc[test_df[s].ne('UNK')]

    v = cramers_v(test_df[s], test_df['is_malignant'])
    u = theils_u(test_df[s], test_df['is_malignant'])
    results.append({'symptom': s, 'cramers_v': v, 'theils_u': u})

pd.DataFrame(results).sort_values('cramers_v', ascending=False)

In [None]:
lr_model = LogisticRegression()

# Feature columns for the model.
train_col = ['bleed', 'itch', 'grew', 'hurt', 'changed', 'elevation', 'smoke', 'drink', 'cancer_history', 'gender']

# Training table: rows complete in every feature and the target, without the two background columns.
train_df = (
    df
    .dropna(subset=train_col + ['is_malignant'])
    .drop(columns=['background_father', 'background_mother'])
)

In [None]:
# Encode boolean-like and gender values as 0/1.
# NOTE(review): Series.map turns any value missing from the mapping into NaN —
# confirm no other values (e.g. an 'UNK' sentinel) remain in these columns.
binary_encoding = {True: 1, False: 0, 'True': 1, 'False': 0, 'FEMALE': 0, 'MALE': 1}

for col in train_col:
    train_df[col] = train_df[col].map(binary_encoding)

train_df.isna().sum(), train_df['grew'].value_counts()

In [None]:
def analyze_image_sizes(image_dir, sample_size=1000):
    """Print resolution statistics for a sample of images in a directory.

    Parameters
    ----------
    image_dir : str
        Directory that contains the image files.
    sample_size : int, default 1000
        Maximum number of files to inspect.

    Returns
    -------
    zip
        One-shot iterator of ``(width, height)`` pairs for the inspected
        files; empty when the directory holds no matching images.
    """
    # Sort the listing so the sample is reproducible (os.listdir order is
    # arbitrary) and match extensions case-insensitively.
    files = sorted(
        f for f in os.listdir(image_dir)
        if f.lower().endswith(('.jpg', '.png'))
    )[:sample_size]

    sizes = []
    for f in files:
        # The context manager closes the file handle right after the size is read.
        with Image.open(os.path.join(image_dir, f)) as img:
            sizes.append(img.size)

    # Without this guard min()/max() below raise ValueError on an empty sample.
    if not sizes:
        print(f"Brak obrazów w katalogu: {image_dir}")
        return zip((), ())

    size_counts = Counter(sizes)
    print("Najczęstsze rozdzielczości:")
    for size, count in size_counts.most_common(10):
        print(f"  {size[0]}x{size[1]}: {count} obrazów")

    widths = [s[0] for s in sizes]
    heights = [s[1] for s in sizes]
    print(f"\nStatystyki szerokości: min={min(widths)}, max={max(widths)}, średnia={np.mean(widths):.0f}")
    print(f"Statystyki wysokości: min={min(heights)}, max={max(heights)}, średnia={np.mean(heights):.0f}")

    return zip(widths, heights)

In [None]:
# Inspect up to the default sample of images. The return value is a zip
# iterator of (width, height) pairs, so it can be consumed only once.
# This rebinds `results`, which an earlier cell used for the symptom records.
results = analyze_image_sizes("datasets/PAD-UFES-20/images")