<a href="https://colab.research.google.com/github/GuidoGiacomoMussini/SocialNetworkAnalysis--Algorithm/blob/main/SNA_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **0 - Initialisation**

## 0.1 - Libraries

In [1]:
import os
import shutil
from glob import glob

import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
%matplotlib inline
import random

from PIL import Image

import random as rnd
from IPython.display import clear_output 
from tqdm.notebook import tqdm_notebook
from collections import defaultdict, Counter, OrderedDict
import torch
import torchvision.transforms as T
import networkx as nx

import seaborn as sns

from sklearn import metrics
from sklearn.metrics import classification_report


## 0.2 - Functions

In [2]:
def re_index(data):
  """Return `data` with its row labels replaced by 0, 1, ..., len(data) - 1.

  The old index is discarded (not kept as a column) and the input frame is
  left untouched; a new object is returned.
  """
  # reset_index(drop=True) is the built-in equivalent of building a
  # pd.Series(range(len(data))) by hand and passing it to set_index.
  return data.reset_index(drop=True)

In [3]:
def tensor_mean(tens):
  """Return the average value of `tens`.

  The tensor is converted to a NumPy array, averaged along axis 1, and the
  resulting partial means are averaged again into a single scalar.
  """
  partial_means = tens.numpy().mean(axis=1)
  return partial_means.mean()


In [4]:
#https://discuss.pytorch.org/t/how-to-show-a-image-in-jupyter-notebook-with-pytorch-easily/1229/4?u=ataraxy
def show(img):
    """Plot a 3-D image tensor with matplotlib.

    The tensor's first axis is moved to the last position before plotting,
    so callers pass images laid out as (channel, height, width).
    """
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last, interpolation='nearest')

In [5]:
def metrics_definition(net_train, y_train):
  """Build a table of network metrics, one row per graph.

  Parameters
  ----------
  net_train : sequence of networkx graphs
      The networks to summarise.
  y_train : labels
      Assigned as-is to the 'label' column. When it is a pandas object the
      assignment is aligned on the index, not on position.

  Returns
  -------
  pandas.DataFrame
      Columns 'links', 'nodes', 'density', 'centrality', 'clustering',
      'transitivity' and 'label'.
  """
  columns = ['links', 'nodes', 'density', 'centrality', 'clustering', 'transitivity']
  records = []

  for net in tqdm_notebook(net_train):
    # 'centrality' is the mean of the per-node degree centralities.
    # NOTE(review): for a graph without self-loops this mean appears to equal
    # the density, so the two columns may be redundant - confirm.
    degree_centrality = list(nx.degree_centrality(net).values())
    records.append({
      'links': net.number_of_edges(),
      'nodes': net.number_of_nodes(),
      'density': nx.density(net),
      'centrality': np.mean(degree_centrality),
      'clustering': nx.average_clustering(net),
      'transitivity': nx.transitivity(net),
    })

  # Passing `columns` keeps the column set and order even when there are no graphs.
  train_df = pd.DataFrame(records, columns=columns)
  train_df['label'] = y_train
  return train_df

In [6]:
def alg_mean(train_df, label1, label2):
  """Return the mean feature vector of each of the two classes.

  Parameters
  ----------
  train_df : pandas.DataFrame
      Must contain the columns 'links', 'nodes', 'density', 'centrality',
      'clustering', 'transitivity' and 'label'.
  label1, label2 : values of the 'label' column identifying the two classes.

  Returns
  -------
  (tuple, tuple)
      The six column means, in the order listed above, for the rows labelled
      `label1` and for the rows labelled `label2`.
  """
  features = ('links', 'nodes', 'density', 'centrality', 'clustering', 'transitivity')

  def _class_means(label):
    # Select the rows of one class once, then average every feature column.
    rows = train_df.loc[train_df['label'] == label]
    return tuple(rows[name].mean() for name in features)

  return _class_means(label1), _class_means(label2)

In [7]:
def array_to_tuple(test_df):
  """Return the first six columns of every row of `test_df` as a list of tuples.

  Rows and columns are taken by position, so the result does not depend on
  the index labels (which have gaps after a `dropna()`) or on the column
  names. Each value keeps the type of its column.
  """
  first_six = test_df.iloc[:, :6]
  return list(first_six.itertuples(index=False, name=None))

In [8]:
def prediction(tpl_list, label1, label2, dist_0=None, dist_1=None):
  """Assign each tuple to the class whose reference vector is nearest.

  Parameters
  ----------
  tpl_list : list of tuples
      Feature vectors to classify.
  label1, label2 : labels returned for the first and the second class.
  dist_0, dist_1 : tuples, optional
      Reference vectors of the first and second class. When omitted, the
      notebook-level variables `tr_dist_0` and `tr_dist_1` are used, which is
      what this function always did before these parameters existed.

  Returns
  -------
  list
      One label per input tuple. A tuple is given `label1` only when it is
      strictly closer (Euclidean distance) to `dist_0`; ties go to `label2`.
  """
  if dist_0 is None:
    dist_0 = tr_dist_0
  if dist_1 is None:
    dist_1 = tr_dist_1

  predicted = []
  for point in tpl_list:
    to_first = math.dist(dist_0, point)
    to_second = math.dist(dist_1, point)
    predicted.append(label1 if to_second > to_first else label2)
  return predicted

## 0.3 - Import the Data From Kaggle 
*(estimated time: 4 minutes)*

In [None]:
!mkdir WD

! #mkdir ~/.kaggle
! #cp kaggle.json ~/.kaggle/
! #chmod 600 ~/.kaggle/kaggle.json
os.environ['KAGGLE_USERNAME'] = "guidomussini"
os.environ['KAGGLE_KEY'] = "f7b24d630bc3e7e7fda7a5a1b32f4582"
! kaggle datasets download -d kmader/skin-cancer-mnist-ham10000
! unzip /content/skin-cancer-mnist-ham10000.zip -d /content/WD

#remove useless data 
shutil.rmtree('/content/WD/ham10000_images_part_1')
shutil.rmtree('/content/WD/ham10000_images_part_2')
! rm '/content/WD/hmnist_28_28_L.csv'
! rm '/content/WD/hmnist_28_28_RGB.csv'
! rm '/content/WD/hmnist_8_8_L.csv'
! rm '/content/WD/hmnist_8_8_RGB.csv'
! rm '/content/skin-cancer-mnist-ham10000.zip'

# **1 - Dataset Definition**

## 1.1 - Merge the images from the 2 folders

*   Code taken from: https://www.kaggle.com/code/sid321axn/step-wise-approach-cnn-model-77-0344-accuracy

In [50]:
# Index every .jpg found one folder below the working directory by its
# image id (the file name without extension).
base_skin_dir = os.path.join('..', 'content/WD')
imageid_path_dict = {}
for jpg_path in glob(os.path.join(base_skin_dir, '*', '*.jpg')):
    file_name = os.path.basename(jpg_path)
    imageid_path_dict[os.path.splitext(file_name)[0]] = jpg_path

# Lesion metadata shipped with the dataset
sdf = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))


Create a column in which each row contains the path to an image

In [51]:
metadata_csv = os.path.join(base_skin_dir, 'HAM10000_metadata.csv')
sdf = pd.read_csv(metadata_csv)

# Look up the file path of every image_id in imageid_path_dict
sdf['path'] = sdf['image_id'].map(imageid_path_dict.get)

## 1.2 - Metadata Handling 

check for duplicates

In [52]:
# Keep one row per lesion first, then one row per image
sdf = (
    sdf
    .drop_duplicates(subset=['lesion_id'])
    .drop_duplicates(subset=['image_id'])
)

print(sdf.shape[0]) # 7470 lesions are left at this point

7470


check for missing values

In [53]:
# isnull().sum() counts the missing (NA) values of each column, so the label
# says "NA" rather than "non-NA".
print("number of NA values per column:\n",sdf.isnull().sum())
#some NA in age -> since that column will be removed from the dataset, i don't impute them

number of non-NA data per columns:
 lesion_id        0
image_id         0
dx               0
dx_type          0
age             52
sex              0
localization     0
path             0
dtype: int64


## 1.3 - Create the binary label:


*   **B**: If the lesion is Benign (0)
*   **M**: If the lesion is Malignant (1)



In [54]:
# Diagnosis codes grouped into benign and malignant lesions
Benign = ["nv", "bkl", "vasc", "df"]
Malignant = ["mel", "bcc", "akiec"]

# One boolean mask per group; a diagnosis found in neither list keeps its code
m1, m2 = (sdf['dx'].isin(codes) for codes in (Benign, Malignant))
sdf['type'] = np.select(condlist=[m1, m2], choicelist=["B", "M"], default=sdf['dx'])

In [55]:
# Integer code of each 'type' value, as assigned by pd.Categorical
type_codes = pd.Categorical(sdf['type']).codes
sdf['label'] = type_codes

# Keep only the two columns used later and renumber the rows with re_index()
# (defined in the 'Functions' section), because removing rows left gaps in
# the row labels.
sdf = re_index(sdf.loc[:, ['path', 'label']])

## 1.4 - Split the Dataset in train and test set


In [56]:
# 70% of the rows, drawn with a fixed seed, form the training set
train_rows = sdf.sample(frac=0.7, random_state=19)

# every row that was not sampled goes to the test set; both are renumbered from 0
train = re_index(train_rows)
test = re_index(sdf.drop(index=train_rows.index))

## 1.5 - Explicit the images as tensor 
*(estimated time: 3 minutes)*

In [57]:
sh_x, sh_y = 64, 64 #define the dimensions of the images

def _load_resized(path):
  """Open the image at `path`, resize it to (sh_x, sh_y) and return it as an array."""
  # The context manager closes the underlying file as soon as the pixels
  # have been read, instead of leaving that to garbage collection.
  with Image.open(path) as img:
    return np.asarray(img.resize((sh_x, sh_y)))

train['image'] = train['path'].map(_load_resized)
test['image'] = test['path'].map(_load_resized)

#remove useless columns from the dataset
train = train[['image', 'label']]
test = test[['image', 'label']]

In [None]:
# Preview the training image in row 3; permute(2,0,1) moves the last axis to the front, and show() moves it back before plotting
show(torch.tensor(train['image'][3]).permute(2,0,1))

# **2 - Transform the Images**

## 2.1 - Apply center crop and grayscale filter to the images


*   Center Crop to focus only on the lesion
*   Grayscale to reduce the tensor dimensionality by removing the 'color' dimension. This step is necessary to build the adjacency matrix A



In [19]:
crop_dim = int(round(sh_x*0.7, 0)) # side of the cropped image: 70% of the original side

transform1 = T.CenterCrop(size= crop_dim)
transform2 = T.Grayscale()

def _crop_and_gray(image):
  """Turn an (H, W, C) array into a centre-cropped, single-channel tensor."""
  channels_first = torch.tensor(image).permute(2,0,1)
  return transform2(transform1(channels_first))

x_train = train["image"].map(_crop_and_gray)
x_test = test["image"].map(_crop_and_gray)

# labels are kept as one-column DataFrames
y_train = train[['label']]
y_test = test[['label']]


In [None]:
# Preview the cropped, grayscale version of the training image in row 3
show(x_train[3])

## 2.2 - Apply a step function to obtain white-black images
*(estimated time: 17 minutes)*

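Threshold set equal to the average grey level of the given image (see the 'tensor_mean()' function)

*  if pixel > tensor_mean --> the pixel is set to 0
*  if pixel <= tensor_mean --> the pixel keeps its grey level
*  pixels on the main diagonal are always set to 0, so that the resulting network has no self-loops

This step has a primarily visual purpose: when the networks are built in the next chunks, every non-zero pixel becomes a link (1) and every zero pixel means no link (0)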



In [None]:
# Threshold x_train: pixels above the image mean and pixels on the main
# diagonal are set to 0; every other pixel keeps its grey level.
for tens in tqdm_notebook(range(len(x_train))):
  ex = x_train[tens]
  # The threshold is computed once, before any pixel is modified.
  mean = float(tensor_mean(ex))

  # Vectorised form of the pixel-by-pixel loop: one boolean mask for the
  # whole image, then an in-place write through the view of channel 0.
  above_mean = ex[0] > mean
  ex[0][above_mean] = 0
  ex[0].fill_diagonal_(0)

  x_train[tens] = ex

In [None]:
# x_train[3] is already a tensor; wrapping it in torch.tensor() again only
# makes a copy and triggers a copy-construct UserWarning.
show(x_train[3])

The same procedure is applied to the test set.

*(estimated time: 7 minutes)*

In [None]:
# Threshold x_test: pixels above the image mean and pixels on the main
# diagonal are set to 0; every other pixel keeps its grey level.
for tens in tqdm_notebook(range(len(x_test))):
  ex = x_test[tens]
  # The threshold is computed once, before any pixel is modified.
  mean = float(tensor_mean(ex))

  # Vectorised form of the pixel-by-pixel loop: one boolean mask for the
  # whole image, then an in-place write through the view of channel 0.
  above_mean = ex[0] > mean
  ex[0][above_mean] = 0
  ex[0].fill_diagonal_(0)

  x_test[tens] = ex

reshape the images to obtain 2D matrices

In [24]:
# Replace every (1, crop_dim, crop_dim) tensor with a crop_dim x crop_dim NumPy matrix
for images in (x_train, x_test):
  for i in range(len(images)):
    matrix = images[i].reshape(crop_dim, crop_dim)
    images[i] = matrix.numpy()

## 2.4 - Save the networks

create the network lists for train and test set and remove isolated nodes

In [25]:
net_train, net_test = [], []

# Each image matrix is read as an adjacency matrix; nodes left without any
# link are removed before the network is stored.
for images, networks in ((x_train, net_train), (x_test, net_test)):
  for i in range(len(images)):
    provnet = nx.from_numpy_array(images[i], parallel_edges=False, create_using=None)
    isolated = list(nx.isolates(provnet))
    provnet.remove_nodes_from(isolated)
    networks.append(provnet)

In [None]:
# Heatmap of the matrix built from the training image in row 3
sns.heatmap(x_train[3])

In [None]:
# Draw the network built from the same image (isolated nodes already removed)
nx.draw(net_train[3])

# **3 - The Algorithm**

In [28]:
# Number of nodes --> net.number_of_nodes()
# Number of links --> net.number_of_edges()
# Density --> nx.density(net)
# Normalized centrality --> mean of the values of nx.degree_centrality(net)
# Average clustering coefficient --> nx.average_clustering(net)
# Transitivity --> nx.transitivity(net)

## 3.1 - Create the Network-metrics dataset. 

*(estimated time: 3 minutes)*

The metrics that will be calculated are:
*   Number of nodes
*   Number of links
*   Density
*   Centrality
*   Average clustering coefficient
*   Transitivity




Train and test sets

In [None]:
# Each metrics table gets the labels of its own split. The test table was
# previously labelled with y_train, so the evaluation compared predictions
# against training labels.
train_df = metrics_definition(net_train, y_train)
test_df = metrics_definition(net_test, y_test)

Remove NaN from the datasets

In [30]:
# Dropping rows leaves gaps in the row labels; renumber them from 0 so that
# later code that walks the rows with range(len(...)) still finds every row.
train_df = train_df.dropna().reset_index(drop=True)
test_df = test_df.dropna().reset_index(drop=True)

## 3.2 - train the algorithm

In [31]:
# Mean feature vector of label 0 and of label 1 on the training set.
# prediction() reads tr_dist_0 and tr_dist_1 as notebook-level names, so this
# cell has to run before prediction() is called.
tr_dist_0, tr_dist_1 = alg_mean(train_df, 0, 1)
# Same statistics on the test set.
# NOTE(review): te_dist_0 / te_dist_1 are not referenced by any other visible cell - confirm whether they are needed.
te_dist_0, te_dist_1 = alg_mean(test_df, 0, 1)

transform the results into tuples

In [32]:
# One tuple of the six network metrics per row of the test table
tpl_te_list = array_to_tuple(test_df)

## 3.3 - Prediction

In [33]:
# Give each test tuple label 0 or 1, whichever class mean vector (tr_dist_0 / tr_dist_1) is nearer in Euclidean distance
predict_te = prediction(tpl_te_list, 0, 1)

results

In [None]:
# True labels of the test rows
ytest = test_df['label']

# Per-class precision/recall/F1 and the confusion matrix of the predictions
report_te = metrics.classification_report(ytest, predict_te)
cnf_test = metrics.confusion_matrix(ytest, predict_te)

print("----------------------------TEST--------------------------------\n")
print(report_te, "\n Confusion matrix \n", cnf_test)


# **Synthetic images generator**

generate synthetic images based on ER random graph theory

In [35]:
def SN_random(mat, seed=None):
  """Generate a synthetic image from the image matrix `mat`.

  `mat` is read as an adjacency matrix and the density of the resulting
  network (isolated nodes removed) is computed. Then, cell by cell:

  * a zero cell stays zero;
  * a non-zero cell keeps its value when a uniform draw in [0, 1] falls
    below the density, otherwise it is replaced by a random integer between
    round(max - mean) and max, where max and mean are taken over `mat`.

  Parameters
  ----------
  mat : 2-D numpy array
  seed : optional
      When given, a private random generator seeded with it is used, which
      makes the output reproducible. When omitted, the shared `random`
      module state is used as before.

  Returns
  -------
  numpy.ndarray
      Float array with the same shape as `mat`.
  """
  rng = random.Random(seed) if seed is not None else random

  mu = mat.mean(axis=1).mean()

  # Named `synthetic` rather than `rnd`, which is also the alias under which
  # this notebook imports the random module.
  synthetic = np.zeros((mat.shape[0], mat.shape[1]))

  net = nx.from_numpy_array(mat, parallel_edges=False, create_using=None)
  net.remove_nodes_from(list(nx.isolates(net)))
  p_one = nx.density(net)

  # randint needs plain Python ints: a float bound is rejected by recent
  # Python versions, and a uint8 maximum can overflow when randint adds 1.
  upper = int(mat.max())
  lower = int(round(upper - mu))

  for i in range(mat.shape[0]):
    for j in range(mat.shape[1]):
      if mat[i][j] == 0:
        # `synthetic` starts as all zeros, so there is nothing to write.
        continue
      if p_one <= rng.uniform(0, 1):
        synthetic[i][j] = rng.randint(lower, upper)
      else:
        synthetic[i][j] = mat[i][j]

  return synthetic

In [36]:
# plt1 is the matrix of the training image in row 3, plt2 a synthetic image generated from it
plt1 = x_train[3]
plt2 = SN_random(plt1)

In [None]:
# Heatmap of the original matrix
sns.heatmap(plt1)

In [None]:
# Heatmap of the synthetic matrix returned by SN_random
sns.heatmap(plt2)