<a href="https://www.kaggle.com/code/fahmikazimd/cs-federated-learning?scriptVersionId=172011814" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# **Data Loading**

In [None]:
# Read the survey CSV and drop the 'ID' column in a single chained expression.
DATA_PATH = '../input/drug-consumptions-uci/Drug_Consumption.csv'
data = pd.read_csv(DATA_PATH).drop(columns='ID')
data.head()

# **Data Cleaning and Preprocessing**

# *Data Cleaning*
In the description of the data, we are told that Semer is a fictitious drug that is used as a control. Since it is not a real drug, respondents who claimed to have used Semer are assumed to be over-claimers. We cannot be sure these individuals have accurately reported their drug use; therefore, we remove them from the data frame.


In [None]:
print(f'Original shape of data with {data.shape[0]} rows and {data.shape[1]} columns')
# Removing Overclaimers: keep only the respondents whose Semer answer is 'CL0'.
# (The previous `data.query(...)` call discarded its result and had no effect,
# so the filtering is now done in one explicit step.)
data = data[data['Semer'] == 'CL0']
print(f'Shape of data without overclaimers with {data.shape[0]} rows and {data.shape[1]} columns')

# *Encoding*

Encoding the features as the following: 

* Age:
    * 0 = 18-24
    * 1 = 25-34
    * 2 = 35-44
    * 3 = 45-54
    * 4 = 55-64
    * 5 = 65+
* Gender:
    * 0 = F
    * 1 = M
* Education:
    * 0 = Left school before 16 years
    * 1 = Left school at 16 years
    * 2 = Left school at 17 years
    * 3 = Left school at 18 years
    * 4 = Some college or university, no certificate or degree
    * 5 = Professional certificate/ diploma
    * 6 = University degree
    * 7 = Masters degree
    * 8 = Doctorate degree
* Country:
    * 0 = Australia
    * 1 = Canada
    * 2 = New Zealand
    * 3 = Other
    * 4 = Republic of Ireland
    * 5 = UK
    * 6 = USA
* Ethnicity:
    * 0 = Asian
    * 1 = Black
    * 2 = Mixed-Black/Asian
    * 3 = Mixed-White/Asian
    * 4 = Mixed-White/Black
    * 5 = Other
    * 6 = White
* Drug Use (the original classes CL0–CL6 are collapsed into a binary label):
    * 0 = non-user: never used the drug (CL0) or used it over a decade ago (CL1)
    * 1 = user: used in the last decade (CL2), in the last year (CL3), in the last month (CL4), in the last week (CL5), or in the last day (CL6)

In [None]:
def encode(df, column, encoding):
    """Return a copy of ``df`` with ``column`` replaced by its integer codes.

    Each value of ``df[column]`` is looked up in the ``encoding`` mapping and
    the result is stored as ``np.int64``. A value missing from ``encoding``
    raises ``KeyError``. The input frame is not modified.
    """
    encoded = df.copy()
    codes = encoded[column].map(encoding.__getitem__)
    encoded[column] = codes.astype(np.int64)
    return encoded

In [None]:
# Encoding Age: the ordinal code is the position of the bracket in this list
age_brackets = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
age_encoding = {bracket: code for code, bracket in enumerate(age_brackets)}

data = encode(data, 'Age', age_encoding)

In [None]:
# Encoding Gender: male -> 1, female -> 0
gender_encoding = dict(M=1, F=0)
data = encode(data, 'Gender', gender_encoding)

In [None]:
# Encoding Education: the ordinal code is the position of the level in this list
education_levels = [
    'Left school before 16 years',
    'Left school at 16 years',
    'Left school at 17 years',
    'Left school at 18 years',
    'Some college or university, no certificate or degree',
    'Professional certificate/ diploma',
    'University degree',
    'Masters degree',
    'Doctorate degree',
]
education_encoding = {level: code for code, level in enumerate(education_levels)}
data = encode(data, 'Education', education_encoding)

In [None]:
# Encoding Country: the code is the position of the country in this list
countries = [
    'Australia',
    'Canada',
    'New Zealand',
    'Other',
    'Republic of Ireland',
    'UK',
    'USA',
]
country_encoding = {country: code for code, country in enumerate(countries)}
data = encode(data, 'Country', country_encoding)

In [None]:
# Encoding Ethnicity: the code is the position of the group in this list
ethnic_groups = [
    'Asian',
    'Black',
    'Mixed-Black/Asian',
    'Mixed-White/Asian',
    'Mixed-White/Black',
    'Other',
    'White',
]
ethnicity_encoding = {group: code for code, group in enumerate(ethnic_groups)}
data = encode(data, 'Ethnicity', ethnicity_encoding)

In [None]:
# Encoding Drug use
drugs = [
    'Alcohol', 'Amyl', 'Amphet', 'Benzos', 'Choc', 'Caff',
    'Cannabis', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine',
    'Legalh', 'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'VSA',
]
# The seven usage classes are binarized: CL0 and CL1 map to 0,
# CL2 through CL6 map to 1.
drug_use_encoding = {f'CL{level}': int(level >= 2) for level in range(7)}
for drug_column in drugs:
    data = encode(data, drug_column, drug_use_encoding)

In [None]:
data.head()

# *Removing Unnecessary Columns*

In [None]:
# Drop the overclaimer marker column, then renumber the rows from zero
data = data.drop(columns=['Semer']).reset_index(drop=True)

Since Coke and Crack are both forms of cocaine, we merge them into a single Cocaine column and remove the original Coke and Crack columns.

In [None]:
# Merging the Coke and Crack columns: Cocaine is 1 when either flag is set
coke_or_crack = data[['Coke', 'Crack']].sum(axis=1) > 0
data['Cocaine'] = coke_or_crack.astype(np.int64)
data = data.drop(columns=['Coke', 'Crack'])

In [None]:
# Finding the illegal drug with most data
illegal_drugs = ['Amyl', 'Amphet','Cocaine', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms', 'VSA']
# Number of positive (user) rows per drug; max() keeps the first drug on ties,
# matching a strict "greater than" scan over the list.
user_counts = {drug: data[drug].sum() for drug in illegal_drugs}
highest_used_drug = max(illegal_drugs, key=user_counts.get)
highest_count = user_counts[highest_used_drug]

print(highest_used_drug, highest_count)

In [None]:
corr = data.corr(numeric_only=True)
# Columns whose absolute correlation with Legalh is below 0.30
low_corr = [column for column, value in corr['Legalh'].items() if abs(value) < 0.30]
print(low_corr)

In [None]:
# Columns removed before modelling (excluding the score values and other drugs)
rem = ['Education', 'Country', 'Ethnicity', 'Alcohol','Caff', 'Choc']
data = data.drop(columns=rem)

In [None]:
data.head()

# *Creating Centers and Preprocessing*

In [None]:
print("Size:" ,data.shape[0])
# Shuffle the rows before partitioning. DataFrame.sample returns a new frame,
# so the result must be assigned back; otherwise the centers are simply
# consecutive slices of the unshuffled data.
data = data.sample(frac=1, random_state=1)

# Split the shuffled rows into equally sized, non-overlapping centers.
n_centers = 4
centers = {}
for i in range(n_centers):
    center_name = 'center_' + str(i+1)
    centers[center_name] = data.iloc[i*len(data) // n_centers: (i+1)*len(data) // n_centers]

for i in range(1, n_centers + 1):
    print(f"Center {i} size:", centers[f"center_{i}"].shape[0])

In [None]:
for center, df in centers.items():
    # Separate the label column (Legalh) from the feature columns
    features = df.drop(columns='Legalh')
    target = df['Legalh']
    # Hold out 20% of each center as its test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1
    )
    # Fit the scaler on the training split only, then apply it to both splits
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(
        scaler.transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )
    # Replace the raw frame with its prepared train/test splits
    centers[center] = dict(
        X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )

In [None]:
centers['center_1']['X_train'].head()

In [None]:
centers['center_2']['X_train'].head()

In [None]:
centers['center_3']['X_train'].head()

In [None]:
centers['center_4']['X_train'].head()

# **Training**

# *Function to show the results*

In [None]:
def validate(models, centers):
    """Print and plot accuracy and F1 score on every center's test set.

    For each center, four models are evaluated on that center's
    ``X_test``/``y_test``: the center's own local model plus the models stored
    under the keys ``'FedAvg'``, ``'FedProx'`` and ``'FedAdam'``.

    Parameters
    ----------
    models : dict
        Maps each center name and the three aggregation names to an object
        exposing ``predict``.
    centers : dict
        Maps each center name to a dict holding ``'X_test'`` and ``'y_test'``.
    """
    # squeeze=False keeps `axes` two-dimensional even with a single center,
    # so axes[row][col] indexing is always valid.
    fig, axes = plt.subplots(nrows=len(centers), ncols=2, figsize=(12, 32), squeeze=False)

    for row, center in enumerate(centers):
        X_test = centers[center]['X_test']
        y_test = centers[center]['y_test']

        concerned_models = {
            center: models[center],
            'FedAvg': models['FedAvg'],
            'FedProx': models['FedProx'],
            'FedAdam': models['FedAdam']
        }

        accuracy_values = {}  # accuracy (in percent) per model
        f1_score_values = {}  # F1 score of the positive class per model
        for name, model in concerned_models.items():
            # Predict once and reuse the predictions for both metrics
            yhat = model.predict(X_test)
            accuracy_values[name] = accuracy_score(y_test, yhat) * 100
            f1_score_values[name] = f1_score(y_test, yhat, pos_label=1)

        print("Accuracy:", accuracy_values)
        print("F1 scores:", f1_score_values)
        model_names = list(concerned_models.keys())
        x = np.arange(len(model_names))

        # Plot Accuracy
        ax1 = axes[row][0]
        ax1.bar(x, list(accuracy_values.values()), label='Accuracy', color='blue', alpha=0.7)
        ax1.set_xticks(x)
        ax1.set_xticklabels(model_names, rotation=45, ha='right')
        ax1.set_ylabel('Accuracy')
        ax1.set_title(f'Accuracy and F1 Score for Different Models on the test set of {center}')

        # Plot F1 Score
        ax2 = axes[row][1]
        ax2.bar(x, list(f1_score_values.values()), label='F1 Score', color='green', alpha=0.7)
        ax2.set_xticks(x)
        ax2.set_xticklabels(model_names, rotation=45, ha='right')
        ax2.set_ylabel('F1 Score')

    plt.tight_layout()
    plt.show()
    
    

# *Logistic Regression*

In [None]:
# Aggregation functions

def FedAvgLogistic(models):
    """Return a new model whose parameters are the mean of the local models'.

    ``coef_`` and ``intercept_`` are averaged element-wise over every model in
    ``models``. The result is a deep copy of the first model with the averaged
    parameters assigned, so none of the input models is modified and the
    returned object is independent of them.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAvgLogistic requires at least one local model")

    # Work on a copy: mutating the first local model in place would corrupt
    # that center's own model and alias it with the aggregated one.
    global_model = copy.deepcopy(local_models[0])
    global_model.coef_ = np.mean([m.coef_ for m in local_models], axis=0)
    global_model.intercept_ = np.mean([m.intercept_ for m in local_models], axis=0)

    return global_model

def FedProxLogistic(models, learning_rate=0.01, regularization_strength=0.1):
    """Return a new model built by proximal-style updates from the local models.

    Starts from a deep copy of the first model and, for every other model,
    subtracts ``learning_rate * (local + regularization_strength * global)``
    from both ``coef_`` and ``intercept_``. The local parameters play the role
    of the gradient in this update.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedProx formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedProxLogistic requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    reference = local_models[0]
    global_model = copy.deepcopy(reference)

    for model in local_models:
        if model is reference:
            continue  # the starting point contributes no update of its own
        global_model.coef_ = global_model.coef_ - learning_rate * (
            model.coef_ + regularization_strength * global_model.coef_
        )
        global_model.intercept_ = global_model.intercept_ - learning_rate * (
            model.intercept_ + regularization_strength * global_model.intercept_
        )

    return global_model

def FedAdamLogistic(models, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7, regularization_strength=0.1):
    """Return a new model built by Adam-style updates from the local models.

    Starts from a deep copy of the first model. Every model's ``coef_`` (the
    first one included) feeds the first and second moment estimates, with the
    local coefficients playing the role of the gradient. The bias-corrected
    Adam step is applied to ``coef_`` only for models other than the first.
    ``intercept_`` is updated with a plain step that uses
    ``regularization_strength``.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedAdam formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAdamLogistic requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    reference = local_models[0]
    global_model = copy.deepcopy(reference)

    # Moment estimates for Adam
    m = np.zeros(reference.coef_.shape)
    v = np.zeros(reference.coef_.shape)

    for t, model in enumerate(local_models, start=1):
        gradient = model.coef_

        # Update moment estimates
        m = beta1 * m + (1 - beta1) * gradient
        v = beta2 * v + (1 - beta2) * (gradient ** 2)

        # Bias correction
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)

        if model is reference:
            continue  # the starting point only seeds the moment estimates

        global_model.coef_ = global_model.coef_ - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        global_model.intercept_ = global_model.intercept_ - learning_rate * (
            model.intercept_ + regularization_strength * global_model.intercept_
        )

    return global_model

In [None]:
# Fit one local logistic-regression model per center
logistic_regression_models = {}
for center, split in centers.items():
    X_train, y_train = split['X_train'], split['y_train']
    model = LogisticRegression().fit(X_train, y_train)
    logistic_regression_models[center] = model

In [None]:
# Applying Federated learning
# Snapshot the local models so the aggregators only see the per-center entries
temp_logistic = dict(logistic_regression_models)

for name, aggregate in (
    ('FedAvg', FedAvgLogistic),
    ('FedProx', FedProxLogistic),
    ('FedAdam', FedAdamLogistic),
):
    logistic_regression_models[name] = aggregate(temp_logistic)

In [None]:
validate(logistic_regression_models, centers)

# *Naive Bayes*

In [None]:
def FedAvgNB(models):
    """Return a new Naive Bayes model with averaged priors and means.

    ``class_prior_`` and ``theta_`` are averaged element-wise over every model
    in ``models``. All other attributes come from a deep copy of the first
    model, so none of the input models is modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAvgNB requires at least one local model")

    # Work on a copy: mutating the first local model in place would corrupt
    # that center's own model and alias it with the aggregated one.
    global_model = copy.deepcopy(local_models[0])
    global_model.class_prior_ = np.mean([m.class_prior_ for m in local_models], axis=0)
    global_model.theta_ = np.mean([m.theta_ for m in local_models], axis=0)
    return global_model

def FedProxNB(models, learning_rate=0.01, penalty=0.01):
    """Return a new Naive Bayes model built by proximal-style blending.

    Starts from a deep copy of the first model and, for every later model,
    replaces ``class_prior_`` and ``theta_`` with
    ``(1 - learning_rate * penalty) * global + learning_rate * local``.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedProx formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedProxNB requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    global_model = copy.deepcopy(local_models[0])
    shrink = 1 - learning_rate * penalty
    for model in local_models[1:]:
        global_model.class_prior_ = shrink * global_model.class_prior_ + learning_rate * model.class_prior_
        global_model.theta_ = shrink * global_model.theta_ + learning_rate * model.theta_
    return global_model


def FedAdamNB(models, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Return a new Naive Bayes model built by Adam-style updates.

    Starts from a deep copy of the first model. For every later model, that
    model's ``class_prior_`` feeds per-class first and second moment estimates.
    The resulting bias-corrected step is subtracted from ``class_prior_`` and
    from every entry of the matching row of ``theta_``.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedAdam formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAdamNB requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    global_model = copy.deepcopy(local_models[0])
    prior = np.asarray(global_model.class_prior_, dtype=float)
    theta = np.asarray(global_model.theta_, dtype=float)

    # One moment estimate per class
    m = np.zeros_like(prior)
    v = np.zeros_like(prior)

    # The step counter only advances for the models that contribute an update
    for t, model in enumerate(local_models[1:], start=1):
        local_prior = np.asarray(model.class_prior_, dtype=float)
        m = beta1 * m + (1 - beta1) * local_prior
        v = beta2 * v + (1 - beta2) * (local_prior ** 2)
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        step = learning_rate * m_hat / (epsilon + np.sqrt(v_hat))
        prior = prior - step
        # Each class's step is applied to every feature mean of that class
        theta = theta - step[:, np.newaxis]

    global_model.class_prior_ = prior
    global_model.theta_ = theta
    return global_model

In [None]:
# Fit one local Gaussian Naive Bayes model per center
naive_bayes_models = {}

for center, split in centers.items():
    X_train, y_train = split['X_train'], split['y_train']
    naive_bayes = GaussianNB().fit(X_train, y_train)
    naive_bayes_models[center] = naive_bayes

In [None]:
# Applying Federated learning
# Snapshot the local models so the aggregators only see the per-center entries
temp_nbm = dict(naive_bayes_models)

for name, aggregate in (
    ('FedAvg', FedAvgNB),
    ('FedProx', FedProxNB),
    ('FedAdam', FedAdamNB),
):
    naive_bayes_models[name] = aggregate(temp_nbm)

In [None]:
validate(naive_bayes_models, centers)

# *K means Clustering*

In [None]:
# Trying to find number of clusters
# Project each center's training features onto two principal components and
# colour the points by their label.

for center, split in centers.items():
    X_train, y_train = split['X_train'], split['y_train']
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_train)
    first_component, second_component = X_pca[:, 0], X_pca[:, 1]
    plt.scatter(first_component, second_component, c=y_train, cmap='viridis')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - Cluster Visualization')
    plt.show()

In [None]:
# Aggregation Functions

def FedAvgKMeans(models):
    """Return a new model whose cluster centers are the mean of the local ones.

    ``cluster_centers_`` is averaged element-wise over every model in
    ``models``. The result is a deep copy of the first model with the averaged
    centers assigned, so none of the input models is modified.
    NOTE(review): centers are averaged by row position; this assumes cluster
    indices correspond across the local models - confirm.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAvgKMeans requires at least one local model")

    # Work on a copy: mutating the first local model in place would corrupt
    # that center's own model and alias it with the aggregated one.
    global_model = copy.deepcopy(local_models[0])
    global_model.cluster_centers_ = np.mean(
        [m.cluster_centers_ for m in local_models], axis=0
    )

    return global_model

def FedProxKMeans(models, learning_rate=0.01, regularization_strength=0.1):
    """Return a new KMeans model built by proximal-style updates.

    Starts from a deep copy of the first model and, for every other model,
    subtracts ``learning_rate * (local + regularization_strength * global)``
    from ``cluster_centers_``. The local centers play the role of the gradient.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedProx formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedProxKMeans requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    reference = local_models[0]
    global_model = copy.deepcopy(reference)

    for model in local_models:
        if model is reference:
            continue  # the starting point contributes no update of its own
        global_model.cluster_centers_ = global_model.cluster_centers_ - learning_rate * (
            model.cluster_centers_ + regularization_strength * global_model.cluster_centers_
        )

    return global_model

def FedAdamKMeans(models, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7, regularization_strength=0.1):
    """Return a new KMeans model built by Adam-style updates.

    Starts from a deep copy of the first model. Every model's
    ``cluster_centers_`` (the first one included) feeds the first and second
    moment estimates, with the local centers playing the role of the gradient.
    The bias-corrected Adam step is applied only for models other than the
    first. ``regularization_strength`` is accepted for signature compatibility
    but is not used by the update.
    NOTE(review): this is the notebook's heuristic update rule; confirm that it
    is the intended FedAdam formulation.

    The input models are never modified.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    local_models = list(models.values())
    if not local_models:
        raise ValueError("FedAdamKMeans requires at least one local model")

    # Copy the starting point so the first center's model stays untouched.
    reference = local_models[0]
    global_model = copy.deepcopy(reference)

    # Moment estimates for Adam
    m = np.zeros(reference.cluster_centers_.shape)
    v = np.zeros(reference.cluster_centers_.shape)

    for t, model in enumerate(local_models, start=1):
        gradient = model.cluster_centers_

        # Update moment estimates
        m = beta1 * m + (1 - beta1) * gradient
        v = beta2 * v + (1 - beta2) * (gradient ** 2)

        # Bias correction
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)

        if model is reference:
            continue  # the starting point only seeds the moment estimates

        global_model.cluster_centers_ = (
            global_model.cluster_centers_ - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        )

    return global_model

In [None]:
# Fit one local two-cluster KMeans model per center
kmeans_models = {}

for country, split in centers.items():
    X_train, y_train = split['X_train'], split['y_train']

    # The number of clusters can be adjusted here
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=1).fit(X_train)
    kmeans_models[country] = kmeans

In [None]:
# Applying Federated learning
# Snapshot the local models so the aggregators only see the per-center entries
temp_kmeans = dict(kmeans_models)

for name, aggregate in (
    ('FedAvg', FedAvgKMeans),
    ('FedProx', FedProxKMeans),
    ('FedAdam', FedAdamKMeans),
):
    kmeans_models[name] = aggregate(temp_kmeans)

In [None]:
validate(kmeans_models, centers)