In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pickle

In [2]:
# Import the data.
# NOTE: the names are easy to misread - `raw` holds the table read from
# features.txt, while `features` holds the table read from labels.txt (its
# 'target' column is used as the label further down).
raw = pd.read_csv('features.txt', index_col = 0)
features = pd.read_csv('labels.txt', index_col = 0)

### Data Exploration

In [3]:
def explore(df, show_stats=False):
    """
    Print and display a quick overview of a pandas DataFrame: its first two rows, the number of duplicated rows,
    its shape, the number of columns containing NaNs, the rows/columns where those NaNs occur, the `df.info()`
    report and, optionally, descriptive statistics.

    Relies on `display`, which is available as a builtin inside IPython/Jupyter sessions.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame to be explored.
    show_stats : bool, default False
        When True, also display `df.describe()` and, if the DataFrame has object-dtype columns,
        `df.describe(include='object')`.

    Returns:
    --------
    None
    """
    # Display the first two rows of the DataFrame
    display(df.head(2))

    # Calculate and print the number of duplicated rows in the DataFrame
    print("Number of duplicated rows: ", df.duplicated().sum())

    # Print the shape of the DataFrame
    print("Data shape", df.shape)

    # Identify any columns with NaNs and print the number of columns with NaNs
    cols_with_nans = df.columns[df.isna().any()].tolist()
    print('Number of columns with NaNs:', len(cols_with_nans))

    # If there are NaNs, display only the affected rows, restricted to the affected columns
    if cols_with_nans:
        print('Data with NaNs:')
        rows_with_nans = df[cols_with_nans].isna().any(axis=1)
        display(df.loc[rows_with_nans, cols_with_nans])

    # df.info() prints its report itself and returns None, so it must not be wrapped in display()
    # (doing so renders a stray "None" after the report).
    df.info()

    if show_stats:
        # Descriptive statistics with pandas' default column selection
        display(df.describe())

        # describe(include='object') raises when there are no object columns, so guard the call
        if (df.dtypes == object).any():
            display(df.describe(include='object'))


In [4]:
# Explore the table loaded from features.txt
explore(raw)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,57832,57846,57885,57887,57908,57929,57932,57935,57954,57969
SRR1146243,4.057059,2.805784,3.935262,3.866202,2.58552,2.244047,8.248893,5.426924,5.124292,4.481451,...,3.363534,4.917973,1.701393,0.283353,2.439566,2.496922,2.379835,2.538491,3.165511,1.947549
SRR1146244,3.318046,-2.362307,3.907833,3.333466,2.263786,2.973915,5.948596,4.329128,4.749798,4.002509,...,2.807221,4.533203,-1.493196,1.191666,2.593325,0.309551,1.735091,1.758127,4.482234,-0.255909


Number of duplicated rows:  0
Data shape (177, 15326)
Number of columns with NaNs: 11
Data with NaNs:


Unnamed: 0,286,287,288,289,290,291,292,294,295,296,297
SRR1146130,,,,,,,,,,,
SRR1146131,,,,,,,,,,,
SRR1146132,,,,,,,,,,,
SRR1146133,,,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
Index: 177 entries, SRR1146243 to SRR1146086
Columns: 15326 entries, 1 to 57969
dtypes: float64(15321), int64(5)
memory usage: 20.7+ MB


None

There are 4 rows with missing values, all in the same 11 columns. The dataset isn't big, so I will impute the missing values with each column's median.

In [5]:
# Count the samples per class to check whether the target is balanced
features['target'].value_counts()

lesional    94
normal      83
Name: target, dtype: int64

The target is roughly balanced (94 lesional vs. 83 normal), so accuracy is a reasonable metric.

### Data preprocessing

In [6]:
# Preprocess the target to be numerical (lesional -> 1, normal -> 0).
# Series.map with a plain dict turns every value that is missing from the dict
# into NaN, so re-running this cell on the already-encoded column would wipe
# the labels. Falling back to the existing value keeps the cell safe to re-run.
target_encoding = {'lesional': 1, 'normal': 0}
features['target'] = features['target'].map(lambda label: target_encoding.get(label, label))

In [7]:
def select_high_variance_features(df, threshold=0.05):
    """
    Selects features with high variance from a Pandas DataFrame.

    Only numeric columns are considered; non-numeric columns are ignored
    instead of making the variance computation fail.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to select features from.
    - threshold (float): A feature is kept only if its variance is strictly
      greater than this value.

    Returns:
    - list: The list of selected feature names, in the DataFrame's column order.
    """

    # Per-column variance (pandas default ddof), restricted to numeric columns
    variances = df.var(numeric_only=True)

    # Keep columns whose variance exceeds the threshold. A NaN variance
    # compares False and is therefore dropped as well.
    selected_columns = variances[variances > threshold].index.tolist()

    return selected_columns


In [8]:
# Join the feature table (`raw`) with the label table (`features`) on their shared index
raw_ = raw.join(features)

# Split into training and testing datasets:
# - test_size=0.2 puts 20% of the samples in the test split
# - random_state=0 makes the split reproducible
# - stratify keeps the class proportions equal in both splits; the target is
#   selected by name (not by position) so the split does not silently depend
#   on the column order produced by the join
df_train, df_test = train_test_split(
    raw_, test_size=0.2, random_state=0, stratify=raw_["target"]
)

# Separate the features and the target for the training and testing datasets
X_train = df_train.drop(columns=["target"])
X_test = df_test.drop(columns=["target"])
y_train = df_train["target"]
y_test = df_test["target"]


In [9]:
# Impute, filter and scale the features. Every statistic used here (medians,
# variances, scaler parameters) is computed on the training split only and
# then applied unchanged to the test split.
# NOTE(review): the last two lines rebind X_train / X_test to the scaler's
# output, so this cell should not be re-run without re-running the split cell
# first - confirm when editing.

# Compute the median of each feature in the train data
medians = X_train.median()

# Fill in missing values in train data using median of corresponding feature
filled_train_data = X_train.fillna(medians)

# Fill in missing values in test data using median of corresponding feature from train data
filled_test_data = X_test.fillna(medians)

# Keep only the features whose training-set variance exceeds the function's default threshold (0.05)
high_variance_features = select_high_variance_features(filled_train_data)
print('Num of features with high variance: ', len(high_variance_features))

X_train_high_variance = filled_train_data[high_variance_features]
X_test_high_variance = filled_test_data[high_variance_features]

# Fit the scaler on the training features and reuse it for the test features
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train_high_variance)
X_test = sc_X.transform(X_test_high_variance)

Num of features with high variance:  14845


In [10]:
def find_pca_components(data, variance_percentage):
    """
    Finds the number of PCA components required to retain a specified percentage of the explained variance.

    Parameters:
    - data (numpy.ndarray): The input data to fit the PCA on.
    - variance_percentage (float): The fraction of the explained variance to retain (e.g. 0.95).

    Returns:
    - int: The smallest number of leading components whose cumulative explained variance ratio is at least
      `variance_percentage`. If that level is never reached (for example because of floating-point rounding
      when asking for 1.0, or a value above 1), the total number of fitted components is returned.
    """

    # Create a PCA object with all components
    pca = PCA()

    # Fit the PCA object to the data
    pca.fit(data)

    # Calculate the cumulative explained variance ratio
    cum_var_ratio = np.cumsum(pca.explained_variance_ratio_)

    # Mark the components at which the cumulative ratio has reached the requested level
    reached = cum_var_ratio >= variance_percentage

    # np.argmax returns 0 for an all-False mask, which would wrongly report a single component,
    # so the "never reached" case is handled explicitly.
    if not reached.any():
        return len(cum_var_ratio)

    # Index of the first True entry, converted to a 1-based count and a plain Python int
    return int(np.argmax(reached)) + 1


In [11]:
# Specify the desired fraction of variance to be explained by the principal components
variance_percentage = 0.95

# Call the function find_pca_components to determine the number of principal components that explain the desired percentage of variance
# (it fits its own PCA on the training data; a second PCA is fitted below with the chosen size)
n_components = find_pca_components(X_train, variance_percentage)

# Print the number of components and the percentage of variance they explain
print(f'{n_components} components explain {variance_percentage*100} % of the variance')

# Instantiate a PCA object with the specified number of principal components
pca = PCA(n_components)

# Fit the PCA object to the training data and transform the data into the new feature space
# NOTE(review): X_train / X_test are rebound to the PCA output here, so this cell is not
# meant to be re-run on its own - confirm when editing.
X_train = pca.fit_transform(X_train)

# Transform the test data into the new feature space using the PCA object fitted on the training data
X_test = pca.transform(X_test)


72 components explain 95.0 % of the variance


In [12]:
# Wrap the PCA outputs in DataFrames that reuse the sample index of the imputed
# frames, then attach the matching target column through an index join.
df_train = pd.DataFrame(X_train, index = filled_train_data.index).join(y_train)
df_test = pd.DataFrame(X_test, index = filled_test_data.index).join(y_test)

In [13]:
# Persist the processed splits as pickle files in the working directory.

# Pickle the df_train dataframe
with open('df_train.pkl', 'wb') as f:
    pickle.dump(df_train, f)

# Pickle the df_test dataframe
with open('df_test.pkl', 'wb') as f:
    pickle.dump(df_test, f)

In [14]:
# Split the training data by class into 'df_lesional' and 'df_normal'
# (target was encoded earlier as 1 = lesional, 0 = normal)

df_lesional = df_train[df_train['target'] == 1]
df_normal = df_train[df_train['target'] == 0]

# Define the function to create meta-samples
def create_meta_samples(df, num_samples=5, random_state=None):
    """
    Draws synthetic "meta-samples" from independent per-feature normal distributions.

    Every column of `df` except 'target' is treated as a feature. Each feature of each
    meta-sample is drawn from a normal distribution whose mean and standard deviation are
    the mean and (pandas default) standard deviation of that feature in `df`.

    Parameters:
    - df (pandas.DataFrame): Source data; must contain a 'target' column.
    - num_samples (int): Number of meta-samples (rows) to generate.
    - random_state (None, int or numpy.random.Generator): Source of randomness. With None
      (the default) NumPy's global random state is used, so results can still be fixed with
      `np.random.seed`. Any other value is passed to `np.random.default_rng`.

    Returns:
    - pandas.DataFrame: `num_samples` rows with the feature columns of `df` followed by a
      'target' column holding the mean of `df['target']`, indexed 0..num_samples-1.
    """
    # Select the features by name rather than assuming the target is the last column
    feature_frame = df.drop(columns='target')

    # The distribution parameters do not change between draws, so compute them once
    means = feature_frame.mean()
    stds = feature_frame.std()

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Draw all meta-samples in one call: one row per meta-sample, one column per feature
    draws = rng.normal(means, stds, size=(num_samples, feature_frame.shape[1]))
    meta_samples = pd.DataFrame(draws, columns=feature_frame.columns)

    # Every meta-sample gets the mean of the source targets as its target value
    meta_samples['target'] = df['target'].mean()

    return meta_samples

# Create meta-samples for the normal data
df_normal_meta = create_meta_samples(df_normal, num_samples=5)

# Create meta-samples for the lesional data
df_lesional_meta = create_meta_samples(df_lesional, num_samples=5)

# Stack the two classes into one frame (normal rows first). No ignore_index is
# passed, so each half keeps its own row labels.
df_meta = pd.concat([df_normal_meta,df_lesional_meta])

# Pickle the df_meta dataframe to 'df_train_meta.pkl'
with open('df_train_meta.pkl', 'wb') as f:
    pickle.dump(df_meta, f)


### Model training and evaluation

The training and evaluation script contains three classifiers: LightGBM, SVM and KNN.

Two of them have internal regularization: 

* Support Vector Machines (SVMs): SVMs use regularization through the penalty parameter C that controls the trade-off between maximizing the margin and minimizing the classification error.

* Gradient Boosting: This is another ensemble learning method that combines multiple weak learners to form a strong learner. Gradient boosting has internal regularization through the shrinkage parameter that controls the step size of the gradient descent optimization.

The models are trained both with their default parameters and with Bayesian hyperparameter tuning. Finally, all models, for both the basic data and the meta-sample data, are compared in a table.


In [15]:
# Import the Classifier trainer and selector
import Model_selection_part_1_2 as cls

In [16]:
# Run the entry point of the imported training/evaluation module
# NOTE(review): presumably this reads the pickles written above and writes the
# comparison CSV loaded in the next cell - verify against the module.
cls.main()


100%|███████████████████████████████████████████████| 10/10 [00:09<00:00,  1.08trial/s, best loss: -0.9787234042553191]
100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 23.57trial/s, best loss: -1.0]
100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.11trial/s, best loss: -1.0]
100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 25.05trial/s, best loss: -0.5]
100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 47.32trial/s, best loss: -1.0]
100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 45.17trial/s, best loss: -1.0]


In [17]:
# Present the comparison table (file and variable names keep their original spelling)
comparisson = pd.read_csv('Comparisson.csv', index_col = 0)
comparisson

Unnamed: 0,Data_type,Classifier,Accuracy Default score,Accuracy Bayesian score,Hyperparameters
0,Basic,<class 'lightgbm.sklearn.LGBMClassifier'>,1.0,1.0,Default params
0,Basic,<class 'sklearn.svm._classes.SVC'>,1.0,1.0,Default params
0,Basic,<class 'sklearn.neighbors._classification.KNei...,1.0,1.0,Default params
0,Meta,<class 'lightgbm.sklearn.LGBMClassifier'>,0.472222,0.472222,Default params
0,Meta,<class 'sklearn.svm._classes.SVC'>,1.0,0.888889,Default params
0,Meta,<class 'sklearn.neighbors._classification.KNei...,0.916667,0.777778,Default params


### Discussion

#### Who performed better?

In contrast to the models trained on the basic data, the models trained on the meta-samples performed worse, except for the SVM classifier with the default hyperparameters.

There are a few possible reasons why the SVM classifier with default parameters may have performed better than the LGBM and KNN classifiers on the meta-samples:

* SVM is a powerful algorithm that can work well with high-dimensional data such as these meta-samples.
* The default parameters of the SVM classifier may have been well suited to the meta-samples being used.
* The LGBM and KNN classifiers may not have been optimized or tuned for the specific meta-samples being used.
* The LGBM or KNN classifiers may have been overfitting or underfitting the data.
* The size and quality of the meta-samples used for training the classifiers could also play a role in determining their performance.

#### What are the potential implications of training on meta-samples?

Training models on meta-samples can have potential privacy implications. When multiple smaller datasets are combined into a meta-sample, it can increase the risk of re-identification or other privacy breaches, particularly if the smaller datasets contain sensitive information.

#### Can you think of a way to use the meta-samples to train better-performing models?

Perhaps it is possible to use these samples in transfer learning if a large dataset with these genes exists. Another option is data augmentation, and it is always possible to try different models or a voting classifier.