# Example of DigCNV

In [1]:
from pathlib import Path

import pandas as pd

from DigCNV import dataVerif, dataPreparation, digCnvModel, DigCnvPreProcessing

In [2]:
# Load the CNVs that will be classified (tab-separated example file).
cnvs = pd.read_csv('DigCNV/data/example_cnvs.tsv', sep='\t')

# Columns the model requires as its input dimensions.
model_dimensions = [
    'WF',
    'Score_SNP',
    'DENSITY',
    'CallRate',
    'overlapCNV_SegDup',
    'TwoAlgs',
    'Nb_Probe_tech',
]

# Name of the column holding the true classification used for training.
true_class_name = 'SnipPeep_Ok'

In [14]:
# Reference only (intentionally left commented out): how the example input files
# were derived from the full table. The code renames the coordinate columns,
# builds a SampleID from the row index, then writes example_cnvs.tsv (selected
# CNV columns) and example_callrates.tsv (one SampleID/CallRate pair per sample).
# NOTE(review): the recorded output of this cell shows a pandas
# SettingWithCopyWarning raised by the in-place drop_duplicates on a column
# selection; if this is ever re-enabled, `cnvs[[...]].drop_duplicates()` avoids it.
# cnvs = pd.read_csv('DigCNV/data/UKBB_clean_for_DigCNV.tsv', sep='\t')
# cnvs.rename(columns={'LociStart':'START',
#                     'LociStop':'STOP',
#                     'Chr':'CHR'}, inplace=True)
# print(cnvs.columns.tolist())
# cnvs["SampleID"] = ["IID_" + str(x) for x in cnvs.index.tolist()]

# cnvs_clean = cnvs[['SampleID','START', 'STOP', 'CHR', 'SCORE', 'SNP', 'TwoAlgs', 'overlapCNV_SegDup', 'overlapCNV_Centromere', 'LRR_mean', 'LRR_SD', 'BAF_mean', 'WF', 'GCWF', 'SnipPeep_Ok']]
# cnvs_clean.to_csv('./DigCNV/data/example_cnvs.tsv', sep='\t', index=False)
# callrates = cnvs[['SampleID', 'CallRate']]
# callrates.drop_duplicates(inplace=True)
# callrates.to_csv('./DigCNV/data/example_callrates.tsv', sep='\t', index=False)

['Unnamed: 0', 'IID', 'SampleID', 'CHR', 'START', 'STOP', 'Type', 'SCORE', 'SNP', 'SIZE', 'numbAlgos', 'Algos', 'ThreeAlgs', 'TwoAlgs', 'OneAlg', 'CallRate', 'overlapCNV_SegDup', 'overlapCNV_Centromere', 'overlapRegion_Centromere', 'LRR_mean', 'LRR_median', 'LRR_SD', 'BAF_mean', 'BAF_median', 'BAF_SD', 'BAF_DRIFT', 'WF', 'GCWF', 'TYPE', 'Phase', 'Final_observation', 'Commentaire_final', 'Tech', 'Batch', 'Type.1', 'SnipPeep_Ok', 'DENSITY', 'Score_SNP', 'Nb_Probe_tech']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  callrates.drop_duplicates(inplace=True)


## Prepare dataset for DigCNV
### Check data structure before adding other useful columns

In [3]:
# Validate the raw table before any feature is added. post_data_preparation=False
# selects the checks that apply to the input as loaded; the recorded log confirms
# that all mandatory columns were found.
dataVerif.checkIfMandatoryColumnsExist(cnvs, post_data_preparation=False)
# Presumably validates the dtypes/formats of those columns — confirm against the DigCNV docs.
dataVerif.checkColumnsformats(cnvs, post_data_preparation=False)

2022-08-29 14:56:48,012 :: checkIfMandatoryColumnsExist	- All mandatory columns exist in the given dataframe


### Adding new features coming from derived data or other datasets

In [4]:
# Derived columns: the recorded log reports DENSITY and Score_SNP being created.
cnvs = dataPreparation.addDerivedFeatures(cnvs)
# cnvs = dataPreparation.addChromosomicAnnotation(cnvs)

# Per-sample call rate read from a separate file (presumably matched on SampleID).
# NOTE(review): the commented-out export cell in this notebook writes
# 'example_callrates.tsv', whereas this call reads 'callrates.tsv' — confirm
# which file actually ships with the example data.
cnvs = dataPreparation.addCallRateToDataset(
    cnvs,
    call_rate_path='./DigCNV/data/callrates.tsv',
    callrate_colname='CallRate',
    individual_colname='SampleID',
)

# Number of probes of the technology; per the recorded log it is obtained by
# counting the lines of the PFB file.
cnvs = dataPreparation.addNbProbeByTech(
    cnvs,
    pfb_file_path='./DigCNV/data/UKBB_PFB.pfb',
)

# TwoAlgs handling; the recorded log says the feature is kept in percentage format.
cnvs = dataPreparation.transformTwoAlgsFeatures(cnvs)


2022-08-29 14:56:48,218 :: addDerivedFeatures	- Derived features (DENSITY and Score_SNP columns) created
2022-08-29 14:56:48,270 :: addCallRateToDataset	- CallRate added to dataset
2022-08-29 14:56:48,662 :: addNbProbeByTech	- Number of probes in technology added after counting number of lines in pfb file
2022-08-29 14:56:48,664 :: addNbProbeByTech	- Number of probes in technology added
2022-08-29 14:56:48,685 :: transformTwoAlgsFeatures	- Keep TwoAlgs function into percentage format
2022-08-29 14:56:48,688 :: checkIfMandatoryColumnsExist	- All mandatory columns exist in the given dataframe


### Checking data before prediction

In [None]:
# Re-run the structural checks, this time with post_data_preparation=True so the
# columns added during preparation are included.
dataVerif.checkIfMandatoryColumnsExist(cnvs, post_data_preparation=True)
dataVerif.checkColumnsformats(cnvs, post_data_preparation=True)

# Missing-value report restricted to the model dimensions.
dataVerif.computeNaPercentage(cnvs, dimensions=model_dimensions)

# The heat map is saved under ./outputs. Create the folder first so the cell
# also works on a fresh checkout where it does not exist yet (no-op otherwise).
Path('./outputs').mkdir(parents=True, exist_ok=True)
dataVerif.plotCorrelationHeatMap(cnvs, list_dim=model_dimensions, output_path='./outputs/correlation.png')

## Run model with pretrained model

In [None]:
# Instantiate the model wrapper and load the pre-trained classifier from disk.
# NOTE(review): the model file is a .pkl, presumably unpickled on load — only
# open model files that come from a trusted source.
model = digCnvModel.DigCnvModel()
model.openPreTrainedDigCnvModel(model_path='./DigCNV/data/DigCnvModel_Trained_Mega_Spark_Ukbb.pkl')
# Classify the prepared CNVs; the result is read below through its DigCNVpred column.
predicted_cnvs = model.predictCnvClasses(cnvs)
# Quick sanity check: table dimensions, then the count of CNVs per predicted class.
print(predicted_cnvs.shape)
print(predicted_cnvs.DigCNVpred.value_counts())

## Train model with given dataset

### Prepare datasets for training and testing

In [None]:
# Filter the table on the model dimensions; `removed` receives the second return
# value (presumably the rows discarded for containing NA — confirm).
cnvs, removed = DigCnvPreProcessing.removeLinesWithNA(cnvs, dimensions=model_dimensions)
# Split into training and testing sets.
# NOTE(review): the label column name is passed through a keyword called
# `X_dimension` — confirm this is the intended parameter for the true class.
X_train, y_train, X_test, y_test = DigCnvPreProcessing.createTrainingTestingDatasets(cnvs, X_dimension=true_class_name)
# Rebalance the training classes.
# NOTE(review): 17, 0.4 and 0.5 are passed positionally and their meaning is not
# visible here — check the function signature and consider keyword arguments.
X_train, y_train = DigCnvPreProcessing.uniformizeClassesSizes(X_train, y_train, 17, 0.4, 0.5)

### Tuning hyperparameters used in the model (Optional, takes a lot of time)

### Training DigCNV model with given CNVs

In [None]:
# Build a fresh model wrapper (independent of any pre-trained one), create its
# classifier, then fit it on the training split (X_train / y_train).
dig_cnv = digCnvModel.DigCnvModel()
dig_cnv.createDigCnvClassifier()
dig_cnv.trainDigCnvModel(training_data=X_train, training_cat=y_train)

### Evaluate model power + save model

In [None]:
# Score the trained model on the held-out testing split.
# NOTE(review): the keyword is spelled `expected_valeues`; this cell has no
# recorded execution, so confirm it matches the parameter name declared by
# evaluateCnvClassification before relying on it.
dig_cnv.evaluateCnvClassification(testing_df=X_test, expected_valeues=y_test)

# Persist the trained model. Create ./outputs first so saving also works on a
# fresh checkout where the folder does not exist yet (no-op otherwise).
Path('./outputs').mkdir(parents=True, exist_ok=True)
dig_cnv.saveDigCnvModelToPkl(output_path="./outputs/trained_model.pkl")