# Handling duplicates

1. [Packages](#Pachages)
2. [Loading data](#Loading_data)
3. [Handling duplicates](#Handling_duplicates)

### Packages 

In [2]:
import os
#Change current directory
os.chdir ('/home/hamza_hajjini@MCC.DOMAIN/bcppmchurn')
#Check the current directory
os.getcwd()

'/home/hamza_hajjini@MCC.DOMAIN/bcppmchurn'

In [3]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import os
from datetime import datetime 
from scipy import stats
from pyspark.sql import SparkSession
from pyspark.sql import functions
import yaml 

from sklearn.model_selection import train_test_split 

from src.components.data_ingestion import get_feature_tables_from_impala, get_churn_target 
from src.components.data_structuring import structuringPipeline
from src.eda import utils
from src.eda.utils import columnsFamilies

In [4]:
#Get todays's date
date_time = datetime.today().strftime("%Y-%m-%d")
#Declare paths
train_dev_test_path = "data/train_dev_test"
data_path = "data/experiments_data"
data_samples_path = "data/data_samples"
artifacts_path = "artifacts/eda"

In [5]:
#Reload modules in case a change has occured
#import importlib
#from src.components import data_ingestion, data_structuring
#importlib.reload(data_ingestion)
#importlib.reload(data_structuring)

### Loading data

--------------------

In [31]:
#Loading data. Ensure that dn is a str 
data_date = "2024-10-11"
df_train = pd.read_csv(f"{train_dev_test_path}/{data_date}_df_train.csv", index_col = 0, dtype= {"dn": "string"}) 
df_dev = pd.read_csv(f"{train_dev_test_path}/{data_date}_df_dev.csv", index_col = 0, dtype= {"dn": "string"})
df_test = pd.read_csv(f"{train_dev_test_path}/{data_date}_df_test.csv", index_col = 0, dtype= {"dn": "string"})

'df_train["dn"] = df_train["dn"].astype("string")\ndf_dev["dn"] = df_dev["dn"].astype("string")\ndf_test["dn"] = df_test["dn"].astype("string")'

In [35]:
#Quick check
print (f"df_train shape :{df_train.shape}")
print (f"df_dev shape: {df_dev.shape}")
print (f"df_test shape: {df_test.shape}")
print (f"column dn's type is : {df_train.dn.dtype }")

df_train shape :(150000, 3582)
df_dev shape: (25000, 3582)
df_test shape: (10000, 3582)
column dn's type is : string


--------------

### Handling Duplicates 

Drop duplicated dns

In [63]:
df_train.drop ( df_train [df_train["dn"].duplicated()].index, inplace = True)

In [71]:
print (f"Number of duplicated dns : {df_train.dn.duplicated().sum()}") 

Number of duplicated dns : 0


In [36]:
df_train_without_dn = df_train[df_train.columns[1:]]
print (f"number of duplicated behaviors (exculding dn) is: {df_train_without_dn.duplicated().sum()}")

number of duplicated behaviors (exculding dn) is: 1763


In [37]:
#Show some duplicated rows
duplicated_rows = df_train_without_dn[df_train_without_dn.duplicated()]
duplicated_rows.head()

Unnamed: 0,gamme,churn_segment,churn_date,activation_bscs_date,id_date,complaints_complaints_complaint_status_abondon_duration_nb_1m,complaints_complaints_complaint_status_abondon_total_nb_1m,complaints_complaints_complaint_status_autre_duration_nb_1m,complaints_complaints_complaint_status_autre_total_nb_1m,complaints_complaints_complaint_status_clos_duration_nb_1m,...,voice_call_direction_oc_duration_value_1_3w_3_4w,voice_destination_type_international_duration_value_1_3w_3_4w,voice_destination_type_national_duration_value_1_3w_3_4w,voice_termination_type_offnet_duration_value_1_3w_3_4w,voice_termination_type_onnet_duration_value_1_3w_3_4w,voice_weekend_n_duration_value_1_3w_3_4w,voice_weekend_y_duration_value_1_3w_3_4w,voice_workingh_n_duration_value_1_3w_3_4w,voice_workingh_y_duration_value_1_3w_3_4w,churn
636882,Forfaits 49 dhs,non_churners,2024-09-27,,20240607,,,,,,...,,,,,,,,,,0
95812,Forfaits 99 dhs,non_churners,2024-09-27,,20240607,,,,,,...,,,,,,,,,,0
145859,Forfaits Hors 99 dhs,non_churners,2024-09-27,,20240607,,,,,,...,,,,,,,,,,0
219656,Forfaits 99 dhs,non_churners,2024-09-27,,20240607,,,,,,...,,,,,,,,,,0
677118,Forfaits 49 dhs,non_churners,2024-09-27,,20240607,,,,,,...,,,,,,,,,,0


Are these duplicated rows all null ? meaning are all their values missing values ?

In [38]:
#See if all duplicated rows contains only missing values
#Exclude churn columns from duplicated rows 
features_duplicated_rows = duplicated_rows [duplicated_rows.columns[5:-1]]
columnsToMissingValues_features_duplicated_rows =  pd.DataFrame({"columns": features_duplicated_rows.columns,
                                                                "perc_missing_values": (features_duplicated_rows.isna().sum()/len(features_duplicated_rows))*100})
columnsToMissingValues_features_duplicated_rows.reset_index(drop = True, inplace=True)
print ("Nbr of duplicated rows that acctually contains values :")
len (columnsToMissingValues_features_duplicated_rows[columnsToMissingValues_features_duplicated_rows["perc_missing_values"]<100 ])

Nbr of duplicated rows that acctually contains values :


1375