In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import matthews_corrcoef
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr

pd.set_option('future.no_silent_downcasting', True)

In [167]:
def introduce_missing_values(df, missing_rate = 0.1):
    df_missing = df.copy()
    n_missing = int(np.floor(missing_rate*df.size))
    for _ in range(n_missing):
        i = np.random.randint(0,df.shape[0])
        j = np.random.randint(0,df.shape[1])
        df_missing.iloc[i,j] = np.nan
    return df_missing
    

In [168]:
def introduce_outliers(df, columns, num_outliers=5, factor=5):
    for col in columns:
        outlier_indices = np.random.choice(df.index, size=num_outliers, replace=False)
        df.loc[outlier_indices, col] *= factor
    return df

In [169]:
def define_target(csv,target, learning_method='Supervised'):
    df = pd.read_csv(csv)
    ### Remove in prod
    df = introduce_missing_values(df)
    if learning_method == 'Supervised':
        df.dropna(subset=[target],inplace=True)
    
    features = df.drop(target, axis = 1)
    target_data = df[target]
    return df, features , target_data, target

In [170]:
df, features, target_data, target = define_target("./telco.csv", 'Churn')

In [171]:
def identify_numeric(df, inspect_columns =False):
    numeric_cols = []
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col].dropna()):
            numeric_cols.append(col)
            continue
        
        temp_conversion = pd.to_numeric(df[col], errors='coerce')
        numeric_count = temp_conversion.notna().sum()
        valid_count = df[col].notna().sum()
        numeric_ratio = numeric_count / valid_count 
        if valid_count > 0 and numeric_ratio > 0.95:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            numeric_cols.append(col)
            
    # df[numeric_cols] = df[numeric_cols].dropna().apply(
    # lambda x: x.astype('int64') if x.dropna().astype(float).apply(lambda v: v.is_integer()).all() else x.astype(float)
#)

    for col in numeric_cols:
        if df[col].dropna().astype(float).apply(lambda v: v.is_integer()).all():
            df[col] = df[col].astype('Int64')  # Keeps NaNs intact as NA-compatible type
        else:
            df[col] = df[col].astype(float)
        
    if inspect_columns == True:
        print(f'Numeric cols are:', numeric_cols)
        
    return numeric_cols

In [172]:
numeric_cols = identify_numeric(df,inspect_columns=True)

Numeric cols are: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [173]:
def identify_boolean(df):
    boolean_cols = []
    boolean_mappings = {}
    categorical_cols = []
    high_cardinality_cols = []
    
    for col in df.columns:
        unique_values = df[col].dropna().unique()
        normalized_values = [str(val).strip().lower() for val in unique_values]
        cardinality_ratio = len(unique_values) / len(df[col].dropna())
        num_na = df[col].isna().sum()
        
        if len(normalized_values) == 2 or (len(normalized_values) == 1 and num_na > 0 ):
            boolean_cols.append(col)
            sorted_values = sorted(unique_values, key=str)
            boolean_mappings[col] = {sorted_values[0]:0 , sorted_values[1]: 1}
            df[col] = df[col].dropna().replace(boolean_mappings[col]).astype(int)
            
            continue
        
        if len(unique_values) < 10:
            categorical_cols.append(col)
            continue

        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        
        try:
            df[col] = pd.to_datetime(df[col].dropna(),errors='raise')
        except:
            pass
        
        if cardinality_ratio > 0.1:
            high_cardinality_cols.append(col)
            continue
        
        
        categorical_cols.append(col)
            
        
    return df, boolean_cols, boolean_mappings,high_cardinality_cols,categorical_cols

In [174]:
df,boolean_cols, boolean_mappings, high_cardinality_cols ,categorical_cols = identify_boolean(df)

  df[col] = pd.to_datetime(df[col].dropna(),errors='raise')


In [175]:
boolean_cols


['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'PaperlessBilling',
 'Churn']

In [115]:
def cleanup_numeric_cols (numeric_list, boolean_list, categorical_list):
    numerical_cleaned_bool = list(set(numeric_list) - set(boolean_list))
    numerical_cleaned_all = list(set(numerical_cleaned_bool) - set(categorical_list))
    return numerical_cleaned_all

In [116]:
numerical_cols = cleanup_numeric_cols(numeric_cols, boolean_cols, categorical_cols)

In [117]:

def detect_outliers(df, numerical_cols,id_col):
    IQR_report = []
    Z_score_report = []
    
    
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1- 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        median_value = df[col].median()
        
        iqr_outliers = df.loc[(df[col] < lower_bound) | (df[col] > upper_bound),['customerID',col]].dropna()
        for _,row in iqr_outliers.iterrows():
            
            IQR_report.append({
                'id' : row[id_col],
                'column': col,
                'outlier_type':'IQR',
                'value':row[col],
                'median': median_value,
            })
        
        mean = df[col].mean()
        std = df[col].std()
        z_scores = (df[col]-mean)/std
        z_score_outliers = df.loc[(np.abs(z_scores)>3),['customerID',col]].dropna()
        
        for idx,row in z_score_outliers.iterrows():
            Z_score_report.append({
                'id':row[id_col],
                'column':col,
                'outlier_type':'Z-score',
                'value':row[col],
                'z_score': z_scores.loc[idx]
            })       
        plt.figure(figsize=(6,3))
        plt.boxplot(df[col].dropna(), orientation='horizontal', whis=True)
        plt.title(f'Box and Whisker Plot for {col}')
        plt.show()
        IQR_df = pd.DataFrame(IQR_report)
        Z_score_df = pd.DataFrame(Z_score_report)
        
    
    print('---------------------------IQR Report---------------------------')
    display(IQR_df.style)
    
    print('----------------------Z-score Report ----------------------------')
    display(Z_score_df.style)
    return IQR_df, Z_score_df


In [118]:
boolean_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'PaperlessBilling']

In [None]:
def binary_correlation(df,bool_cols,target):
    for col in bool_cols:
        

In [None]:
def imputeBool(df, bool_cols, impute_strategy='auto'):
    df_size = len(df)
    if impute_strategy == 'auto':
        for col in bool_cols:
            if df[col].isnull().mean()<0.05:
                df[col].fillna(df[col].mode()[0], inplace = True)
                print(f'{col} imputed with modal strategy')
            
            
            if df_size < 7000:  
            
    if impute_strategy == 'mode':
        for col in bool_cols:
            if df[col].isnull().sum() > 0:
                if impute_strategy =='mode':
                    df[col].fillna(df[col].mode()[0])
                    
    if impute_strategy == 'missing':
        

SyntaxError: incomplete input (1058195472.py, line 4)

customerID          0.093060
gender              0.094313
SeniorCitizen       0.089456
Partner             0.100423
Dependents          0.099013
tenure              0.094156
PhoneService        0.094156
MultipleLines       0.100580
InternetService     0.088830
OnlineSecurity      0.094783
OnlineBackup        0.098386
DeviceProtection    0.093373
TechSupport         0.097446
StreamingTV         0.096036
StreamingMovies     0.096350
Contract            0.099483
PaperlessBilling    0.102146
PaymentMethod       0.088673
MonthlyCharges      0.105436
TotalCharges        0.093373
Churn               0.000000
dtype: float64