<h1 style='font-size: 15px;  font-weight: 600'>1.0: Import Required Libraries</h1>

In [46]:
import pandas as pd  
import numpy as np 
import warnings 
import re

warnings.simplefilter("ignore")  
pd.set_option('display.float_format', lambda x: '%.2f' % x) 
print("......Libraries Loaded Successfully.........")

......Libraries Loaded Successfully.........


<h1 style='font-size: 15px;  font-weight: 600'>2.0: Import and Preprocessing Dataset</h1>

In [48]:
pd.set_option('display.max_columns', 10) 

filepath = "./Datasets/Lung Cancer Survey.csv"
df = pd.read_csv(filepath)
df.sample(10)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,...,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
112,Female,68,NO,NO,NO,...,YES,YES,NO,YES,YES
92,Male,52,YES,NO,NO,...,NO,NO,YES,YES,YES
291,Male,71,YES,YES,YES,...,YES,NO,YES,YES,YES
110,Male,58,NO,YES,YES,...,YES,YES,YES,YES,YES
128,Female,58,YES,YES,YES,...,YES,YES,YES,NO,YES
208,Male,67,NO,YES,YES,...,YES,NO,YES,NO,YES
253,Female,67,YES,YES,YES,...,YES,YES,YES,NO,YES
244,Female,64,YES,YES,YES,...,NO,YES,NO,YES,YES
3,Male,63,YES,YES,YES,...,NO,NO,YES,YES,NO
137,Male,67,NO,NO,NO,...,YES,YES,NO,YES,YES


<h1 style='font-size: 15px; font-weight: 600'>3.0: Dataset Informations/ Overview</h1>

In [49]:
df.shape

(309, 16)

In [50]:
def rename_column(text):
    text = text.title()                      
    text = re.sub(r'[^\w\s]', '_', text)
    return text

df.columns = df.columns.to_series().apply(rename_column)
for column in df.columns.tolist(): print(f"{'-'*25} {column}")

------------------------- Gender
------------------------- Age
------------------------- Smoking
------------------------- Yellow_Fingers
------------------------- Anxiety
------------------------- Peer_Pressure
------------------------- Chronic Disease
------------------------- Fatigue 
------------------------- Allergy 
------------------------- Wheezing
------------------------- Alcohol Consuming
------------------------- Coughing
------------------------- Shortness Of Breath
------------------------- Swallowing Difficulty
------------------------- Chest Pain
------------------------- Lung_Cancer


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Gender                 309 non-null    object
 1   Age                    309 non-null    int64 
 2   Smoking                309 non-null    object
 3   Yellow_Fingers         309 non-null    object
 4   Anxiety                309 non-null    object
 5   Peer_Pressure          309 non-null    object
 6   Chronic Disease        309 non-null    object
 7   Fatigue                309 non-null    object
 8   Allergy                309 non-null    object
 9   Wheezing               309 non-null    object
 10  Alcohol Consuming      309 non-null    object
 11  Coughing               309 non-null    object
 12  Shortness Of Breath    309 non-null    object
 13  Swallowing Difficulty  309 non-null    object
 14  Chest Pain             309 non-null    object
 15  Lung_Cancer            

<h4 style='font-size: 14px; color: blue; font-weight: 600'>3.2: Columns Summary</h4>

In [52]:
def column_summary(df):
    summary_data = []
    for col_name in df.columns:
        col_dtype = df[col_name].dtype
        num_of_nulls = df[col_name].isnull().sum()
        num_of_non_nulls = df[col_name].notnull().sum()
        num_of_distinct_values = df[col_name].nunique()
        
        if num_of_distinct_values <= 10:
            distinct_values_counts = df[col_name].value_counts().to_dict()
        else:
            top_10_values_counts = df[col_name].value_counts().head(10).to_dict()
            distinct_values_counts = {k: v for k, v in sorted(top_10_values_counts.items(), key=lambda item: item[1], reverse=True)}

        summary_data.append({
            'col_name': col_name,
            'col_dtype': col_dtype,
            'num_of_nulls': num_of_nulls,
            'num_of_non_nulls': num_of_non_nulls,
            'num_of_distinct_values': num_of_distinct_values,
            'distinct_values_counts': distinct_values_counts
        })
    
    summary_df = pd.DataFrame(summary_data)
    return summary_df

summary_df = column_summary(df)
display(summary_df)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,Gender,object,0,309,2,"{'Male': 162, 'Female': 147}"
1,Age,int64,0,309,39,"{64: 20, 63: 19, 56: 19, 62: 18, 60: 17, 61: 1..."
2,Smoking,object,0,309,2,"{'YES': 174, 'NO': 135}"
3,Yellow_Fingers,object,0,309,2,"{'YES': 176, 'NO': 133}"
4,Anxiety,object,0,309,2,"{'NO': 155, 'YES': 154}"
5,Peer_Pressure,object,0,309,2,"{'YES': 155, 'NO': 154}"
6,Chronic Disease,object,0,309,2,"{'YES': 156, 'NO': 153}"
7,Fatigue,object,0,309,2,"{'YES': 208, 'NO': 101}"
8,Allergy,object,0,309,2,"{'YES': 172, 'NO': 137}"
9,Wheezing,object,0,309,2,"{'YES': 172, 'NO': 137}"


<h4 style='font-size: 14px; color: blue;   font-weight: 600'>3.3: Exploring Invalid Entries Dtypes</h4>

Exploring invalid entries in data types involves identifying values that do not match the expected format or category within each column. This includes detecting inconsistencies such as numerical values in categorical fields, incorrect data formats, or unexpected symbols and typos. Invalid entries can lead to errors in analysis and model performance, making it essential to standardize data types and correct anomalies.

In [53]:
def simplify_dtype(dtype):
    if dtype in (int, float, np.number): return 'Numeric'
    elif np.issubdtype(dtype, np.datetime64): return 'Datetime'
    elif dtype == str: return 'String'
    elif dtype == type(None): return 'Missing'
    else: return 'Other'

def analyze_column_dtypes(df):
    all_dtypes = {'Numeric', 'Datetime', 'String', 'Missing', 'Other'}
    results = pd.DataFrame(index=df.columns, columns=list(all_dtypes), dtype=object).fillna('-')
    
    for column in df.columns:
        dtypes = df[column].apply(lambda x: simplify_dtype(type(x))).value_counts()
        percentages = (dtypes / len(df)) * 100
        for dtype, percent in percentages.items():
            if percent > 0:
                results.at[column, dtype] = f'{percent:.2f}%'  # Add % sign and format to 2 decimal places
            else:
                results.at[column, dtype] = '-'  # Add dash for 0%
    return results

results = analyze_column_dtypes(df)
display(results)

Unnamed: 0,Other,Numeric,Datetime,String,Missing
Gender,-,-,-,100.00%,-
Age,-,100.00%,-,-,-
Smoking,-,-,-,100.00%,-
Yellow_Fingers,-,-,-,100.00%,-
Anxiety,-,-,-,100.00%,-
Peer_Pressure,-,-,-,100.00%,-
Chronic Disease,-,-,-,100.00%,-
Fatigue,-,-,-,100.00%,-
Allergy,-,-,-,100.00%,-
Wheezing,-,-,-,100.00%,-


<h1 style='font-size: 25px; font-family: Colonna MT; font-weight: 600'>4.0: Statistic Description of The Datasets</h1>

Now, let's examine the descriptive statistics of the data using Pandas’ built-in functions. This step provides a quick summary of key statistical measures like unique, top and frequency, giving an initial overview of the dataset’s distribution and characteristics before applying more detailed custom analyses. This gives us a bird's-eye view of the data, helping us understand the general distribution and characteristics of the values.

In [45]:
summary_statistics = df.describe(include='object').T.reset_index()
summary_statistics

Unnamed: 0,index,count,unique,top,freq
0,Gender,309,2,Male,162
1,Smoking,309,2,YES,174
2,Yellow_Fingers,309,2,YES,176
3,Anxiety,309,2,NO,155
4,Peer_Pressure,309,2,YES,155
5,Chronic Disease,309,2,YES,156
6,Fatigue,309,2,YES,208
7,Allergy,309,2,YES,172
8,Wheezing,309,2,YES,172
9,Alcohol Consuming,309,2,YES,172


<h4 style='font-size: 18px; color: Blue; font-family: colonna mt; font-weight: 600'>6.4: Distributions of Categorical Variables</h4>

Now, let’s explore the counts and proportions of categorical variables, both individually and across different groups. Counts indicate how frequently each category appears, providing insight into the overall distribution. Proportions, on the other hand, show the relative frequency of each category compared to the total, helping to identify balance or imbalance. Examining these aspects across groups can reveal patterns and relationships that may be important for further analysis. This exploration is essential for understanding the structure and distribution of categorical variables.

In [34]:
def categorical_variables_distributions(df, categories):
    data = []
    for category in categories:
        counts = df[category].value_counts()
        proportions = df[category].value_counts(normalize=True)
        for value, count in counts.items():
            proportion = proportions[value]
            data.append({
                'Category': category,
                'Value': value,
                'Count': count,
                'Proportion': f"{proportion:.2%}" 
            })
    
    result_df = pd.DataFrame(data)
    return result_df


excluded_cols = []
Variables = df.drop(columns=excluded_cols).select_dtypes(include=['object']).columns
Results = categorical_variables_distributions(df, Variables)
Results.head(10)

Unnamed: 0,Category,Value,Count,Proportion
0,Gender,Male,162,52.43%
1,Gender,Female,147,47.57%
2,Smoking,YES,174,56.31%
3,Smoking,NO,135,43.69%
4,Yellow_Fingers,YES,176,56.96%
5,Yellow_Fingers,NO,133,43.04%
6,Anxiety,NO,155,50.16%
7,Anxiety,YES,154,49.84%
8,Peer_Pressure,YES,155,50.16%
9,Peer_Pressure,NO,154,49.84%


<h4 style='font-size: 18px; color: Blue; font-family: colonna mt; font-weight: 600'>6.4: Variable Against Variables Distributions</h4>

Imagine you’re analyzing a dataset where categories belong to different groups, and you want to see how they are distributed. The below function helps by grouping data based on a selected categorical variable and then counting how often each category appears within those groups. It also calculates proportions to show the relative frequency of each category, making it easier to spot patterns or imbalances. By structuring this information clearly, we gain a deeper understanding of how different categories interact across groups, which is essential for meaningful analysis.

In [36]:
def Distributions_of_Categorical_Variables_with_others(df, group_column, categories):
    data = []
    for category in categories:
        if category == group_column:
            continue  # Skip comparing the group_column with itself
        
        grouped = df.groupby(group_column)[category].value_counts(normalize=False)  # Counts
        grouped_proportions = df.groupby(group_column)[category].value_counts(normalize=True)  # Proportions
        
        for group, values in grouped.groupby(level=0):  # Level 0 is the group_column
            for (g, value), count in values.items():
                proportion = grouped_proportions[(group, value)]
                
                data.append({
                    'Main-Group': group_column,
                    'Sub-Group': group,
                    'Category': category,
                    'Value': value,
                    'Count': count,
                    'Proportion': f"{proportion:.2%}"  # Format proportion as percentage
                })
    result_df = pd.DataFrame(data)
    return result_df

categorical_variables = df.select_dtypes(include=['object']).columns
group_column = 'Lung_Cancer'
Results = Distributions_of_Categorical_Variables_with_others(df, group_column, categorical_variables)
Results.head(10)

Unnamed: 0,Main-Group,Sub-Group,Category,Value,Count,Proportion
0,Lung_Cancer,NO,Gender,Female,22,56.41%
1,Lung_Cancer,NO,Gender,Male,17,43.59%
2,Lung_Cancer,YES,Gender,Male,145,53.70%
3,Lung_Cancer,YES,Gender,Female,125,46.30%
4,Lung_Cancer,NO,Smoking,NO,20,51.28%
5,Lung_Cancer,NO,Smoking,YES,19,48.72%
6,Lung_Cancer,YES,Smoking,YES,155,57.41%
7,Lung_Cancer,YES,Smoking,NO,115,42.59%
8,Lung_Cancer,NO,Yellow_Fingers,NO,26,66.67%
9,Lung_Cancer,NO,Yellow_Fingers,YES,13,33.33%


<h1 style='font-size: 20px; color: Green; font-family: Candara; font-weight: 600'>6.0: Association Analysis (Realationship Between Categorical Variables)</h1>

In [54]:
def cramers_v(contingency_table):
    chi2 = stats.chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))

def association_analysis(df, col1, col2, alpha=0.05):
    contingency_table = pd.crosstab(df[col1], df[col2])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    cramers_v_value = cramers_v(contingency_table)
    interpretation = "Significant Association" if p < alpha else "Not Significant"
    
    results = pd.DataFrame({
        'Variable 1': [col1],
        'Variable 2': [col2],
        'Chi-Square': [chi2],
        'p-value': [p],
        'Degrees of Freedom': [dof],
        'Cramér’s V': [cramers_v_value],
        'Interpretation': [interpretation]
    })
    return results


def analyze_multiple_pairs(df, categorical_vars, alpha=0.05):
    results_list = []
    for i, var1 in enumerate(categorical_vars):
        for var2 in categorical_vars[i+1:]:
            results_list.append(association_analysis(df, var1, var2, alpha))
    
    return pd.concat(results_list, ignore_index=True)

categorical_vars = df.select_dtypes(include=['object']).columns
results_df = analyze_multiple_pairs(df, categorical_vars)
results_df = results_df[results_df['Variable 1'] == 'Gender']
results_df

Unnamed: 0,Variable 1,Variable 2,Chi-Square,p-value,Degrees of Freedom,Cramér’s V,Interpretation
0,Gender,Smoking,0.27,0.6,1,0.03,Not Significant
1,Gender,Yellow_Fingers,13.17,0.0,1,0.21,Significant Association
2,Gender,Anxiety,6.55,0.01,1,0.15,Significant Association
3,Gender,Peer_Pressure,22.37,0.0,1,0.27,Significant Association
4,Gender,Chronic Disease,12.13,0.0,1,0.2,Significant Association
5,Gender,Fatigue,1.82,0.18,1,0.08,Not Significant
6,Gender,Allergy,6.74,0.01,1,0.15,Significant Association
7,Gender,Wheezing,5.61,0.02,1,0.13,Significant Association
8,Gender,Alcohol Consuming,61.95,0.0,1,0.45,Significant Association
9,Gender,Coughing,4.96,0.03,1,0.13,Significant Association



<h4 style='font-size: 18px; color: blue; font-family: Candara; font-weight: 600'>8.5. Cramér's V and Phi coefficient.</h4>

Cramér's V measures the strength of association between two categorical variables.
    It ranges from 0 (no association) to 1 (perfect association). It is an extension
    of the Phi coefficient for tables larger than 2x2. The value is adjusted based on
    the size of the contingency table to provide a normalized measure of association.

The Phi coefficient is a measure of association between two binary (2x2) categorical variables.
    It is similar to the Pearson correlation coefficient but is used for categorical data.
    Like Cramér's V, it ranges from 0 (no association) to 1 (perfect association), but it is
    specifically designed for 2x2 contingency tables.

In [40]:
from itertools import combinations
from typing import List, Dict, Union, Optional
from dataclasses import dataclass
from scipy import stats

@dataclass
class Effect_size_results:
    variable1: str
    variable2: str
    cramers_v: float
    phi_coefficient: Optional[float]
    interpretation: str

class EffectSizeAnalyzer:
    EFFECT_SIZE_THRESHOLDS = {
        'Negligible': 0.1,
        'Small': 0.3,
        'Medium': 0.5,
        'Large': float('inf')
    }
    
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.categorical_vars = self._get_categorical_variables()
        
    def _get_categorical_variables(self) -> List[str]:
        return self.df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    @staticmethod
    def _compute_cramers_v(contingency_table: pd.DataFrame) -> float:
        chi2 = stats.chi2_contingency(contingency_table)[0]
        n = contingency_table.sum().sum()
        min_dim = min(contingency_table.shape) - 1
        return np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0.0
    
    @staticmethod
    def _compute_phi_coefficient(contingency_table: pd.DataFrame) -> Optional[float]:
        if contingency_table.shape == (2, 2):
            chi2 = stats.chi2_contingency(contingency_table, correction=False)[0]
            n = contingency_table.sum().sum()
            return np.sqrt(chi2 / n)
        return None
    
    @classmethod
    def _interpret_effect_size(cls, value: float) -> str:
        for interpretation, threshold in cls.EFFECT_SIZE_THRESHOLDS.items():
            if value < threshold:
                return interpretation
        return 'Large'
    
    def compute_pairwise_effect_sizes(self, filter_var: Optional[str] = None, sort_by: str = 'cramers_v') -> pd.DataFrame:
        results = []
        
        for var1, var2 in combinations(self.categorical_vars, 2):
            if filter_var and filter_var not in (var1, var2):
                continue

            contingency_table = pd.crosstab(self.df[var1], self.df[var2])
            cramers_v = self._compute_cramers_v(contingency_table)
            phi = self._compute_phi_coefficient(contingency_table)
            
            results.append(Effect_size_results(
                variable1=var1,
                variable2=var2,
                cramers_v=cramers_v,
                phi_coefficient=phi,
                interpretation=self._interpret_effect_size(cramers_v))
            )
        
        result_df = pd.DataFrame([vars(r) for r in results])
        
        if sort_by in result_df.columns:
            result_df = result_df.sort_values(by=sort_by, ascending=False)
            
        return result_df.reset_index(drop=True)


analyzer = EffectSizeAnalyzer(df)
results = analyzer.compute_pairwise_effect_sizes(filter_var='Lung_Cancer')
results

Unnamed: 0,variable1,variable2,cramers_v,phi_coefficient,interpretation
0,Allergy,Lung_Cancer,0.32,0.33,Medium
1,Alcohol Consuming,Lung_Cancer,0.28,0.29,Small
2,Swallowing Difficulty,Lung_Cancer,0.25,0.26,Small
3,Wheezing,Lung_Cancer,0.24,0.25,Small
4,Coughing,Lung_Cancer,0.24,0.25,Small
5,Chest Pain,Lung_Cancer,0.18,0.19,Small
6,Peer_Pressure,Lung_Cancer,0.18,0.19,Small
7,Yellow_Fingers,Lung_Cancer,0.17,0.18,Small
8,Fatigue,Lung_Cancer,0.14,0.15,Small
9,Anxiety,Lung_Cancer,0.14,0.14,Small
