In [10]:
from typing import List, Dict, Union, Optional
from itertools import combinations
from dataclasses import dataclass
from scipy import stats
import pandas as pd  
import numpy as np 
import warnings 
import re

# Render every float in DataFrame output with exactly two decimal places.
two_decimals = '{:.2f}'.format
pd.set_option('display.float_format', two_decimals)

# Visible confirmation that the import cell ran to completion.
print("......Libraries Loaded Successfully.........")

......Libraries Loaded Successfully.........


In [6]:
# Cap the number of columns pandas renders so wide frames stay readable.
pd.set_option('display.max_columns', 10)

# Load the raw survey responses; the path is relative to the working directory.
filepath = "./Datasets/Lung Cancer Survey.csv"
df = pd.read_csv(filepath)

# Preview a random handful of rows. The seed is fixed so the same rows are
# shown on every run, and the sample size is capped so a frame with fewer
# than 10 rows does not raise.
df.sample(min(10, len(df)), random_state=42)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,...,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
91,Female,71,NO,NO,NO,...,YES,YES,NO,YES,YES
129,Female,63,NO,NO,NO,...,NO,YES,NO,NO,NO
44,Female,70,YES,YES,NO,...,YES,NO,YES,NO,YES
32,Male,56,YES,YES,YES,...,NO,YES,YES,NO,NO
302,Female,65,YES,YES,YES,...,YES,YES,YES,NO,YES
28,Female,53,YES,YES,YES,...,NO,YES,YES,YES,YES
206,Female,71,YES,YES,YES,...,YES,YES,NO,YES,YES
237,Female,54,YES,YES,YES,...,YES,NO,YES,YES,YES
93,Female,59,NO,YES,YES,...,YES,YES,YES,NO,YES
225,Male,59,YES,NO,NO,...,YES,YES,YES,YES,YES


In [7]:
def rename_column(text):
    """Normalise a raw column header into a tidy, title-cased name.

    Leading and trailing whitespace is removed first, so headers that carry
    stray blanks (for example ``"FATIGUE "``) do not end up as column names
    with invisible trailing spaces. Every remaining character that is neither
    a word character nor whitespace is replaced by an underscore, and the
    result is title-cased. Interior spaces and existing underscores are kept.

    Parameters
    ----------
    text : str
        Raw column header.

    Returns
    -------
    str
        The cleaned header, e.g. ``"YELLOW_FINGERS"`` -> ``"Yellow_Fingers"``.
    """
    # Strip before substituting so surrounding blanks are dropped, not kept.
    text = text.strip()
    text = re.sub(r'[^\w\s]', '_', text)
    return text.title()

# Pass every header through rename_column, then list the resulting names.
df.columns = df.columns.map(rename_column)
for column in df.columns:
    print(f"{'-'*25} {column}")

------------------------- Gender
------------------------- Age
------------------------- Smoking
------------------------- Yellow_Fingers
------------------------- Anxiety
------------------------- Peer_Pressure
------------------------- Chronic Disease
------------------------- Fatigue 
------------------------- Allergy 
------------------------- Wheezing
------------------------- Alcohol Consuming
------------------------- Coughing
------------------------- Shortness Of Breath
------------------------- Swallowing Difficulty
------------------------- Chest Pain
------------------------- Lung_Cancer


<h4 style='font-size: 14px; color: blue; font-weight: 600'>3.2: Columns Profiling</h4>

In [8]:
def column_summary(df):
    """Profile every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are profiled.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``col_name``,
        ``col_dtype``, ``num_of_nulls``, ``num_of_non_nulls``,
        ``num_of_distinct_values`` and ``distinct_values_counts``. The last
        field is a ``{value: count}`` dict holding at most the ten most
        frequent values, most frequent first.
    """
    def profile(name):
        # Build the summary record for a single column.
        series = df[name]
        # value_counts() is already ordered by descending frequency, so its
        # first ten entries are the ten most frequent values; a column with
        # ten or fewer distinct values is therefore returned in full.
        frequencies = series.value_counts().head(10).to_dict()
        return {
            'col_name': name,
            'col_dtype': series.dtype,
            'num_of_nulls': series.isnull().sum(),
            'num_of_non_nulls': series.notnull().sum(),
            'num_of_distinct_values': series.nunique(),
            'distinct_values_counts': frequencies,
        }

    return pd.DataFrame([profile(name) for name in df.columns])

# Profile every column of the loaded frame and render the resulting table.
summary_df = column_summary(df)
display(summary_df)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,Gender,object,0,309,2,"{'Male': 162, 'Female': 147}"
1,Age,int64,0,309,39,"{64: 20, 63: 19, 56: 19, 62: 18, 60: 17, 61: 1..."
2,Smoking,object,0,309,2,"{'YES': 174, 'NO': 135}"
3,Yellow_Fingers,object,0,309,2,"{'YES': 176, 'NO': 133}"
4,Anxiety,object,0,309,2,"{'NO': 155, 'YES': 154}"
5,Peer_Pressure,object,0,309,2,"{'YES': 155, 'NO': 154}"
6,Chronic Disease,object,0,309,2,"{'YES': 156, 'NO': 153}"
7,Fatigue,object,0,309,2,"{'YES': 208, 'NO': 101}"
8,Allergy,object,0,309,2,"{'YES': 172, 'NO': 137}"
9,Wheezing,object,0,309,2,"{'YES': 172, 'NO': 137}"


<h4 style='font-size: 14px; color: blue; font-weight: 600'>3.3: Association Analysis</h4>

In [5]:
def cramers_v(contingency_table):
    """Compute Cramér's V, an effect size for a chi-square test of independence.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where ``chi2`` is the
    statistic returned by ``scipy.stats.chi2_contingency`` and ``n`` is the
    total number of observations in the table.

    Parameters
    ----------
    contingency_table : pandas.DataFrame
        Two-way table of observed counts (e.g. the result of ``pd.crosstab``).

    Returns
    -------
    float
        Cramér's V, or ``nan`` when it is undefined because the table has a
        single row or a single column (the denominator is then zero).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, which apply Yates'
    continuity correction to tables with one degree of freedom (2x2), so the
    value for 2x2 tables is based on the corrected statistic.
    """
    chi2 = stats.chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape

    denominator = n * (min(r, k) - 1)
    # A single-row or single-column table carries no information about
    # association; return NaN explicitly rather than dividing 0 by 0 and
    # emitting a RuntimeWarning.
    if denominator == 0:
        return np.nan
    return np.sqrt(chi2 / denominator)

def association_analysis(df, col1, col2, alpha=0.05):
    """Run a chi-square test of independence between two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the two columns to cross-tabulate.
    alpha : float, default 0.05
        Significance level; the pair is labelled significant when the
        p-value is strictly below it.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the variable names, chi-square statistic,
        p-value, degrees of freedom, Cramér's V and a textual interpretation.
    """
    observed = pd.crosstab(df[col1], df[col2])
    statistic, p_value, dof, _expected = stats.chi2_contingency(observed)
    strength = cramers_v(observed)

    if p_value < alpha:
        verdict = "Significant Association"
    else:
        verdict = "Not Significant"

    row = {
        'Variable 1': col1,
        'Variable 2': col2,
        'Chi-Square': statistic,
        'p-value': p_value,
        'Degrees of Freedom': dof,
        'Cramér’s V': strength,
        'Interpretation': verdict,
    }
    # Wrap each value in a one-element list so the frame has exactly one row.
    return pd.DataFrame({label: [value] for label, value in row.items()})


def analyze_multiple_pairs(df, categorical_vars, alpha=0.05):
    """Run ``association_analysis`` on every unordered pair of variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the listed columns.
    categorical_vars : sequence of str
        Column names to pair up; each pair is tested once, with the earlier
        name as 'Variable 1'.
    alpha : float, default 0.05
        Significance level forwarded to ``association_analysis``.

    Returns
    -------
    pandas.DataFrame
        The per-pair result rows stacked under a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when fewer than two variables are supplied,
        because there is then no pair to analyse.
    """
    pair_frames = [
        association_analysis(df, first, second, alpha)
        for first, second in combinations(categorical_vars, 2)
    ]
    return pd.concat(pair_frames, ignore_index=True)

# Test every pair of object-typed columns for association, then keep only
# the rows in which Gender is the first variable of the pair.
categorical_vars = df.select_dtypes(include=['object']).columns
results_df = (
    analyze_multiple_pairs(df, categorical_vars)
    .loc[lambda pairs: pairs['Variable 1'] == 'Gender']
)
results_df

Unnamed: 0,Variable 1,Variable 2,Chi-Square,p-value,Degrees of Freedom,Cramér’s V,Interpretation
0,Gender,Smoking,0.273383,0.6010715,1,0.029744,Not Significant
1,Gender,Yellow_Fingers,13.165694,0.0002851211,1,0.206416,Significant Association
2,Gender,Anxiety,6.554782,0.01046027,1,0.145646,Significant Association
3,Gender,Peer_Pressure,22.373541,2.244449e-06,1,0.269084,Significant Association
4,Gender,Chronic Disease,12.129362,0.0004963411,1,0.198125,Significant Association
5,Gender,Fatigue,1.815584,0.1778398,1,0.076653,Not Significant
6,Gender,Allergy,6.743558,0.009408678,1,0.147729,Significant Association
7,Gender,Wheezing,5.605245,0.01790679,1,0.134685,Significant Association
8,Gender,Alcohol Consuming,61.947147,3.528006e-15,1,0.447746,Significant Association
9,Gender,Coughing,4.963429,0.02588882,1,0.126739,Significant Association


---

This analysis was performed by **Jabulente**, a passionate and dedicated data scientist with a strong commitment to using data to drive meaningful insights and solutions. For inquiries, collaborations, or further discussions, please feel free to reach out via the links below.  

    
<div align="center">  
    
[![GitHub](https://img.shields.io/badge/GitHub-Jabulente-black?logo=github)](https://github.com/Jabulente)  [![LinkedIn](https://img.shields.io/badge/LinkedIn-Jabulente-blue?logo=linkedin)](https://linkedin.com/in/jabulente-208019349)  [![Email](https://img.shields.io/badge/Email-jabulente@hotmail.com-red?logo=gmail)](mailto:Jabulente@hotmail.com)  

</div>
