In [None]:
print("Jay Mahakal")

In [None]:
! pip install pandas matplotlib seaborn scipy numpy statsmodels scikit-learn prince plotly networkx matplotlib_venn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2_contingency
import numpy as np
import prince
import networkx as nx
from collections import Counter
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [None]:
data_01 = pd.read_csv("Urgent - Responses on Survey on AI Use and Adoption Shared.csv",header=0,encoding="cp1252")

In [None]:
data_01.sample(5)

# Assess the Association Between Entity Type and AI Adoption Status

In [None]:
df = data_01[["Type of Entity", "What is the current status of AI adoption in your organization?"]].copy()
df.columns = ["EntityType", "AIStatus"]

In [None]:
df.isna().sum()

Frequency Table

In [None]:
crosstab = pd.crosstab(df["EntityType"], df["AIStatus"])
print("Contingency Table:\n", crosstab)

## Cramer's V Calculation

In [None]:
n = crosstab.sum().sum()
phi2 = chi2 / n
r, k = crosstab.shape
cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
print(f"Cramer's V: {cramers_v:.3f}")

## Heatmap of Raw Counts

In [None]:
fig = px.imshow(
    crosstab,
    text_auto=True,  
    color_continuous_scale='Blues',
    labels=dict(x="AI Adoption Status", y="Entity Type", color="Count"),
    title="Heatmap: Entity Type vs AI Adoption Status",
    aspect="auto" 
)
fig.update_layout(
    xaxis_title="AI Adoption Status",
    yaxis_title="Entity Type",
    font=dict(size=14),
    title_font=dict(size=18),
    margin=dict(l=80, r=80, t=100, b=80)
)

fig.update_coloraxes(colorbar=dict(title="Count"))
fig.update_traces(hovertemplate="Entity Type: %{y}<br>AI Adoption Status: %{x}<br>Count: %{z}<extra></extra>")
fig.show(renderer="browser")

## Count Bar Plot

In [None]:
df_01 = crosstab.reset_index().melt(id_vars=crosstab.index.name, var_name='AI Adoption Status', value_name='Count')

fig = px.bar(
    df_01,
    x='EntityType',
    y='Count',
    color='AI Adoption Status',
    barmode='group', 
    title='AI Adoption by Entity Type',
    labels={'Count': 'Count', 'Entity Type': 'Entity Type', 'AI Adoption Status': 'AI Adoption Status'},
    height=500,
    width=900,
)

fig.update_layout(
    xaxis_tickangle=-45,
    font=dict(size=14),
    title_font=dict(size=18),
    margin=dict(l=80, r=50, t=80, b=120)
)
fig.show(renderer="browser")

## Proportional Bar Plot

In [None]:
proportions = crosstab.div(crosstab.sum(axis=1), axis=0)
df_prop = proportions.reset_index().melt(id_vars=crosstab.index.name or 'Entity Type',
                                        var_name='AI Adoption Status',
                                        value_name='Proportion')

fig = px.bar(
    df_prop,
    x='EntityType',
    y='Proportion',
    color='AI Adoption Status',
    title='Proportions of AI Adoption by Entity Type',
    labels={'Proportion': 'Proportion', 'Entity Type': 'Entity Type', 'AI Adoption Status': 'AI Adoption Status'},
    barmode='stack',
    color_continuous_scale=px.colors.sequential.Viridis,
    height=500,
    width=900
)

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis=dict(tickformat=".0%"), 
    font=dict(size=14),
    title_font=dict(size=18),
    margin=dict(l=80, r=50, t=80, b=120)
)
fig.show(renderer="browser")

##  Mosaic Plot

In [None]:
fig = px.treemap(
    df,
    path=['EntityType', 'AIStatus'],  
    title="Treemap: Entity Type vs AI Adoption Status",
    values=None,  
    color='AIStatus',  
    color_discrete_sequence=px.colors.qualitative.Pastel
)

fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    title_font=dict(size=18)
)
fig.show(renderer="browser")

## Correspondence Analysis

In [None]:
ca = prince.CA(n_components=2, random_state=42).fit(crosstab)
row_coords = ca.row_coordinates(crosstab)
col_coords = ca.column_coordinates(crosstab)

## Visualize Correspondence Analysis

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=row_coords.iloc[:, 0],
    y=row_coords.iloc[:, 1],
    mode='markers+text',
    name='Entity Type',
    text=row_coords.index,
    textposition='top center',
    marker=dict(color='blue', size=10),
    textfont=dict(color='blue')
))

fig.add_trace(go.Scatter(
    x=col_coords.iloc[:, 0],
    y=col_coords.iloc[:, 1],
    mode='markers+text',
    name='AI Status',
    text=col_coords.index,
    textposition='top center',
    marker=dict(color='red', size=10),
    textfont=dict(color='red')
))

fig.add_shape(type="line", x0=min(row_coords.iloc[:, 0].min(), col_coords.iloc[:, 0].min()),
                      y0=0, x1=max(row_coords.iloc[:, 0].max(), col_coords.iloc[:, 0].max()), y1=0,
              line=dict(color="grey", width=1))
fig.add_shape(type="line", x0=0, y0=min(row_coords.iloc[:, 1].min(), col_coords.iloc[:, 1].min()),
                      x1=0, y1=max(row_coords.iloc[:, 1].max(), col_coords.iloc[:, 1].max()),
              line=dict(color="grey", width=1))

fig.update_layout(
    title="Correspondence Analysis",
    xaxis_title="Dimension 1",
    yaxis_title="Dimension 2",
    xaxis=dict(zeroline=False),
    yaxis=dict(zeroline=False),
    width=800,
    height=600,
    legend=dict(x=0.85, y=0.95),
    template="plotly_white",
    dragmode="pan"
)

fig.show(renderer="browser")

# Evaluate Differences in AI Maturity Levels Across Drivers of Adoption --- Not Done

In [None]:
data_01[["How would you categorize your current AI usage maturity level?"]]

In [None]:
data_01[["What are the primary drivers for adopting AI in your organization? (Select all that apply)"]]

In [None]:
df = data_01[["How would you categorize your current AI usage maturity level?",
"What are the primary drivers for adopting AI in your organization? (Select all that apply)"]].copy()

In [None]:
df.columns = ["AIMaturity", "Drivers"]

In [None]:
df = df.dropna(subset=["AIMaturity", "Drivers"])

## Transform Multi-Select Drivers → Binary Columns

In [None]:
driver_dummies_01 = df["Drivers"].str.get_dummies(sep=';')
driver_dummies_02 = df["AIMaturity"].str.get_dummies(sep=';')

In [None]:
df_expanded = pd.concat([driver_dummies_01, driver_dummies_02], axis=1)

In [None]:
print("\nExpanded Data (first rows):\n", df_expanded.head())

In [None]:
for driver in driver_dummies.columns:
    crosstab = pd.crosstab(df_expanded["AIMaturity"], df_expanded[driver])
    chi2, p, dof, expected = chi2_contingency(crosstab)
    print(f"\nDriver: {driver}")
    print("Chi-square Statistic:", chi2, " p-value:", p)

In [None]:
def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2/min(k-1, r-1))


def interpret_cramers_v(value):
    if value < 0.1:
        return "Negligible"
    elif value < 0.3:
        return "Small"
    elif value < 0.5:
        return "Medium"
    else:
        return "Large"


for driver in driver_dummies.columns:
    crosstab = pd.crosstab(df_expanded["AIMaturity"], df_expanded[driver])
    cv = cramers_v(crosstab)
    interpretation = interpret_cramers_v(cv)
    print(f"Cramer's V for {driver}: {round(cv, 3)} ({interpretation} association)")

In [None]:
formula = "AIMaturity ~ " + " + ".join(driver_dummies.columns)
model = smf.mnlogit(formula, data=df_expanded)
result = model.fit(method='newton', maxiter=100, disp=False)
print("\nMultinomial Logistic Regression Summary:\n", result.summary())

In [None]:
crosstab_all = pd.crosstab(df_expanded["AIMaturity"], df_expanded[driver_dummies.columns].idxmax(axis=1))
ca = prince.CA(n_components=2, random_state=42)
ca = ca.fit(crosstab_all)
ca.plot_coordinates(X=crosstab_all, figsize=(8,6), show_row_labels=True, show_col_labels=True)
plt.title("Correspondence Analysis: AI Maturity vs Drivers")
plt.show()

In [None]:
plt.figure(figsize=(12,6))
prop_table = df_expanded.groupby("AIMaturity")[driver_dummies.columns].mean()
sns.heatmap(prop_table, annot=True, cmap="YlGnBu")
plt.title("Proportion of Entities Selecting Each Driver by AI Maturity")
plt.xlabel("Drivers")
plt.ylabel("AI Maturity")
plt.show()

In [None]:
prop_table.T.plot(kind='bar', stacked=True, figsize=(12,6), colormap="viridis")
plt.title("Driver Distribution Across AI Maturity Levels")
plt.ylabel("Proportion")
plt.xticks(rotation=45)
plt.show()

# Investigate the Relationship Between Key Challenges and Risks

In [None]:
df_challenges = data_01["What are the key challenges your organization faces in implementing AI? (Select all that apply)\n"].copy()

In [None]:
df_risks = data_01["What risks do you associate with AI adoption? (Select all that apply)"].copy()

In [None]:
def indicator_matrix(series, prefix):
    series = series.fillna("").astype(str).apply(lambda x: [s.strip() for s in x.split(";") if s.strip()])
    uniq = sorted(set(v for lst in series for v in lst))
    out = pd.DataFrame(index=series.index)
    for u in uniq:
        colname = f"{prefix}__{u.replace(' ', '_')}"
        out[colname] = series.apply(lambda lst: int(u in lst))
    return out

challenges_bin = indicator_matrix(df_challenges, "Challenge")
risks_bin = indicator_matrix(df_risks, "Risk")


In [None]:
print(challenges_bin.head())
print(risks_bin.head())

In [None]:
phi_mat = pd.DataFrame(index=challenges_bin.columns, columns=risks_bin.columns, dtype=float)

for c in challenges_bin.columns:
    for r in risks_bin.columns:
        tab = pd.crosstab(challenges_bin[c], risks_bin[r])
        if tab.shape == (2,2):
            chi2, p, dof, exp = chi2_contingency(tab)
            n = tab.values.sum()
            phi = np.sqrt(chi2/n)
            phi_mat.loc[c,r] = phi

# Heatmap of phi coefficients
plt.figure(figsize=(12,8))
sns.heatmap(phi_mat.astype(float), cmap="coolwarm", center=0, annot=True, fmt=".2f")
plt.title("Phi Correlation Matrix: Challenges vs Risks")
plt.show()


In [None]:
from statsmodels.stats.contingency_tables import mcnemar

In [None]:
challenge_col = 'Challenge__Data_privacy_and_cybersecurity_concerns'
risk_col = 'Risk__Data_security_and_privacy_breaches'

tab = pd.crosstab(challenges_bin[challenge_col], risks_bin[risk_col])
result = mcnemar(tab, exact=True)

print("McNemar’s test")
print("Table:\n", tab)
print("statistic:", result.statistic, "p-value:", result.pvalue)


In [None]:
from matplotlib_venn import venn2

a = challenges_bin[challenge_col].sum()       # challenge selected
b = risks_bin[risk_col].sum()                 # risk selected
ab = ((challenges_bin[challenge_col]==1) & (risks_bin[risk_col]==1)).sum()

venn2(subsets=(a-ab, b-ab, ab), set_labels=(challenge_col, risk_col))
plt.title("Overlap between Challenge and Risk")
plt.show()


In [None]:
from sklearn.metrics import jaccard_score

jac_mat = pd.DataFrame(index=challenges_bin.columns, columns=risks_bin.columns, dtype=float)

for c in challenges_bin.columns:
    for r in risks_bin.columns:
        jac_mat.loc[c, r] = jaccard_score(challenges_bin[c], risks_bin[r])

plt.figure(figsize=(12,8))
sns.heatmap(jac_mat.astype(float), cmap="Greens", annot=True, fmt=".2f")
plt.title("Jaccard Similarity Matrix: Challenges vs Risks")
plt.show()


In [None]:
# Build graph based on Phi correlations (or Jaccard)
G = nx.Graph()

# add nodes separately for clarity
for c in challenges_bin.columns:
    G.add_node(c, type="Challenge")
for r in risks_bin.columns:
    G.add_node(r, type="Risk")

# add edges when association is strong
for c in challenges_bin.columns:
    for r in risks_bin.columns:
        phi = phi_mat.loc[c, r]
        if pd.notna(phi) and phi > 0.25:   # threshold for clarity
            G.add_edge(c, r, weight=phi)

plt.figure(figsize=(12,10))
pos = nx.spring_layout(G, k=0.4, seed=42)
edge_weights = [d["weight"]*5 for (_,_,d) in G.edges(data=True)]  # scale for visibility

nx.draw_networkx_nodes(G, pos,
                       node_color=["lightblue" if G.nodes[n]["type"]=="Challenge" else "lightcoral"
                                   for n in G.nodes],
                       node_size=800)
nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.6)
nx.draw_networkx_labels(G, pos, font_size=8)

plt.title("Challenge–Risk Network Graph (edges = Phi strength > 0.25)")
plt.axis("off")
plt.show()


In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Merge challenges + risks into one binary feature space
combined = pd.concat([challenges_bin, risks_bin], axis=1)

# Hierarchical clustering
Z = linkage(combined.T, method='ward')  # cluster on transposed matrix (features)

plt.figure(figsize=(12,6))
dendrogram(Z, labels=combined.columns, leaf_rotation=90)
plt.title("Hierarchical Clustering of Challenges and Risks")
plt.tight_layout()
plt.show()


In [None]:
import prince

mca = prince.MCA(n_components=2, random_state=42)
mca = mca.fit(combined)

coords = mca.column_coordinates(combined)

plt.figure(figsize=(10,8))
plt.scatter(coords[0], coords[1])

for i, txt in enumerate(coords.index):
    plt.annotate(txt, (coords.iloc[i,0], coords.iloc[i,1]), fontsize=8)

plt.title("MCA: Challenges & Risks in 2D")
plt.axhline(0, color='gray', lw=0.5)
plt.axvline(0, color='gray', lw=0.5)
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize
X = StandardScaler().fit_transform(combined)

# Fit KMeans (try k=3, adjust as needed)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

data_01["Cluster"] = labels

print(data_01.groupby("Cluster").size())

# Visualize clusters on MCA 2D projection
plt.figure(figsize=(10,8))
plt.scatter(coords[0], coords[1], c=['red','blue','green']*10)  # naive coloring, adjust to labels
plt.title("Clusters of Organizations by Challenges/Risks")
plt.show()


# Compare Proportions of AI Trends Exploration by Maturity Level

In [None]:
df = data_01[["Which emerging AI trends is your organization exploring? (Select all that apply)", "How would you categorize your current AI usage maturity level?"]]

In [None]:
df.columns = ["Trends", "AIMaturity"]

In [None]:
# Reuse your indicator function
trends_bin = indicator_matrix(df["Trends"], prefix="Trend")

# Attach maturity
trends_bin["AIMaturity"] = df["AIMaturity"].values


In [None]:
# Compute proportion of each trend within each maturity group
trend_props = trends_bin.groupby("AIMaturity").mean().T
print(trend_props.head())

In [None]:
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import multipletests

results = []
levels = trends_bin["AIMaturity"].unique()

for trend in [c for c in trends_bin.columns if c.startswith("Trend__")]:
    for i in range(len(levels)):
        for j in range(i+1, len(levels)):
            g1, g2 = levels[i], levels[j]
            x1 = trends_bin.loc[trends_bin["AIMaturity"]==g1, trend].sum()
            n1 = (trends_bin["AIMaturity"]==g1).sum()
            x2 = trends_bin.loc[trends_bin["AIMaturity"]==g2, trend].sum()
            n2 = (trends_bin["AIMaturity"]==g2).sum()

            stat, pval = proportions_ztest([x1,x2], [n1,n2])
            results.append((trend, g1, g2, stat, pval))

# Adjust for multiple comparisons (Bonferroni)
df_results = pd.DataFrame(results, columns=["Trend","Group1","Group2","Z","pval"])
df_results["pval_adj"] = multipletests(df_results["pval"], method="bonferroni")[1]
print(df_results.head())


In [None]:
for trend in [c for c in trends_bin.columns if c.startswith("Trend__")]:
    tab = pd.crosstab(trends_bin["AIMaturity"], trends_bin[trend])
    chi2, p, dof, exp = chi2_contingency(tab)
    if p < 0.05:
        print(f"{trend}: chi2={chi2:.2f}, p={p:.3f}")


In [None]:
plt.figure(figsize=(10,6))
sns.barplot(data=trends_bin.melt(id_vars="AIMaturity", var_name="Trend", value_name="Selected"),
            x="Trend", y="Selected", hue="AIMaturity", ci=95)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Proportion selected")
plt.title("AI Trend Exploration by Maturity Level")
plt.tight_layout()
plt.show()


# Governance Frameworks → Audit Processes

In [None]:
# Any framework selected = 1, else 0
frameworks_any = data_01["What governance frameworks are in place for AI? (Select all that apply)"].fillna("").apply(lambda x: int(len(str(x).strip()) > 0))
audits = data_01["Do you have an established process to audit and review AI models regularly for fairness, bias, and ethical concerns?"].fillna("No")

In [None]:
# Collapse audit answers to Yes vs Not-Yes (you can refine)
audit_yes = audits.apply(lambda x: 1 if "Yes" in x else 0)

tab = pd.crosstab(frameworks_any, audit_yes)
print("Contingency Table:\n", tab)


In [None]:
from scipy.stats import fisher_exact

oddsratio, pval = fisher_exact(tab)
print(f"Fisher’s Exact Test: OR={oddsratio:.2f}, p={pval:.4f}")


In [None]:
from statsmodels.graphics.mosaicplot import mosaic

mosaic_data = {(f"Frameworks={i}", f"Audit={j}"): tab.loc[i,j] for i in tab.index for j in tab.columns}

plt.figure(figsize=(8,6))
mosaic(mosaic_data, title="Governance Frameworks vs Audit Process")
plt.show()


In [None]:
import seaborn as sns

df_plot = pd.DataFrame({
    "Frameworks": frameworks_any.map({0:"No Frameworks", 1:"Frameworks"}),
    "Audit": audits
})

plt.figure(figsize=(8,6))
sns.countplot(data=df_plot, x="Audit", hue="Frameworks")
plt.title("Audit Process by Governance Framework Presence")
plt.ylabel("Count")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
def indicator_matrix(series, prefix):
    series = series.fillna("").astype(str).apply(lambda x: [s.strip() for s in x.split(";") if s.strip()])
    uniq = sorted(set(v for lst in series for v in lst))
    out = pd.DataFrame(index=series.index)
    for u in uniq:
        colname = f"{prefix}__{u.replace(' ', '_')}"
        out[colname] = series.apply(lambda lst: int(u in lst))
    return out

frameworks_bin = indicator_matrix(data_01["What governance frameworks are in place for AI? (Select all that apply)"], "Framework")


In [None]:
audit_yes = audits.apply(lambda x: 1 if "Yes" in str(x) else 0)

In [None]:
from scipy.stats import fisher_exact

results = []
for fw in frameworks_bin.columns:
    tab = pd.crosstab(frameworks_bin[fw], audit_yes)
    if tab.shape == (2,2):
        or_val, pval = fisher_exact(tab)
        results.append((fw, tab.iloc[1,1], tab.iloc[1,0], or_val, pval))

df_fw_results = pd.DataFrame(results, columns=["Framework", "Audit_Yes", "Audit_No", "OddsRatio", "pval"])
print(df_fw_results.sort_values("pval"))


In [None]:
plt.figure(figsize=(8,6))
sns.barplot(data=df_fw_results, x="OddsRatio", y="Framework", hue=(df_fw_results["pval"]<0.05))
plt.axvline(1, color="red", linestyle="--")
plt.title("Odds Ratios: Governance Framework vs Audit (Yes)")
plt.xlabel("Odds Ratio (Audit=Yes)")
plt.ylabel("")
plt.legend(title="Significant (p<0.05)")
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

results = []
for fw in frameworks_bin.columns:
    tab = pd.crosstab(frameworks_bin[fw], audit_yes)
    if tab.shape == (2,2):
        or_val, pval = fisher_exact(tab)
        results.append((fw, tab.iloc[1,1], tab.iloc[1,0], or_val, pval))

df_fw_results = pd.DataFrame(results, 
    columns=["Framework", "Audit_Yes", "Audit_No", "OddsRatio", "pval"])

# --- Adjust for multiple tests ---
df_fw_results["pval_bonf"] = multipletests(df_fw_results["pval"], method="bonferroni")[1]
df_fw_results["pval_fdr"]  = multipletests(df_fw_results["pval"], method="fdr_bh")[1]

print(df_fw_results.sort_values("pval"))


In [None]:
plt.figure(figsize=(8,6))
sns.barplot(data=df_fw_results, 
            x="OddsRatio", y="Framework", 
            hue=(df_fw_results["pval_fdr"]<0.05))
plt.axvline(1, color="red", linestyle="--")
plt.title("Odds Ratios: Governance Framework vs Audit (Yes)")
plt.xlabel("Odds Ratio (Audit=Yes)")
plt.ylabel("")
plt.legend(title="Significant (FDR<0.05)")
plt.tight_layout()
plt.show()


In [None]:
audit_status = data_01["Do you have an established process to audit and review AI models regularly for fairness, bias, and ethical concerns?"].fillna("No").replace({
    "Yes": "Yes",
    "In Progress": "In Progress",
    "Not evaluated as yet": "No",
    "No": "No"
})


In [None]:
audit_status

In [None]:
tab_multi = pd.crosstab(frameworks_any, audit_status)
print("Contingency Table:\n", tab_multi)

chi2, p, dof, exp = chi2_contingency(tab_multi)
print(f"Chi-square test: chi2={chi2:.2f}, p={p:.4f}, dof={dof}")


In [None]:
import statsmodels.api as sm

df_reg = pd.DataFrame({
    "Frameworks": frameworks_any,
    "Audit": audit_status
})

# Encode Audit as categorical with "No" as baseline
y = pd.Categorical(df_reg["Audit"], categories=["No", "In Progress", "Yes"])
X = sm.add_constant(df_reg["Frameworks"])

model = sm.MNLogit(y.codes, X).fit(disp=0)
print(model.summary())


In [None]:
plt.figure(figsize=(8,6))
sns.catplot(data=df_reg, x="Audit", hue="Frameworks", kind="count", height=6, aspect=1.3)
plt.title("Audit Status by Governance Framework Presence")
plt.xticks(rotation=45)
plt.show()


In [None]:
frameworks_bin = indicator_matrix(data_01["What governance frameworks are in place for AI? (Select all that apply)"], prefix="Framework")

In [None]:
import statsmodels.api as sm

results = []

for fw in frameworks_bin.columns:
    df_reg = pd.DataFrame({
        "Framework": frameworks_bin[fw],
        "Audit": audit_status
    })
    # Encode Audit as categorical with "No" as baseline
    y = pd.Categorical(df_reg["Audit"], categories=["No", "In Progress", "Yes"])
    X = sm.add_constant(df_reg["Framework"])
    
    try:
        model = sm.MNLogit(y.codes, X).fit(disp=0)
        params = model.params.loc[1]  # coefficients for "Framework=1"
        conf = model.conf_int().loc[1]
        odds = np.exp(params)
        odds_ci = np.exp(conf)
        
        results.append({
            "Framework": fw,
            "Category": "In Progress",
            "OR": odds[0], "CI_low": odds_ci[0][0], "CI_high": odds_ci[0][1], "pval": model.pvalues.loc[1][0]
        })
        results.append({
            "Framework": fw,
            "Category": "Yes",
            "OR": odds[1], "CI_low": odds_ci[1][0], "CI_high": odds_ci[1][1], "pval": model.pvalues.loc[1][1]
        })
    except Exception as e:
        print(f"Skipped {fw} due to error: {e}")

df_mnlogit = pd.DataFrame(results)
print(df_mnlogit.sort_values("pval"))


In [None]:
from statsmodels.stats.multitest import multipletests

df_mnlogit["pval_fdr"] = multipletests(df_mnlogit["pval"], method="fdr_bh")[1]
df_mnlogit["pval_bonf"] = multipletests(df_mnlogit["pval"], method="bonferroni")[1]

In [None]:
plt.figure(figsize=(10,6))
sns.barplot(data=df_mnlogit, x="OR", y="Framework", hue="Category", palette="Set1")
plt.axvline(1, color="black", linestyle="--")
plt.title("Odds Ratios: Frameworks vs Audit Outcomes")
plt.xlabel("Odds Ratio (relative to 'No')")
plt.ylabel("")
plt.tight_layout()
plt.show()


---

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.stats import chi2_contingency, fisher_exact

In [3]:
data_01 = pd.read_csv("Urgent - Responses on Survey on AI Use and Adoption Shared.csv",header=0,encoding="cp1252")

## Reseach Question : Assess the Association Between Entity Type and AI Adoption Status

In [None]:
df = data_01[[
    "Type of Entity",
    "What is the current status of AI adoption in your organization?"
]]

In [None]:
df.columns = ["entity_type", "ai_status"]

In [None]:
contingency_table = pd.crosstab(df["entity_type"], df["ai_status"])

**Cramér's V (effect size)**

Cramér's V is a statistical measure of association or effect size between two categorical variables. It gives you an idea of how strongly the two variables are related in a contingency table.

Reason for Choosing:
1. it's independent of sample size and we have less sample size.
2. A value between 0 and 1 (effect size)
3. You want to know the practical importance of the association

In [None]:
n = contingency_table.sum().sum()
phi2 = chi2 / n
r, k = contingency_table.shape
cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))

In [None]:
print(f"Cramér's V = {cramers_v:.3f}")

Result : 0.402 indicates a moderately strong association between the two categorical variables.

## Heatmap of counts

In [None]:
fig = px.imshow(
    contingency_table,
    text_auto=True,
    color_continuous_scale="Blues",
    labels=dict(x="AI Status", y="Entity Type", color="Count"),
    title="Entity Type vs AI Adoption Status (Counts)"
)
fig.show(renderer="notebook")

## Proportional Bar Plot

In [None]:
proportions = contingency_table.div(contingency_table.sum(axis=1), axis=0)
proportions_reset = proportions.reset_index().melt(
    id_vars="entity_type", var_name="AI Status", value_name="Proportion")

In [None]:
import plotly.express as px

fig = px.bar(
    proportions_reset,
    x="entity_type",
    y="Proportion",
    color="AI Status",
    barmode="group",  # or "stack" if you want stacked bars
    text=proportions_reset["Proportion"].apply(lambda x: f"{x:.0%}")
)

fig.update_layout(
    title="Proportion of AI Adoption Status by Entity Type",
    xaxis_title="Entity Type",
    yaxis_title="Proportion",
    xaxis_tickangle=-30,
    template="plotly_white",  # Use a clean theme
    legend_title="AI Status",
    height=600,
    width=900,
)

fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig.update_traces(textposition='outside')  # Show % above bars
fig.show(renderer="notebook")


# Reseach Question : Evaluate Differences in AI Maturity Levels Across Drivers of Adoption

In [None]:
df = data_01[[
    "How would you categorize your current AI usage maturity level?",
    "What are the primary drivers for adopting AI in your organization? (Select all that apply)"
]]


In [None]:
df.columns = ["ai_maturity", "drivers"]

*multi-select drivers into binary indicator columns*

In [None]:
all_drivers = sorted(
    set(
        d.strip()
        for entry in df["drivers"].dropna()
        for d in str(entry).split(";")
        if d.strip() != ""
    )
)

for driver in all_drivers:
    df[driver] = df["drivers"].apply(
        lambda x: 1 if driver in str(x) else 0
    )

print("Drivers expanded into binary columns:", all_drivers)

In [None]:
cramers_results = []

for driver in all_drivers:
    contingency = pd.crosstab(df["ai_maturity"], df[driver])
    chi2, _, _, _ = chi2_contingency(contingency)
    n = contingency.sum().sum()
    phi2 = chi2 / n
    r, k = contingency.shape
    cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
    if cramers_v < 0.1:
        interpretation = "Negligible"
    elif cramers_v < 0.3:
        interpretation = "Small"
    elif cramers_v < 0.5:
        interpretation = "Medium"
    else:
        interpretation = "Large"

    cramers_results.append({
        "Driver": driver,
        "Cramers_V": round(cramers_v, 3),
        "Effect": interpretation
    })

cramers_df = pd.DataFrame(cramers_results).sort_values("Cramers_V", ascending=False)
print("\nCramér’s V results:\n", cramers_df)


In [None]:
driver_means = df.groupby("ai_maturity")[all_drivers].mean().round(2).reset_index()
driver_means_melted = driver_means.melt(id_vars="ai_maturity", var_name="Driver", value_name="Proportion")
fig = px.imshow(
    driver_means.set_index("ai_maturity").T,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="YlGnBu",
    labels=dict(x="AI Maturity Level", y="Driver", color="Proportion")
)

fig.update_layout(
    title="Proportion of Organizations Selecting Each Driver by AI Maturity",
    xaxis_title="AI Maturity Level",
    yaxis_title="Driver",
    height=600,
    width=1000,
    font=dict(size=12),
)

fig.show(renderer="notebook")

# Research Question: Investigate the Relationship Between Key Challenges and Risks

In [4]:
df = data_01[[
    "What risks do you associate with AI adoption? (Select all that apply)",
    'What are the key challenges your organization faces in implementing AI? (Select all that apply)\n'
]]

In [5]:
df.columns = ["risks", "challenges"]

*multi-select drivers into binary indicator columns*

In [6]:
all_risks = sorted(
    set(
        r.strip()
        for entry in df["risks"].dropna()
        for r in str(entry).split(";")
        if r.strip() != ""
    )
)

In [7]:
all_challenges = sorted(
    set(
        c.strip()
        for entry in df["challenges"].dropna()
        for c in str(entry).split(";")
        if c.strip() != ""
    )
)


In [8]:
for risk in all_risks:
    df[risk] = df["risks"].apply(
        lambda x: 1 if risk in str(x) else 0
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[risk] = df["risks"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[risk] = df["risks"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[risk] = df["risks"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

In [9]:
for challenge in all_challenges:
    df[challenge] = df["challenges"].apply(
        lambda x: 1 if challenge in str(x) else 0
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[challenge] = df["challenges"].apply(


In [10]:
print("Extracted Risks:", all_risks)

Extracted Risks: ['Bias and fairness issues', 'Cyber security', 'Data security and privacy breaches', 'Ethical and societal concerns', 'Lack of explainability in AI models', 'Lack of reliability on AI', 'NA', 'NOT APPLICABLE', 'Not Applicable', 'Not evaluated as yet', 'Not yet adapted but under consideration', 'Over-reliance on automation', 'Reliance on past data to predict the future', 'Successful implementaton and usage', 'We are not using AI', 'hallucinations', 'very early stage, will come to know gradually']


In [11]:
print("Extracted Challenges:", all_challenges)

Extracted Challenges: ['AI Not being used', 'Data privacy and cybersecurity concerns', 'Insufficient resources and funding', 'Interoperability with legacy systems', "It's at the initial stage at Global level", 'Lack of reliability on AI', 'Lack of skilled talent', 'NA', 'NOT APPLICABLE', 'None', 'Not Applicable', 'Not evaluated as yet', 'Not yet adapted but under consideration', 'Rapid technological changes', 'Still to determine use case', 'We are not using AI', 'not applicable', 'organization bureaucracy', 'very early stage, will come to know gradually']


In [12]:
results = []
for risk in all_risks:
    for challenge in all_challenges:
        contingency = pd.crosstab(df[risk], df[challenge])
        if contingency.shape == (2, 2): 
            oddsratio, p_value = fisher_exact(contingency)
            results.append({
                "Risk": risk,
                "Challenge": challenge,
                "OddsRatio": round(oddsratio, 2),
                "p_value": round(p_value, 4)
            })

In [13]:
results_df = pd.DataFrame(results)
print("\nFisher's Exact Test Results:\n", results_df.head(15))


Fisher's Exact Test Results:
                         Risk                                  Challenge  \
0   Bias and fairness issues                          AI Not being used   
1   Bias and fairness issues    Data privacy and cybersecurity concerns   
2   Bias and fairness issues         Insufficient resources and funding   
3   Bias and fairness issues       Interoperability with legacy systems   
4   Bias and fairness issues  It's at the initial stage at Global level   
5   Bias and fairness issues                  Lack of reliability on AI   
6   Bias and fairness issues                     Lack of skilled talent   
7   Bias and fairness issues                                         NA   
8   Bias and fairness issues                             NOT APPLICABLE   
9   Bias and fairness issues                                       None   
10  Bias and fairness issues                             Not Applicable   
11  Bias and fairness issues                       Not evaluated as y

# Reseach Question : Analyze the Impact of Governance Frameworks on Audit Processes

In [15]:
df = data_01[[
    "What governance frameworks has your organization implemented? (Select all that apply)",
    "Does your organization have established AI audit processes?"
]]

KeyError: "None of [Index(['What governance frameworks has your organization implemented? (Select all that apply)', 'Does your organization have established AI audit processes?'], dtype='object')] are in the [columns]"