Reverse-coding

In [None]:
import pandas as pd


file_path = 'Pollfish_Survey_Work_behaviours_survey_393485244_2.xlsx'
data_sheet = 'Pollfish_Survey_Work_behaviours'
index_sheet = 'Index'     # note the capital “I”


df = pd.read_excel(file_path, sheet_name=data_sheet)

idx_raw = pd.read_excel(file_path, sheet_name=index_sheet, header=None)

#      Keep only the first 12 columns, which correspond to:
#    [0] Question coding (e.g. Q1)
#    [1] Question text
#    [8] Input vs Target flag (“I” or “T”)
#    [9] Reverse‑coded flag (“R” if reverse)
#    [10] Category
#    [11] Possible answers (e.g. “1‑6”)
idx = idx_raw.iloc[:, :12].copy()
idx.columns = [
    'Code', 'Question',
    'Skip2', 'Skip3', 'Skip4', 'Skip5', 'Skip6', 'Skip7',
    'IorT', 'Reverse', 'Category', 'PossibleAnswers'
]

# Build lists of codes
to_reverse   = idx.loc[idx['Reverse'] == 'R', 'Code'].tolist()
inputs       = idx.loc[idx['IorT'] == 'I', 'Code'].tolist()
targets      = idx.loc[idx['IorT'] == 'T', 'Code'].tolist()

# Apply reverse‑coding (1–6 scales → 7 - x) in the DataFrame
for q in to_reverse:
    df[q] = 7 - df[q]


print("Will reverse‑code these questions:", to_reverse)
print("Inputs:", inputs[:5], "…")
print("Targets:", targets[:5], "…")


Will reverse‑code these questions: ['Q1', 'Q17', 'Q22', 'Q23', 'Q25', 'Q45']
Inputs: ['Q5', 'Q6', 'Q10', 'Q11', 'Q12'] …
Targets: ['Q1', 'Q2', 'Q3', 'Q4', 'Q7'] …


In [3]:
df.drop(index=df[df['Q30'] != 3].index, inplace=True)
# List unique values
print(df['Q30'].unique())    # should print only [3]

# Or get counts
print(df['Q30'].value_counts())



[3]
Q30
3    178
Name: count, dtype: int64


EDA

In [4]:
# Map each Q‑code to its full question text
question_map = idx.set_index('Code')['Question'].to_dict()


histograms

In [None]:
# Assumes I have already loaded `df` (survey data) and `idx` (index DataFrame)
import os
import numpy as np
import matplotlib.pyplot as plt
import textwrap

# 1. Build lookup maps from idx
question_map     = idx.set_index('Code')['Question'].to_dict()
input_target_map = idx.set_index('Code')['IorT'].to_dict()
category_map     = idx.set_index('Code')['Category'].to_dict()
reverse_map      = idx.set_index('Code')['Reverse'].to_dict()
possible_map     = idx.set_index('Code')['PossibleAnswers'].to_dict()

# 2. Identify input and target codes
inputs  = idx.loc[idx['IorT'] == 'I', 'Code'].tolist()
targets = idx.loc[idx['IorT'] == 'T', 'Code'].tolist()

# 3. Prepare output directories
dirs = {
    'inputs':  'histograms_inputs',
    'targets': 'histograms_targets',
}
for d in dirs.values():
    os.makedirs(d, exist_ok=True)

# 4. Function to plot with counts on bars
def save_histograms(codes, output_dir):
    for code in codes:
        flag      = input_target_map.get(code, '')
        label     = 'Input' if flag == 'I' else 'Target'
        rev_label = '; Reversed' if reverse_map.get(code) == 'R' else ''
        cat       = category_map.get(code, '')
        qtxt      = question_map.get(code, code)
        
        # Title
        header     = f"{code} ({label}{rev_label}) – {cat}"
        full_title = header + "\n" + qtxt
        wrapped    = "\n".join(textwrap.wrap(full_title, width=80))
        
        data = df[code].dropna()
        plt.figure(figsize=(8,6))
        
        # Choose bins
        if possible_map.get(code, '').strip() == '1-6':
            bins = np.arange(1, 8) - 0.5
            xticks = range(1,7)
        else:
            bins = 10
            xticks = None
        
        counts, bin_edges, patches = plt.hist(data, bins=bins, rwidth=0.8)
        if xticks:
            plt.xticks(xticks)
        
        # Annotate counts on each bar
        for patch, count in zip(patches, counts):
            x = patch.get_x() + patch.get_width() / 2
            plt.text(x, count, str(int(count)), ha='center', va='bottom')
        
        plt.xlabel('Response')
        plt.ylabel('Count')
        plt.title(wrapped)
        plt.tight_layout()
        plt.savefig(f"{output_dir}/{code}.png")
        plt.close()

# 5. Generate histograms for inputs and targets
save_histograms(inputs,  dirs['inputs'])
save_histograms(targets, dirs['targets'])

print(f"Saved {len(inputs)} input histograms to ./{dirs['inputs']}/")
print(f"Saved {len(targets)} target histograms to ./{dirs['targets']}/")


Saved 34 input histograms to ./histograms_inputs/
Saved 49 target histograms to ./histograms_targets/


In [None]:
# --- Keep only Q1 through Q46 in your DataFrame and related lists ---

# 1. Define the codes to keep
keep_codes = [f"Q{i}" for i in range(1, 47)]

# 2. Subset main DataFrame
df = df[keep_codes].copy()

# 3. Update inputs and targets lists
inputs  = [c for c in inputs  if c in keep_codes]
targets = [c for c in targets if c in keep_codes]

# 4. (Optional) Also subset your index DataFrame if you use it downstream
idx = idx[idx['Code'].isin(keep_codes)].copy()

# 5. Rebuild lookup maps need
question_map     = idx.set_index('Code')['Question'].to_dict()
input_target_map = idx.set_index('Code')['IorT'].to_dict()
category_map     = idx.set_index('Code')['Category'].to_dict()
reverse_map      = idx.set_index('Code')['Reverse'].to_dict()
possible_map     = idx.set_index('Code')['PossibleAnswers'].to_dict()

print(f"DataFrame now has columns: {df.columns.tolist()}")
print(f"Inputs: {inputs}")
print(f"Targets: {targets}")


DataFrame now has columns: ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45', 'Q46']
Inputs: ['Q5', 'Q6', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q16', 'Q17', 'Q19', 'Q20', 'Q23', 'Q24', 'Q25', 'Q28', 'Q29', 'Q34', 'Q35', 'Q36', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45']
Targets: ['Q1', 'Q2', 'Q3', 'Q4', 'Q7', 'Q8', 'Q9', 'Q15', 'Q18', 'Q21', 'Q22', 'Q26', 'Q27', 'Q31', 'Q32', 'Q33', 'Q37', 'Q46']


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import textwrap

# === ASSUME AGAIN df and idx are already loaded ===


# Rebuild input/target lists and lookup maps
inputs  = idx[idx['IorT'] == 'I']['Code'].tolist()
targets = idx[idx['IorT'] == 'T']['Code'].tolist()
question_map     = idx.set_index('Code')['Question'].to_dict()
input_target_map = idx.set_index('Code')['IorT'].to_dict()
category_map     = idx.set_index('Code')['Category'].to_dict()
reverse_map      = idx.set_index('Code')['Reverse'].to_dict()
possible_map     = idx.set_index('Code')['PossibleAnswers'].to_dict()

# 1. Create output directories
os.makedirs('eda_results', exist_ok=True)
os.makedirs('heatmaps', exist_ok=True)

# 2. Compute EDA tables
missing_df = (
    df[inputs + targets]
    .isna()
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
missing_df.columns = ['Variable', 'MissingPercent']

desc_df = df[inputs].describe()

corr_df = df[inputs + targets].corr()

# 3. Save EDA tables to Excel
eda_excel_path = os.path.join('eda_results', 'eda_results.xlsx')
with pd.ExcelWriter(eda_excel_path) as writer:
    missing_df.to_excel(writer, sheet_name='Missingness', index=False)
    desc_df.to_excel(writer, sheet_name='Descriptives')
    corr_df.to_excel(writer, sheet_name='Correlations')

# 4. Plot and save correlation heatmap
fig, ax = plt.subplots(figsize=(12, 10))
cax = ax.imshow(corr_df, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax, fraction=0.046, pad=0.04, ax=ax)
ax.set_xticks(np.arange(len(corr_df)))
ax.set_yticks(np.arange(len(corr_df)))
ax.set_xticklabels(corr_df.columns, rotation=90, fontsize=6)
ax.set_yticklabels(corr_df.index, fontsize=6)
ax.set_title('Correlation Heatmap: Q1–Q46')
plt.tight_layout()
heatmap_path = os.path.join('heatmaps', 'correlation_heatmap_q1_q46.png')
fig.savefig(heatmap_path, dpi=150)
plt.close(fig)

print(f"EDA tables saved to {eda_excel_path}")
print(f"Correlation heatmap saved to {heatmap_path}")


EDA tables saved to eda_results\eda_results.xlsx
Correlation heatmap saved to heatmaps\correlation_heatmap_q1_q46.png


In [8]:
#Kmeans clustering

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Assumes df and inputs list are already defined

# 1. Prepare output directory
out_dir = 'cluster_results'
os.makedirs(out_dir, exist_ok=True)

# 2. Standardize inputs
X = df[inputs]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 3. Elbow & Silhouette analysis
sse, silh = [], []
Klist = range(2, 12)
for k in Klist:
    km = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    sse.append(km.inertia_)
    silh.append(silhouette_score(X_scaled, km.labels_))

# 4. Save Elbow plot
plt.figure(figsize=(5,4))
plt.plot(Klist, sse, marker='o')
plt.xlabel('K'); plt.ylabel('SSE'); plt.title('Elbow Method')
plt.tight_layout()
plt.savefig(f'{out_dir}/elbow.png', dpi=150)
plt.close()

# 5. Save Silhouette plot
plt.figure(figsize=(5,4))
plt.plot(Klist, silh, marker='o')
plt.xlabel('K'); plt.ylabel('Avg Silhouette'); plt.title('Silhouette Analysis')
plt.tight_layout()
plt.savefig(f'{out_dir}/silhouette.png', dpi=150)
plt.close()

# 6. Choose best K and fit final model
best_k = Klist[np.argmax(silh)]
kmeans = KMeans(n_clusters=best_k, random_state=42).fit(X_scaled)
df['cluster'] = kmeans.labels_

# 7. PCA for 2D scatter
pca2 = PCA(n_components=2, random_state=42).fit(X_scaled)
pcs = pca2.transform(X_scaled)
df['PC1'], df['PC2'] = pcs[:,0], pcs[:,1]

# 8. Save PCA scatter plot
plt.figure(figsize=(6,5))
for c in range(best_k):
    pts = df[df['cluster']==c]
    plt.scatter(pts['PC1'], pts['PC2'], label=f'Cluster {c}', s=20)
plt.xlabel('PC1'); plt.ylabel('PC2'); plt.title('Cluster PCA Scatter')
plt.legend(); plt.tight_layout()
plt.savefig(f'{out_dir}/pca_scatter.png', dpi=150)
plt.close()

# 9. Compute & save cluster profiles
profiles = df.groupby('cluster')[inputs].mean().round(2)
profiles.to_excel(f'{out_dir}/cluster_profiles.xlsx', index=True)

print(f"Clustering done (K={best_k}).")
print(f"Files saved in '{out_dir}': elbow.png, silhouette.png, pca_scatter.png, cluster_profiles.xlsx")


Clustering done (K=2).
Files saved in 'cluster_results': elbow.png, silhouette.png, pca_scatter.png, cluster_profiles.xlsx


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput    import MultiOutputClassifier
from sklearn.pipeline       import Pipeline
from sklearn.preprocessing  import StandardScaler
from sklearn.ensemble       import RandomForestClassifier
from sklearn.metrics        import accuracy_score, f1_score
import pandas as pd

# 1. Define X and y
X = df[inputs]
y = df[targets]   # multi‐output DataFrame

# 2. Train / test split (no stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Build pipeline
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('clf', MultiOutputClassifier(
        RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    ))
])

# 4. Fit
pipeline.fit(X_train, y_train)

# 5. Predict
y_pred = pd.DataFrame(
    pipeline.predict(X_test),
    columns=targets,
    index=X_test.index
)

# 6. Evaluate
results = []
for col in targets:
    acc = accuracy_score(y_test[col], y_pred[col])
    f1  = f1_score(y_test[col], y_pred[col], average='macro')
    results.append({'Target': col, 'Accuracy': acc, 'F1_macro': f1})

results_df = pd.DataFrame(results)
results_df.to_excel('multioutput_classification_results.xlsx', index=False)
print(results_df)


   Target  Accuracy  F1_macro
0      Q1  0.361111  0.242308
1      Q2  0.472222  0.321270
2      Q3  0.527778  0.402412
3      Q4  0.472222  0.276405
4      Q7  0.222222  0.098765
5      Q8  0.388889  0.184052
6      Q9  0.333333  0.167246
7     Q15  0.444444  0.324359
8     Q18  0.388889  0.213276
9     Q21  0.388889  0.263149
10    Q22  0.222222  0.180122
11    Q26  0.388889  0.190789
12    Q27  0.472222  0.421967
13    Q31  0.555556  0.273275
14    Q32  0.694444  0.462963
15    Q33  0.500000  0.271230
16    Q37  0.666667  0.405556
17    Q46  0.500000  0.372161


In [16]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. Choose a target with very few minority samples, e.g. Q7
y_col = 'Q7'
X = df[inputs]
y = df[y_col]

# 2. Train/test split (no stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Build pipeline with SMOTE using k_neighbors small enough for your smallest class
#    Here we set k_neighbors=2 because y_train.min() class has only 3 samples
pipe = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

# 4. Fit & evaluate
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.14      1.00      0.25         1
           3       0.33      0.36      0.35        11
           4       0.20      0.27      0.23        11
           5       0.00      0.00      0.00         7
           6       1.00      0.20      0.33         5

    accuracy                           0.25        36
   macro avg       0.28      0.31      0.19        36
weighted avg       0.31      0.25      0.23        36



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

# 1. Choose your target
y_col = 'Q7'  # replace with whichever question you're modeling
X = df[inputs]
y = df[y_col]

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Define two pipelines with different classifiers
pipelines = {
    'LogisticRegression': Pipeline([
        ('smote', SMOTE(random_state=42, k_neighbors=2)),
        ('clf', LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42))
    ]),
    'HistGradientBoosting': Pipeline([
        ('smote', SMOTE(random_state=42, k_neighbors=2)),
        ('clf', HistGradientBoostingClassifier(random_state=42))
    ]),
}

# 4. Fit and print classification reports
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== LogisticRegression ===
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.40      0.55      0.46        11
           4       0.00      0.00      0.00        11
           5       0.40      0.29      0.33         7
           6       0.00      0.00      0.00         5

    accuracy                           0.22        36
   macro avg       0.13      0.14      0.13        36
weighted avg       0.20      0.22      0.21        36


=== HistGradientBoosting ===
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.38      0.55      0.44        11
           4       0.18      0.18      0.18        11
           5       0.00      0.00      0.00         7
           6       0.50      0.20      0.29         5

    accuracy       

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# --- 1. Load and prepare data ---
file_path   = 'Pollfish_Survey_Work_behaviours_survey_393485244_2.xlsx'
data_sheet  = 'Pollfish_Survey_Work_behaviours'
idx_sheet   = 'Index'

df_full = pd.read_excel(file_path, sheet_name=data_sheet)
idx_raw = pd.read_excel(file_path, sheet_name=idx_sheet, header=None)
idx = idx_raw.iloc[:, :12].copy()
idx.columns = [
    'Code','Question','Skip2','Skip3','Skip4','Skip5','Skip6','Skip7',
    'IorT','Reverse','Category','PossibleAnswers'
]

# Keep only Q1–Q46
keep_codes = [f"Q{i}" for i in range(1, 47)]
df = df_full[keep_codes].copy()
idx = idx[idx['Code'].isin(keep_codes)].copy()
inputs = idx[idx['IorT'] == 'I']['Code'].tolist()

# --- 2. Specify target ---
y_col = 'Q7'  # change to your target

# --- 3. Split data ---
X = df[inputs]
y = df[y_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 4. Build pipeline with SMOTE + scaling + SVM ---
svm_pipe = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),
    ('scale', StandardScaler()),
    ('svm', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42))
])

# --- 5. Cross-validate on training set ---
cv_acc = cross_val_score(svm_pipe, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_f1  = cross_val_score(svm_pipe, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)

# --- 6. Fit & predict on test set ---
svm_pipe.fit(X_train, y_train)
y_pred = svm_pipe.predict(X_test)

# --- 7. Compute test metrics ---
test_acc = accuracy_score(y_test, y_pred)
test_f1  = f1_score(y_test, y_pred, average='macro')

# --- 8. Export results ---
res_dir = 'svm_results'
os.makedirs(res_dir, exist_ok=True)

# Performance metrics
metrics_df = pd.DataFrame({
    'Metric': ['CV Accuracy', 'CV F1_macro', 'Test Accuracy', 'Test F1_macro'],
    'Value': [cv_acc.mean(), cv_f1.mean(), test_acc, test_f1],
    'StdDev': [cv_acc.std(), cv_f1.std(), np.nan, np.nan]
})
metrics_df.to_excel(os.path.join(res_dir, 'svm_performance_metrics.xlsx'), index=False)

# Detailed classification report
report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
report_df.to_excel(os.path.join(res_dir, 'svm_classification_report.xlsx'), index=True)

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred, normalize='true')
plt.figure(figsize=(6,5))
sns.heatmap(
    cm, annot=True, fmt='.2f', cmap='Blues',
    xticklabels=svm_pipe.named_steps['svm'].classes_,
    yticklabels=svm_pipe.named_steps['svm'].classes_
)
plt.title(f'Confusion Matrix for {y_col}')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.tight_layout()
plt.savefig(os.path.join(res_dir, 'svm_confusion_matrix.png'), dpi=150)
plt.close()

print(f"SVM + SMOTE results exported to '{res_dir}'")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM + SMOTE results exported to 'svm_results'
