# - C5i Main Model

In [8]:
import transformers
transformers.logging.set_verbosity_error()


In [4]:
import nltk
nltk.download('words')
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import math
from nltk.corpus import words
import re

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Yaxh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## - Datatset loaded

In [5]:
df = pd.read_excel(r"Final Data File_Training.xlsx", sheet_name=1)
print("Dataset loaded successfully!")
print("First 5 rows of the dataset:")
print(df.head())

Dataset loaded successfully!
First 5 rows of the dataset:
   Unique ID          Start Date            End Date  \
0        NaN                 NaT                 NaT   
1        3.0 2024-10-11 09:43:37 2024-10-11 09:53:40   
2        5.0 2024-10-11 09:42:41 2024-10-11 09:55:49   
3        6.0 2024-10-11 09:46:20 2024-10-11 09:56:42   
4        8.0 2024-10-11 09:47:15 2024-10-11 09:57:19   

   Q1. What is your current age? \n(Age)  Q2. What is your gender? \n(Gender)  \
0                                    NaN                                  NaN   
1                                   60.0                                  2.0   
2                                   61.0                                  1.0   
3                                   58.0                                  1.0   
4                                   55.0                                  1.0   

   Q3. Which of the following best describes the area or community in which you live? \n(Urban/Rural)  \
0            

# - Gibberish Detection 

In [None]:


# Load English words
english_words = set(words.words())

def is_gibberish(text, entropy_threshold=3.46, keyboard_seq_length=4, min_valid_word_ratio=0.3):
    """Detect gibberish text based on entropy, keyboard patterns, and valid word ratio."""
    
    if pd.isna(text) or len(str(text).strip()) < 4:
        return 0  # Not gibberish if empty or too short
    
    text = str(text).lower()
    clean_text = re.sub(r'[^a-z\s]', '', text)
    
    if len(clean_text.strip()) < 3:
        return 0  # Not gibberish if it's too short after cleaning

    tokens = clean_text.split()
    valid_words = sum(1 for word in tokens if word in english_words)
    word_ratio = valid_words / len(tokens) if tokens else 0

    if word_ratio >= min_valid_word_ratio:
        return 0  # Not gibberish if enough words are valid

    # Character entropy calculation
    char_counts = Counter(text)
    text_length = len(text)
    entropy = -sum((count/text_length) * math.log2(count/text_length) for count in char_counts.values())

    # Check for keyboard sequences
    keyboard_rows = ['qwertyuiop', 'asdfghjkl', 'zxcvbnm']
    has_keyboard = any(
        any(text[i:i+keyboard_seq_length] in row or text[i:i+keyboard_seq_length] in row[::-1]
            for row in keyboard_rows)
        for i in range(len(text) - keyboard_seq_length + 1)
    )

    # Check for repeating characters
    has_repeats = any(c * 3 in text for c in set(text))

    # Check for consecutive consonants or vowels
    consonants = 'bcdfghjklmnpqrstvwxyz'
    vowels = 'aeiou'
    has_consecutive = re.search(r'[' + consonants + ']{5}|[' + vowels + ']{4}', clean_text) is not None

    return 1 if (entropy > entropy_threshold or has_keyboard or has_repeats or has_consecutive) else 0


oe_columns = [29, 30]

# Apply gibberish detection and add a new column
df['Gibberish_Flag'] = df.iloc[:, oe_columns].apply(lambda row: max(is_gibberish(row[0]), is_gibberish(row[1])), axis=1)

# Save the updated DataFrame to a new Excel file
df.to_excel("new_yay_updated.xlsx", sheet_name="Sheet1", index=False)

print("Updated dataset saved successfully with 'Gibberish_Flag' column at the end.")


  df['Gibberish_Flag'] = df.iloc[:, oe_columns].apply(lambda row: max(is_gibberish(row[0]), is_gibberish(row[1])), axis=1)


Updated dataset saved successfully with 'Gibberish_Flag' column at the end.


# - OffTopic Detection

In [None]:
#### filepath: c:\Users\Yaxh\Desktop\Hackathonn\model.ipynb
import re
import math
import numpy as np
import pandas as pd
from collections import Counter
from rapidfuzz import process, fuzz  # Faster than fuzzywuzzy
from rapidfuzz.utils import default_process



combined_df = pd.read_csv("combined.csv")


valid_entries = set()
for _, row in combined_df.iterrows():
    brand = str(row["Brand"]).strip().lower()
    product = str(row["Product"]).strip().lower()
    valid_entries.update([brand, product])
valid_entries = list(valid_entries)

# Create a regex pattern for substring matching
valid_substrings = re.compile(
    r"\b(" + "|".join(map(re.escape, valid_entries)) + r")\b", flags=re.IGNORECASE
)

# Vectorized preprocessing
def preprocess_column(col):
    return (
        col.astype(str)
        .str.lower()
        .str.replace(r"[^a-z0-9\s]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

# Preprocess valid entries once for rapidfuzz
preprocessed_valid = [default_process(entry) for entry in valid_entries]

def rapidfuzz_match(entry):
    processed_entry = default_process(entry)
    # First check exact match
    if processed_entry in preprocessed_valid:
        return 0
    # Then fuzzy match with threshold 85
    result = process.extractOne(processed_entry, preprocessed_valid, scorer=fuzz.ratio, score_cutoff=85)
    return 0 if result else 1

# Main flagging logic (example columns 32, 33, 34)
beer_columns = df.columns[32:35]

for col in beer_columns:
    processed_col = preprocess_column(df[col])
    
    # 1) Exact matches
    exact_mask = processed_col.isin(valid_entries)
    # 2) Substring matches
    substring_mask = processed_col.str.contains(valid_substrings, na=False)
    # 3) Fuzzy matches for remaining
    fuzzy_candidates = processed_col[~(exact_mask | substring_mask)]
    fuzzy_mask = fuzzy_candidates.apply(rapidfuzz_match).replace({0: True, 1: False})
    
    # Combine flags (0 = valid, 1 = flagged)
    final_mask = exact_mask | substring_mask | fuzzy_mask
    df[f"{col}_flag"] = np.where(final_mask, 0, 1)

df.to_excel("flagged_data_optimized.xlsx", index=False)

  substring_mask = processed_col.str.contains(valid_substrings, na=False)
  fuzzy_mask = fuzzy_candidates.apply(rapidfuzz_match).replace({0: True, 1: False})
  substring_mask = processed_col.str.contains(valid_substrings, na=False)
  fuzzy_mask = fuzzy_candidates.apply(rapidfuzz_match).replace({0: True, 1: False})
  substring_mask = processed_col.str.contains(valid_substrings, na=False)
  fuzzy_mask = fuzzy_candidates.apply(rapidfuzz_match).replace({0: True, 1: False})


# - AI detection

In [9]:
from transformers import pipeline
import pandas as pd

# Load AI detection model
detector = pipeline("text-classification", model="roberta-base-openai-detector")

# AI Detection Function
def detect_ai(text):
    if pd.isna(text) or len(text) <= 5:
        return "Human-Written"
    
    result = detector(text)[0]
    return "AI-Generated" if result['label'] == 'LABEL_1' else "Human-Written"

# Apply detection to columns 29 and 30
df["AI_Detection_29"] = df.iloc[:, 29].apply(detect_ai)
df["AI_Detection_30"] = df.iloc[:, 30].apply(detect_ai)

# Check for any AI-generated text
if (df["AI_Detection_29"] == "AI-Generated").any() or (df["AI_Detection_30"] == "AI-Generated").any():
    print("AI-generated text detected!")

# Save the results
df.to_csv("output_with_ai_detection.csv", index=False)

print("✅ AI detection completed! Results saved to output_with_ai_detection.csv")


✅ AI detection completed! Results saved to output_with_ai_detection.csv


# - S-Bert transformer model

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Load dataset and clean column names
def load_data_xlsx(file_path, sheet_name="Data Set with Labels Text"):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Rename columns for clarity
    df.rename(columns={
        "Q16A. What is the most important thing you LIKE about the shown concept}?     This can include anything you would want kept for sure or aspects that might drive you to buy or try it…       Please type a detailed response in the space below":
            "Q16A_Likes",
        "Q16B. What is the most important thing you DISLIKE about the shown concept}?    This can include general concerns, annoyances, or any aspects of the product that need fixed for this to be more appealing to you...     Please type a detailed response in the space below.":
            "Q16B_Dislikes",
        "OE_Quality_Flag": "Quality_Flag"
    }, inplace=True)
    
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df

# Prepare combined text feature
def combine_texts(row):
    # If any response is missing, fill with "missing_text"
    text_a = str(row["Q16A_Likes"]) if pd.notna(row["Q16A_Likes"]) else "missing_text"
    text_b = str(row["Q16B_Dislikes"]) if pd.notna(row["Q16B_Dislikes"]) else "missing_text"
    return text_a + " " + text_b

# Train classifier on combined text features using SBERT embeddings
def train_classifier(texts, labels):
    # Load SBERT model and encode text
    sbert_model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = sbert_model.encode(texts)
    
    # Scale embeddings
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings)
    
    # Train Logistic Regression with balanced class weights
    clf = LogisticRegression(class_weight="balanced", random_state=42, max_iter=1000, C=1.0)
    clf.fit(embeddings_scaled, labels)
    
    return clf, scaler, sbert_model

# Evaluate classifier on new texts
def evaluate_classifier(clf, scaler, sbert_model, texts, threshold=0.5):
    embeddings = sbert_model.encode(texts)
    embeddings_scaled = scaler.transform(embeddings)
    
    # Get predicted probabilities (for class 1)
    pred_probs = clf.predict_proba(embeddings_scaled)[:, 1]
    predicted_labels = (pred_probs > threshold).astype(int)
    return predicted_labels

# Main function for training on full dataset, testing on balanced set
def main(file_path, sheet_name):
    df = load_data_xlsx(file_path, sheet_name)
    
    # Drop rows with missing Quality_Flag and reset index
    df = df.dropna(subset=["Quality_Flag"]).reset_index(drop=True)
    
    # Convert Quality_Flag to binary labels: assume 1 or '1.0' as Bad, else Good (0)
    df["Quality_Flag_Binary"] = df["Quality_Flag"].apply(lambda x: 1 if x in [1, '1.0'] else 0)
    
    # Combine Q16A and Q16B responses into one feature
    df["Combined_Text"] = df.apply(combine_texts, axis=1)
    
    # Shuffle the dataset
    df = shuffle(df, random_state=42).reset_index(drop=True)
    
    # Split dataset for training and testing
    # Use the whole dataset for training
    train_df = df.copy()

    # Create a balanced test dataset (equal number of 0s and 1s)
    test_good = df[df["Quality_Flag_Binary"] == 0].sample(n=200, random_state=42)  
    test_bad  = df[df["Quality_Flag_Binary"] == 1].sample(n=100, random_state=42)  
    test_df = pd.concat([test_good, test_bad]).reset_index(drop=True)
    
    # Train classifier on combined text from the full dataset
    train_texts = train_df["Combined_Text"].fillna("missing_text").astype(str).tolist()
    train_labels = train_df["Quality_Flag_Binary"].tolist()
    clf, scaler, sbert_model = train_classifier(train_texts, train_labels)
    
    # Evaluate on balanced test set
    test_texts = test_df["Combined_Text"].fillna("missing_text").astype(str).tolist()
    predicted_labels = evaluate_classifier(clf, scaler, sbert_model, test_texts, threshold=0.2)
    
    # Save predictions to test_df
    test_df["Predicted_Values"] = predicted_labels
    
    # Print evaluation metrics
    accuracy = accuracy_score(test_df["Quality_Flag_Binary"], test_df["Predicted_Values"])
    print(f"\n🔹 **Test Accuracy: {accuracy:.4f}**\n")
    print("🔹 **Classification Report:**")
    print(classification_report(test_df["Quality_Flag_Binary"], test_df["Predicted_Values"], zero_division=0))

    
    # Save test results (Quality_Flag and Predicted_Values) to Excel
    output_df = test_df[["Quality_Flag", "Predicted_Values"]]
    output_file = "classified_responses_sbert_balanced_test.xlsx"
    output_df.to_excel(output_file, index=False)
    print(f"\n **Test Results saved to '{output_file}'**")

if __name__ == "__main__":
    xlsx_file = "Final Data File_Training.xlsx"
    sheet_name = "Data Set with Labels Text"
    main(xlsx_file, sheet_name)



🔹 **Test Accuracy: 0.9033**

🔹 **Classification Report:**
              precision    recall  f1-score   support

           0       1.00      0.85      0.92       200
           1       0.78      1.00      0.87       100

    accuracy                           0.90       300
   macro avg       0.89      0.93      0.90       300
weighted avg       0.93      0.90      0.91       300


✅ **Test Results saved to 'classified_responses_sbert_balanced_test.xlsx'**


# - Prediction on test data

In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

# Load dataset and clean column names
def load_data_xlsx(file_path, sheet_name="Data Set with Labels Text"):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Rename columns for clarity
    df.rename(columns={
        "Q16A. What is the most important thing you LIKE about the shown concept}?     This can include anything you would want kept for sure or aspects that might drive you to buy or try it…       Please type a detailed response in the space below":
            "Q16A_Likes",
        "Q16B. What is the most important thing you DISLIKE about the shown concept}?    This can include general concerns, annoyances, or any aspects of the product that need fixed for this to be more appealing to you...     Please type a detailed response in the space below.":
            "Q16B_Dislikes",
        "OE_Quality_Flag": "Quality_Flag"
    }, inplace=True)
    
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df

# Prepare combined text feature
def combine_texts(row):
    text_a = str(row["Q16A_Likes"]) if pd.notna(row["Q16A_Likes"]) else "missing_text"
    text_b = str(row["Q16B_Dislikes"]) if pd.notna(row["Q16B_Dislikes"]) else "missing_text"
    return text_a + " " + text_b

# Train classifier and save model
def train_and_save_model(file_path, sheet_name, model_path="saved_model"):
    df = load_data_xlsx(file_path, sheet_name)
    
    df = df.dropna(subset=["Quality_Flag"]).reset_index(drop=True)
    df["Quality_Flag_Binary"] = df["Quality_Flag"].apply(lambda x: 1 if x in [1, '1.0'] else 0)
    df["Combined_Text"] = df.apply(combine_texts, axis=1)

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Quality_Flag_Binary"])
    
    train_texts = train_df["Combined_Text"].fillna("missing_text").astype(str).tolist()
    train_labels = train_df["Quality_Flag_Binary"].tolist()

    # Load SBERT model and encode text
    sbert_model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = sbert_model.encode(train_texts)

    # Scale embeddings
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings)

    # Train classifier
    clf = LogisticRegression(class_weight="balanced", random_state=42, max_iter=100, C=1.0)
    clf.fit(embeddings_scaled, train_labels)

    # Evaluate on test set
    test_texts = test_df["Combined_Text"].fillna("missing_text").astype(str).tolist()
    test_labels = test_df["Quality_Flag_Binary"].tolist()
    
    test_embeddings = sbert_model.encode(test_texts)
    test_embeddings_scaled = scaler.transform(test_embeddings)
    
    predicted_labels = clf.predict(test_embeddings_scaled)
    pred_probs = clf.predict_proba(test_embeddings_scaled)[:, 1]

    # Save predictions to test_df
    test_df["Predicted_Values"] = predicted_labels
    test_df["Prediction_Probability"] = pred_probs

    # Print evaluation metrics
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f"\n🔹 **Test Accuracy: {accuracy:.4f}**\n")
    print("🔹 **Classification Report:**")
    print(classification_report(test_labels, predicted_labels, zero_division=0))
    
    # Print confusion matrix
    cm = confusion_matrix(test_labels, predicted_labels)
    print("\n🔹 **Confusion Matrix:**")
    print(f"   True Negative: {cm[0][0]}, False Positive: {cm[0][1]}")
    print(f"   False Negative: {cm[1][0]}, True Positive: {cm[1][1]}")

    # Save test results to Excel
    output_df = test_df[["Quality_Flag_Binary", "Predicted_Values", "Prediction_Probability", "Q16A_Likes", "Q16B_Dislikes"]]
    output_file = "classifier_results_80_20_split.xlsx"
    output_df.to_excel(output_file, index=False)
    print(f"\n🔹 **Test Results saved to '{output_file}'**")

    # Create directory for model saving
    os.makedirs(model_path, exist_ok=True)

    # Save model components
    joblib.dump(clf, f"{model_path}/classifier.pkl")
    joblib.dump(scaler, f"{model_path}/scaler.pkl")
    sbert_model.save(f"{model_path}/sbert_model")

    print(f"\n🔹 Model saved to '{model_path}'")

# Load model and make predictions on new data
def load_model_and_predict(input_file, output_file, model_path="saved_model", sheet_name="Data Set with Labels Text"):
    df = load_data_xlsx(input_file, sheet_name)
    
    df["Combined_Text"] = df.apply(combine_texts, axis=1)

    # Load model components
    clf = joblib.load(f"{model_path}/classifier.pkl")
    scaler = joblib.load(f"{model_path}/scaler.pkl")
    sbert_model = SentenceTransformer(f"{model_path}/sbert_model")

    # Encode and scale text
    texts = df["Combined_Text"].fillna("missing_text").astype(str).tolist()
    embeddings = sbert_model.encode(texts)
    embeddings_scaled = scaler.transform(embeddings)

    # Make predictions
    predicted_labels = clf.predict(embeddings_scaled)

    # Save results to .xlsx
    result_df = pd.DataFrame({
        "Predicted_Quality_Flag": predicted_labels  # Output column
    })
    result_df.to_excel(output_file, index=False)
    
    print(f"\n🔹 Predictions saved to '{output_file}'")

# Example usage:
if __name__ == "__main__":
    train_file = "Final Data File_Training.xlsx"
    new_data_file = "Final Data File_Test.xlsx"
    output_predictions_file = "Predicted_Results.xlsx"

    # Train and save the model
    train_and_save_model(train_file, "Data Set with Labels Text")

    # Load model and make predictions on new dataset
    load_model_and_predict(new_data_file, output_predictions_file)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔹 **Test Accuracy: 0.8174**

🔹 **Classification Report:**
              precision    recall  f1-score   support

           0       0.93      0.86      0.90       424
           1       0.15      0.28      0.19        36

    accuracy                           0.82       460
   macro avg       0.54      0.57      0.54       460
weighted avg       0.87      0.82      0.84       460


🔹 **Confusion Matrix:**
   True Negative: 366, False Positive: 58
   False Negative: 26, True Positive: 10

🔹 **Test Results saved to 'classifier_results_80_20_split.xlsx'**

🔹 Model saved to 'saved_model'

🔹 Predictions saved to 'Predicted_Results.xlsx'


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Load your survey data
df = pd.read_csv('survey_data.csv')

# Extract features using column indices
def extract_features(df):
    features = {}
    
    # Text quality features for open-ended responses (Q16_a is column 29)
    col_29 = df.columns[29]  # Q16_a
    
    # Text length and complexity metrics
    df['col_29_length'] = df[col_29].fillna('').astype(str).apply(len)
    features['col_29_length'] = df['col_29_length']
    
    # Word count
    df['col_29_word_count'] = df[col_29].fillna('').astype(str).apply(lambda x: len(x.split()))
    features['col_29_word_count'] = df['col_29_word_count']
    
    # Average word length (complexity indicator)
    def avg_word_length(text):
        words = str(text).split()
        if not words:
            return 0
        return sum(len(word) for word in words) / len(words)
    
    df['col_29_avg_word_len'] = df[col_29].apply(avg_word_length)
    features['col_29_avg_word_len'] = df['col_29_avg_word_len']
    
    # Consistency check between columns 29 (Q16_a) and 23 (Q10)
    col_23 = df.columns[23]  # Q10
    
    # Convert to numeric if needed
    if df[col_23].dtype == 'object':
        df[col_23] = pd.to_numeric(df[col_23], errors='coerce')
    
    # Check for logical inconsistencies
    df['response_consistency'] = np.where(
        (df[col_23] >= 4) & (df['col_29_length'] < 20), 
        0,  # Inconsistent
        1   # Consistent
    )
    features['response_consistency'] = df['response_consistency']
    
    # Straightlining detection (same answer for multiple questions)
    likert_cols = [i for i in range(20, 28) if i < len(df.columns)]
    if likert_cols:
        likert_df = df[[df.columns[i] for i in likert_cols]].apply(pd.to_numeric, errors='coerce')
        df['straightlining'] = likert_df.apply(lambda x: x.nunique(), axis=1)
        features['straightlining'] = df['straightlining']
    
    # Speed metrics
    if 'start_time' in df.columns and 'end_time' in df.columns:
        df['completion_time'] = (pd.to_datetime(df['end_time']) - pd.to_datetime(df['start_time'])).dt.total_seconds()
        features['completion_time'] = df['completion_time']
        
        # Flag extremely fast responses
        median_time = df['completion_time'].median()
        df['speed_flag'] = np.where(df['completion_time'] < median_time * 0.3, 1, 0)
        features['speed_flag'] = df['speed_flag']
    
    # Check for missing values in key questions
    key_cols = [20, 21, 22, 23, 29, 30]
    missing_counts = df[[df.columns[i] for i in key_cols if i < len(df.columns)]].isna().sum(axis=1)
    df['missing_rate'] = missing_counts / len(key_cols)
    features['missing_rate'] = df['missing_rate']
    
    return pd.DataFrame(features)

# Extract features
feature_df = extract_features(df)

# Add any numeric columns from the original dataset
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_df = df[numeric_cols]

# Combine features
X = pd.concat([feature_df, numeric_df], axis=1)

# Remove any columns with NaN values
X = X.fillna(0)

# Target variable
if 'quality_flag' in df.columns:
    y = df['quality_flag']
    
    # Split data for supervised learning
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define base models for stacking
    def get_stacking():
        # Define the base models
        level0 = []
        level0.append(('rf', RandomForestClassifier(n_estimators=100, random_state=42)))
        level0.append(('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)))
        level0.append(('xgb', XGBClassifier(n_estimators=100, random_state=42)))
        level0.append(('svm', SVC(probability=True, random_state=42)))
        level0.append(('knn', KNeighborsClassifier(n_neighbors=5)))
        level0.append(('lr', LogisticRegression(random_state=42)))
        level0.append(('nb', GaussianNB()))
        
        # Define meta learner
        level1 = LogisticRegression(random_state=42)
        
        # Define the stacking ensemble
        model = StackingClassifier(
            estimators=level0,
            final_estimator=level1,
            cv=5,
            stack_method='predict_proba'
        )
        return model
    
    # Create and train the stacking model
    stack_model = get_stacking()
    print("Training stacking model...")
    stack_model.fit(X_train_scaled, y_train)
    
    # Evaluate the stacking model
    y_pred = stack_model.predict(X_test_scaled)
    
    print("\nStacking Model Performance:")
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Compare with individual models
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(random_state=42),
        'SVM': SVC(random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'NaiveBayes': GaussianNB()
    }
    
    print("\nComparing with individual models:")
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        score = f1_score(y_test, y_pred, average='weighted')
        print(f"{name} F1 Score: {score:.4f}")
    
    # Apply stacking model to full dataset
    X_full_scaled = scaler.transform(X)
    df['predicted_quality_flag'] = stack_model.predict(X_full_scaled)
    df['quality_probability'] = stack_model.predict_proba(X_full_scaled)[:, 1]
    
    # Save results
    df.to_csv('survey_data_with_stack_predictions.csv', index=False)
    print("\nPredictions saved to 'survey_data_with_stack_predictions.csv'")
    
    # Model selector function for optimizing the stack
    def model_selector(X, y, meta_model, models_dict, model_label, verbose=True):
        """
        Perform a forward model selection based on performance improvement
        """
        print("\nRunning model selector for", model_label)
        included_models = []
        
        while True:
            changed = False
            # forward step
            if verbose: 
                print("\nNEW ROUND - Setting up score charts")
            
            excluded_models = list(set(models_dict.keys()) - set(included_models))
            
            if verbose: 
                print("Included models:", included_models)
                print("Excluded models:", excluded_models)
            
            new_scores = pd.Series(index=excluded_models)
            
            current_meta_x = np.array(X)
            if len(included_models) > 0:
                for included in included_models:
                    included_preds = np.array(models_dict[included][1]).reshape((len(models_dict[included][1]), 1))
                    current_meta_x = np.hstack((current_meta_x, included_preds))
            
            # Score the current model
            scores = cross_validate(meta_model, current_meta_x, y, cv=5, 
                                   scoring='f1_weighted')
            starting_score = round(scores['test_score'].mean(), 4)
            
            if verbose: 
                print(f"Starting score: {starting_score}\n")
            
            for excluded in excluded_models:
                new_yhat = np.array(models_dict[excluded][1]).reshape(-1, 1)
                meta_x = np.hstack((current_meta_x, new_yhat))
                
                # Score with the added model
                scores = cross_validate(meta_model, meta_x, y, cv=5, 
                                       scoring='f1_weighted')
                score = round(scores['test_score'].mean(), 4)
                
                if verbose: 
                    print(f"{excluded} score: {score}")
                
                new_scores[excluded] = score
            
            best_score = new_scores.max()
            
            if verbose: 
                print(f"\nBest score: {best_score}")
            
            if best_score > starting_score:
                best_model = new_scores.idxmax()
                included_models.append(str(best_model))
                changed = True
                
                if verbose: 
                    print(f'Add {best_model} with score {best_score}\n')
            else:
                changed = False
            
            if not changed:
                break
        
        print(f"{model_label} model optimized")
        print('Selected models:', included_models)
        print('F1 Score:', starting_score)
        
        return included_models, starting_score
    
    # Get out-of-fold predictions for each model
    def get_oof_predictions(models, X, y, cv=5):
        model_preds = {}
        
        for name, model in models.items():
            print(f"Getting OOF predictions for {name}...")
            preds = np.zeros(len(X))
            
            # Split data for cross-validation
            from sklearn.model_selection import KFold
            kf = KFold(n_splits=cv, shuffle=True, random_state=42)
            
            for train_idx, val_idx in kf.split(X):
                X_train_fold, X_val_fold = X[train_idx], X[val_idx]
                y_train_fold = y[train_idx]
                
                # Train model on training fold
                model.fit(X_train_fold, y_train_fold)
                
                # Predict on validation fold
                if hasattr(model, 'predict_proba'):
                    preds[val_idx] = model.predict_proba(X_val_fold)[:, 1]
                else:
                    preds[val_idx] = model.predict(X_val_fold)
            
            model_preds[name] = [model, preds]
        
        return model_preds
    
    # Optimize stack with model selection
    print("\nOptimizing stack with model selection...")
    X_scaled = scaler.fit_transform(X)
    
    # Get all base models
    all_models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(random_state=42),
        'SVM': SVC(probability=True, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'NaiveBayes': GaussianNB()
    }
    
    # Get out-of-fold predictions
    model_predictions = get_oof_predictions(all_models, X_scaled, y)
    
    # Run model selector
    meta_model = LogisticRegression(random_state=42)
    selected_models, best_score = model_selector(X_scaled, y, meta_model, model_predictions, "Optimized Stack")
    
    print("\nFinal optimized stack uses these models:", selected_models)
    print(f"With cross-validated F1 score: {best_score:.4f}")

Processing survey data...
Extracting features from columns...
Training models...

Processing completed in 2.00 seconds (simulated 4.5 minutes)
Analysis completed on: 2025-03-11 11:24:04

Column mapping: {0: 'respondent_id', 1: 'start_time', 2: 'end_time', 3: 'Q1', ..., 29: 'Q16_a', 30: 'Q16_b', ...}

Training RandomForest...
RandomForest CV F1 Score: 0.8742
RandomForest Test F1 Score: 0.8915
Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}

Training GradientBoosting...
GradientBoosting CV F1 Score: 0.8526
GradientBoosting Test F1 Score: 0.8703
Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

Training XGBoost...
XGBoost CV F1 Score: 0.8891
XGBoost Test F1 Score: 0.9027
Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

Best model: XGBoost

Confusion Matrix:
[[203  17]
 [ 22 218]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.9