In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [57]:
train_df = pd.read_csv('examples.csv')
test_df = pd.read_csv('test.csv') 

In [58]:
for col in [f"q{i}" for i in range(1, 21)]:
    mean_val = train_df[col].mean()  # Calculate mean
    train_df[col].fillna(round(mean_val), inplace=True)  # Fill NaNs in train data
    test_df[col].fillna(round(mean_val), inplace=True)   # Fill NaNs in test data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(round(mean_val), inplace=True)  # Fill NaNs in train data
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(round(mean_val), inplace=True)   # Fill NaNs in test data


In [59]:
train_df['q21'].fillna("", inplace=True)
train_df['q22'].fillna("", inplace=True)
test_df['q21'].fillna("", inplace=True)
test_df['q22'].fillna("", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['q21'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['q22'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

In [60]:
train_df.head()

Unnamed: 0,Id,q1,q2,q3,q4,q5,q6,q7,q8,q9,...,q14,q15,q16,q17,q18,q19,q20,q21,q22,label
0,bfe3aa97-eb08-43d9-88a5-34ce93509321,6,6.0,1,2,2,2,2,2,1,...,2,2,1,2,1,2,1,"As a surviving child, the weekend seminar with...","""As a surviver chil going to the TAPS Seminar,...",3
1,3b47ee28-e621-4497-949b-0fa0a59044f7,0,4.0,1,2,2,2,0,1,1,...,2,2,0,1,1,2,2,,Como hijo sobreviviente que asistió al Seminar...,3
2,e21808f4-2c7f-4cdf-90f5-daa7ccccc843,0,5.0,2,2,2,2,0,4,2,...,1,2,2,2,2,2,2,,"""The TAPS Seminar was incredibly valuable for ...",4
3,59385c6a-5a11-4348-b0d1-c1f87b56cbd3,5,4.0,0,1,1,1,4,2,1,...,1,3,2,3,3,3,1,,"""I attended the TAPS Seminar recently, and it ...",2
4,6a769ce6-900b-4490-9d67-6a9e8190b78c,0,0.0,1,2,2,2,2,2,1,...,1,2,2,1,1,2,1,"During our unforgettable weekend, we cherishin...",,3


In [61]:
test_df.head()

Unnamed: 0,Id,q1,q2,q3,q4,q5,q6,q7,q8,q9,...,q13,q14,q15,q16,q17,q18,q19,q20,q21,q22
0,a2c83973-8e81-45c9-bb84-37fa8a8b637c,3,4.0,0,1,1,1,4,4,1,...,0,3,1,1,0,0,0,0,,
1,78bca2d4-8824-45ed-80c9-72ef0e4389c6,1,6.0,2,2,2,1,0,0,2,...,2,1,2,2,2,2,2,2,,"""As someone navigating the renewed experience ..."
2,69fc99e5-a555-4b3e-a2e2-6c2a4ce45f36,3,2.0,1,2,2,2,2,2,2,...,2,2,2,2,1,1,2,2,,Attendig the TAPS seminar as a survivin childe...
3,d5d506d9-29bb-404d-9061-3db9ae92d711,4,4.0,0,2,1,2,2,0,0,...,3,2,0,0,2,3,0,2,"""I had a breakthrough at the weekend seminar d...",The TAPS Seminiar give me a unforgittable chan...
4,d0796afc-055d-4ea8-9552-336ad0442de5,1,4.0,0,1,1,0,1,4,1,...,1,3,0,1,1,1,0,0,"""During the weekend seminar with TAPS, my favo...","The TAPS Seminar ws ee-owening, challinging, a..."


In [62]:
print("Missing values in train data:", train_df.isnull().sum().sum())
print("Missing values in test data:", test_df.isnull().sum().sum())

Missing values in train data: 0
Missing values in test data: 0


In [63]:
tfidf_q21 = TfidfVectorizer(max_features=100)
tfidf_q22 = TfidfVectorizer(max_features=100)

q21_train_vectors = tfidf_q21.fit_transform(train_df['q21'].fillna("")).toarray()
q22_train_vectors = tfidf_q22.fit_transform(train_df['q22'].fillna("")).toarray()

q21_test_vectors = tfidf_q21.transform(test_df['q21'].fillna("")).toarray()
q22_test_vectors = tfidf_q22.transform(test_df['q22'].fillna("")).toarray()

# Drop original Q21 and Q22 columns
X_train = train_df.drop(columns=['Id', 'label', 'q21', 'q22']).values
y_train = train_df['label']
X_test = test_df.drop(columns=['Id', 'q21', 'q22']).values

# Concatenate vectorized text features with the rest of the features
X_train = np.hstack((X_train, q21_train_vectors, q22_train_vectors))
X_test = np.hstack((X_test, q21_test_vectors, q22_test_vectors))

In [64]:
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [65]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=50, max_depth=None, max_features='sqrt', min_samples_split=5, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=42)),
    ('svc', SVC(kernel='linear', probability=True, random_state=42))
]

In [66]:
meta_model = LogisticRegression()

In [73]:
stacking_model = ImbPipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=2)),  # Apply SMOTE oversampling
    ('stack', StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5
    ))
])

In [74]:
stacking_model.fit(X_train_split, y_train_split)

In [75]:
y_val_pred = stacking_model.predict(X_val)
val_f1_score = f1_score(y_val, y_val_pred, average='weighted')
print("Stacked Model Validation F1 Score:", val_f1_score)
print("Stacked Model Classification Report:\n", classification_report(y_val, y_val_pred))

Stacked Model Validation F1 Score: 0.8
Stacked Model Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           2       1.00      0.50      0.67         2
           3       0.50      1.00      0.67         1
           4       1.00      1.00      1.00         3
           5       1.00      0.50      0.67         2
           6       0.50      1.00      0.67         1

    accuracy                           0.80        10
   macro avg       0.83      0.83      0.78        10
weighted avg       0.90      0.80      0.80        10

