In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training table from disk.
df = pd.read_csv('/content/sample_data/final_train.csv')

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_set, test_set_from_train = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Separate the 'Depression' target column from the remaining feature columns.
y_train = train_set['Depression'].copy()
X_train = train_set.drop(columns='Depression')

In [2]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the raw training features taken straight from `train_set`
# instead of on `X_train`. `X_train` is overwritten with the scaled array below,
# so fitting on it would make a second run of this cell fit on data that is
# already standardised, and every later `scaler.transform(...)` call would then
# use the wrong statistics. Deriving the input from `train_set` makes the cell
# safe to re-run while producing the same result on a fresh kernel.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_set.drop('Depression', axis=1))

In [3]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# The five (name, estimator) pairs used as the first layer of the stack.
# Every estimator is seeded with random_state=42.
base_learners = [
    ('lr', LogisticRegression(random_state=42)),
    (
        'rf',
        RandomForestClassifier(
            n_estimators=1000,
            max_depth=14,
            n_jobs=-1,
            random_state=42,
        ),
    ),
    (
        'nonlinear_svc',
        # probability=True is set so the SVC exposes predict_proba.
        SVC(kernel='poly', degree=3, C=1, random_state=42, probability=True),
    ),
    (
        'Ada',
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=3),
            n_estimators=30,
            learning_rate=0.5,
            random_state=42,
        ),
    ),
    (
        'gbf',
        GradientBoostingClassifier(max_depth=3, n_estimators=30, random_state=42),
    ),
]

# Stack the base learners under a random-forest final estimator, with cv=5.
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=RandomForestClassifier(random_state=42),
    cv=5,
)

In [4]:
# Train the stacked ensemble, then predict on the same training rows.
# `fit` returns the fitted estimator, so the two calls can be chained.
y_pred = stacking_clf.fit(X_train, y_train).predict(X_train)



In [5]:
from sklearn.metrics import accuracy_score
# `cross_val_score` was used below without ever being imported, which raises
# NameError on a fresh kernel; import it here so the cell runs on its own.
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the stacked model on the training data.
cross_val_score(stacking_clf, X_train, y_train, cv=3, scoring='accuracy')



array([0.9347548 , 0.93787313, 0.93502132])

In [6]:
# Split the held-out rows into target and features, then scale the features
# with the already-fitted scaler (transform only, no re-fit).
y_test = test_set_from_train['Depression'].copy()
X_test = scaler.transform(test_set_from_train.drop(columns='Depression'))

In [7]:
# Score the held-out predictions against the held-out labels (`y_test`).
# Comparing them with `y_train` fails with a ValueError because the two
# arrays have different numbers of samples.
y_pred = stacking_clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# save final prediction
# Scale the competition test features with the fitted scaler and predict.
test_set = pd.read_csv('/content/sample_data/final_test.csv')
test_set = scaler.transform(test_set)
y_pred = stacking_clf.predict(test_set)

# The 'id' column is read from a separate file (test.csv).
# NOTE(review): this assumes test.csv and final_test.csv list the same rows in
# the same order; confirm against the preprocessing step.
original_test_set = pd.read_csv('/content/sample_data/test.csv')

# Fail with a clear message, before anything is written, if the two files do
# not have the same number of rows.
if len(original_test_set) != len(y_pred):
    raise ValueError(
        f"id/prediction length mismatch: {len(original_test_set)} ids "
        f"vs {len(y_pred)} predictions"
    )

# Use the column directly instead of binding it to a variable called `id`,
# which would shadow the built-in `id()` for the rest of the session.
final_prediction = pd.DataFrame({'id': original_test_set['id'], 'Depression': y_pred})
final_prediction.to_csv('stacking_clf_final_prediction.csv', index=False)