In [1]:
import pandas as pd

# Machine Learning
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from lightgbm.sklearn import LGBMClassifier

# Seed passed as `random_state` to every estimator in this notebook.
SEED = 100

# Pre-processed train / test tables.
train_df = pd.read_parquet("../data/sentence_train.pq")
test_df = pd.read_parquet("../data/sentence_test.pq")
# TODO: Temp fix to missing ids, will need to include them from processing
# Only the `id` column is used, so skip parsing the rest of the CSV.
test_ids = pd.read_csv("../data/test.csv", usecols=['id'])['id']

# Every column except `target` is a feature; the test table is used as-is.
X, y = train_df.drop(columns=['target']).to_numpy(), train_df['target'].to_numpy()
X_test = test_df.to_numpy()

# The ids are read from a different file than the test features and are paired
# with predictions purely by row position. Check the row counts now so a
# mismatch fails here rather than after a model has already been trained.
if len(test_ids) != len(X_test):
    raise ValueError(
        f"test.csv has {len(test_ids)} ids but sentence_test.pq has "
        f"{len(X_test)} rows; predictions cannot be matched to ids"
    )

In [2]:
# LightGBM: fit on the full training matrix, predict the test matrix, and
# write the (id, target) pairs to a CSV without the DataFrame index.
lgbm = LGBMClassifier(verbosity=-1, random_state=SEED).fit(X, y)
lgbm_pred = lgbm.predict(X_test)

lgbm_submission = pd.DataFrame({'id': test_ids, 'target': lgbm_pred})
lgbm_submission.to_csv("../data/predictions/lgbm_pred.csv", index=False)

In [3]:
# Random forest: fit on the full training matrix, predict the test matrix, and
# write the (id, target) pairs to a CSV without the DataFrame index.
rf = RandomForestClassifier(random_state=SEED).fit(X, y)
rf_pred = rf.predict(X_test)

rf_submission = pd.DataFrame({'id': test_ids, 'target': rf_pred})
rf_submission.to_csv("../data/predictions/rf_pred.csv", index=False)

In [4]:
# Support vector classifier: fit on the full training matrix, predict the test
# matrix, and write the (id, target) pairs to a CSV without the DataFrame index.
svm = SVC(random_state=SEED).fit(X, y)
svm_pred = svm.predict(X_test)

svm_submission = pd.DataFrame({'id': test_ids, 'target': svm_pred})
svm_submission.to_csv("../data/predictions/svm_pred.csv", index=False)

In [5]:
# Gaussian process classifier: fit on the full training matrix, predict the test
# matrix, and write the (id, target) pairs to a CSV without the DataFrame index.
gpc = GaussianProcessClassifier(random_state=SEED).fit(X, y)
gpc_pred = gpc.predict(X_test)

gpc_submission = pd.DataFrame({'id': test_ids, 'target': gpc_pred})
gpc_submission.to_csv("../data/predictions/gpc_pred.csv", index=False)