# Classification: Supervised Machine Learning
## Preparation
### Imports

In [1]:
# general imports
from pathlib import Path

import polars as pl
from tqdm.notebook import tqdm
from src import data

# scikit-learn
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier

# imbalanced-learn
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

### Loading the Data

In [2]:
# relative path of the directory holding the preprocessed datasets
data_directory = '../../../data/datasets/04_preprocessed'

# read everything in that directory into one mapping; further down it is
# iterated as (subject, dataset) pairs
# NOTE(review): presumably type='polars' makes each value a Polars DataFrame
# -- confirm against src.data.dict_from_directory
datasets = data.dict_from_directory(
    data_directory,
    type='polars',
)

### Helper Functions

In [3]:
def create_train_test_sets(df, test_size=0.3, random_state=42):
    """
    Return a stratified train-test split of the data.

    Title and abstract of every row are joined with a single space into one
    text; a missing title or abstract is treated as an empty string.

    Args:
        df: Polars DataFrame with columns 'title', 'abstract', and 'include'.
        test_size: passed through to train_test_split; size of the test
            set. Defaults to 0.3.
        random_state: passed through to train_test_split; seed that makes
            the split reproducible. Defaults to 42.

    Returns:
        X_train: list of strings, training data.
        X_test: list of strings, test data.
        y_train: list of booleans, training labels.
        y_test: list of booleans, test labels.
    """
    # combine title and abstract into one column; nulls are replaced before
    # concatenating because the vectorizer does not accept null values (and
    # a null input would otherwise make the whole concatenation null)
    combined = df.select(
        pl.concat_str(
            pl.col('title').fill_null(''),
            pl.col('abstract').fill_null(''),
            separator=' ',
        ).alias('text')
    )

    # features
    X = combined.to_series().to_list()

    # target
    y = df['include'].to_list()

    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,  # important for reproducibility
        stratify=y,  # important for imbalanced classes
    )

    return X_train, X_test, y_train, y_test

In [4]:
def create_pipeline(estimator):
    """
    Return a pipeline encapsulating a given estimator.

    The pipeline runs three named steps in order:
        'tfidf': Tf-idf vectorization with default settings,
        'undersampling': random undersampling (sampling_strategy='auto',
            seeded with 42),
        'estimator': the estimator passed in.

    Args:
        estimator: scikit-learn estimator, used as the final step.

    Returns:
        pipeline: imbalanced-learn pipeline.
    """
    vectorizer = TfidfVectorizer()
    undersampler = RandomUnderSampler(
        sampling_strategy='auto',
        random_state=42,
    )

    steps = [
        ('tfidf', vectorizer),
        ('undersampling', undersampler),
        ('estimator', estimator),
    ]
    return Pipeline(steps)

## Predictions
### Defining Estimators
Article classes will be predicted by the following four estimators:

In [5]:
# dictionary of estimators to predict with, keyed by the column name their
# predictions are stored under
# estimators use class_weight='balanced' where that option is set below
estimators = {
    'logistic_regression': LogisticRegression(class_weight='balanced'),
    # the random forest is stochastic (bootstrap samples, feature subsets);
    # seed it like the split and the undersampler so reruns give the same
    # predictions
    'random_forest': RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
    ),
    'support_vector_machine': SVC(class_weight='balanced'),
    'naive_bayes': ComplementNB(),
}

## Predicting Article Classes
Predict classes for every combination of dataset and estimator. Store the predictions alongside the true labels of the test set:

In [6]:
# predictions per subject: one DataFrame with the true test labels in column
# 'true' followed by one column of predicted labels per estimator
predictions = {}

for subject, dataset in tqdm(
    iterable=datasets.items(),
    desc='Datasets',
    total=len(datasets),
    leave=True
):

    X_train, X_test, y_train, y_test = create_train_test_sets(dataset)

    # collect the columns first and build the DataFrame once per dataset
    columns = [pl.Series(name='true', values=y_test)]

    for name, estimator in tqdm(
        iterable=estimators.items(),
        desc='Estimators',
        total=len(estimators),
        leave=False,
    ):

        # pipeline around an unfitted copy, so the shared instances in
        # `estimators` are never mutated and no fitted state can carry over
        # from one dataset to the next
        pipeline = create_pipeline(clone(estimator))

        # fit
        pipeline.fit(X_train, y_train)

        # predict
        y_pred = pipeline.predict(X_test)

        # store predictions
        columns.append(pl.Series(name=name, values=y_pred))

    predictions[subject] = pl.DataFrame(columns)

Datasets:   0%|          | 0/6 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

## Export

In [7]:
# where to save the predictions
export_path = '../../../data/predictions/supervised_machine_learning'

# create the target directory if it is missing, so the export also works on
# a fresh checkout instead of failing on the first write
export_directory = Path(export_path)
export_directory.mkdir(parents=True, exist_ok=True)

# one CSV per subject, named '<subject>_pred.csv'
for subject, df in predictions.items():
    df.write_csv(export_directory / f'{subject}_pred.csv')