# Classification: Supervised Machine Learning
This notebook contains the code to classify the articles within each dataset using supervised machine learning models.
## Preparation
### Imports

In [24]:
# general imports
from pathlib import Path

import polars as pl
from src import data

# scikit-learn - machine learning library
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier

# imbalanced-learn - library for dealing with imbalanced datasets
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# loading bars
from tqdm.notebook import tqdm

### Loading the Data

In [16]:
# where the data is stored (relative path to the preprocessed datasets)
data_directory = '../../../data/datasets/04_preprocessed'

# load the data; the prediction cell iterates `datasets.items()` as
# (subject, dataset) pairs
# NOTE(review): `type='polars'` presumably makes every value a Polars
# DataFrame - confirm against `src.data.dict_from_directory`
datasets = data.dict_from_directory(data_directory, type='polars')

### Helper Functions

In [18]:
def create_train_test_sets(df):
    """
    Split a dataset into stratified training and test portions.

    Title and abstract are joined into a single 'text' feature. Features and
    labels are both keyed by the dataset's 'index' column, so test rows can
    be traced back to the original articles.

    Args:
        df: Polars DataFrame with columns 'index', 'title', 'abstract',
            and 'include'.

    Returns:
        X_train: pandas DataFrame with a 'text' column, training data.
        X_test: pandas DataFrame with a 'text' column, test data.
        y_train: pandas Series of 'include' values, training labels.
        y_test: pandas Series of 'include' values, test labels.
        test_indices: pandas Index, the 'index' values of the test rows.
    """
    # title and abstract joined by a space; nulls are replaced with empty
    # strings before concatenating
    text = pl.concat_str(
        pl.col('title').fill_null(''),
        pl.col('abstract').fill_null(''),
        separator=' ',
    ).alias('text')

    # features, indexed by the article index
    X = df.select(pl.col('index'), text).to_pandas().set_index('index')

    # target, carrying the same index as the features
    y = df['include'].to_pandas().set_axis(X.index)

    # 70/30 split; the fixed seed keeps it reproducible and stratifying on y
    # preserves the class ratio in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )

    return X_train, X_test, y_train, y_test, X_test.index

In [22]:
def create_pipeline(estimator):
    """
    Wrap an estimator in a text-classification pipeline.

    The pipeline chains three steps: Tf-idf vectorization ('tfidf'),
    random undersampling ('undersampling'), and the given estimator
    ('estimator').

    Args:
        estimator: scikit-learn estimator used as the final step.

    Returns:
        pipeline: imbalanced-learn pipeline.
    """
    vectorizer = TfidfVectorizer()

    # seeded so that the same samples are dropped on every run
    undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

    steps = [
        ('tfidf', vectorizer),
        ('undersampling', undersampler),
        ('estimator', estimator),
    ]
    return Pipeline(steps)

## Predictions
### Defining Estimators
Article classes will be predicted by the following four estimators:

In [23]:
# dictionary of estimators to predict with
# class weights will be used for estimators which support it
# (ComplementNB has no class_weight parameter)
estimators = {
    'logistic_regression': LogisticRegression(class_weight='balanced'),
    # seeded so the forest is reproducible, like the train-test split and
    # the undersampler (both use random_state=42)
    'random_forest': RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
    ),
    'support_vector_machine': SVC(class_weight='balanced'),
    'naive_bayes': ComplementNB(),
}

## Predicting Article Classes
Predict classes for every combination of dataset and estimator. Save the predictions alongside the true values of the test set:

In [None]:
# dictionary to store predictions for each dataset and estimator
# maps subject -> Polars DataFrame holding the test-set 'index', the 'true'
# label, and one prediction column per estimator
predictions = {}

# iterate over datasets
for subject, dataset in tqdm(
    iterable=datasets.items(),
    desc='Datasets',
    total=len(datasets),
    leave=True
):

    # use the same train-test split for all estimators
    X_train, X_test, y_train, y_test, test_indices = create_train_test_sets(
        dataset
    )

    # dataframe to store true values and predictions per estimator
    predictions[subject] = pl.DataFrame(
        {
            'index': test_indices,
            'true': y_test
        }
    )

    # convert to numpy arrays
    X_train = X_train['text'].to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test['text'].to_numpy()

    # iterate over estimators
    for name, estimator in tqdm(
        iterable=estimators.items(),
        desc='Estimators',
        total=len(estimators),
        leave=False,
    ):

        # create a prediction pipeline around a fresh, unfitted copy of the
        # estimator: fitting the pipeline fits its final step in place, so
        # without the copy the shared objects in `estimators` would be
        # refitted (and overwritten) for every dataset
        pipeline = create_pipeline(clone(estimator))

        # fit the estimator
        pipeline.fit(X_train, y_train)

        # predict class labels
        y_pred = pipeline.predict(X_test)

        # store the predictions as a new column named after the estimator;
        # rows are in the same order as 'index' and 'true'
        predictions[subject] = predictions[subject].with_columns(
            pl.Series(name=name, values=y_pred)
        )

Datasets:   0%|          | 0/6 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

Estimators:   0%|          | 0/4 [00:00<?, ?it/s]

## Export

In [105]:
# where to save the predictions
export_path = '../../../data/predictions/supervised_machine_learning'

# make sure the target directory exists, so the export does not fail (and
# the predictions are not lost) when the folder has not been created yet
Path(export_path).mkdir(parents=True, exist_ok=True)

# write one CSV per dataset, named '<subject>_pred.csv'
for subject, df in predictions.items():
    df.write_csv(f'{export_path}/{subject}_pred.csv')