# Weak Supervision with Label Studio

### Connect to Label Studio

Create a Label Studio client connected to either the Community or the Enterprise version.

In [1]:
import os

from label_studio_sdk import Client

# Credentials are read from the environment so the API key is never stored in
# the notebook or its version history. LABEL_STUDIO_API_KEY must be set before
# running this cell (a missing key raises KeyError right here, rather than
# failing later with an authentication error). LABEL_STUDIO_URL is optional and
# defaults to a local instance.
LABEL_STUDIO_URL = os.environ.get('LABEL_STUDIO_URL', 'http://localhost:8000')
LABEL_STUDIO_API_KEY = os.environ['LABEL_STUDIO_API_KEY']

ls = Client(url=LABEL_STUDIO_URL, api_key=LABEL_STUDIO_API_KEY)
ls.check_connection()

{'status': 'UP'}

### Create new project

Create a simple Text Classification project for sentiment analysis

In [2]:
# Labeling interface: the task text is shown on top, followed by a boxed
# single-choice sentiment selector (Positive / Negative / Neutral) attached to
# that text.
sentiment_label_config = '''
    <View>
    <Text name="text" value="$text"/>
    <View style="box-shadow: 2px 2px 5px #999; padding: 20px; margin-top: 2em; border-radius: 5px;">
        <Header value="Choose text sentiment"/>
        <Choices name="sentiment" toName="text" choice="single" showInLine="true">
            <Choice value="Positive"/>
            <Choice value="Negative"/>
            <Choice value="Neutral"/>
        </Choices>
    </View>
    </View>
    '''

project = ls.start_project(
    title='Weak Supervision example with SDK',
    label_config=sentiment_label_config,
)

### Import tasks
Import short texts and get their task IDs.

In [3]:
import pandas as pd

# Each row of the tab-separated file becomes one task dict keyed by column name.
reviews_df = pd.read_csv('amazon_cells_labelled.tsv', sep='\t')
tasks = reviews_df.to_dict('records')

# Keep the IDs returned by the import; they are paired with `tasks` later on.
tasks_ids = project.import_tasks(tasks)

### Create noisy predictions
Apply programmatic labeling rules to create weakly supervised annotations

In [4]:
import re, random

# Noisy programmatic labelers: each regex maps to the sentiment it votes for.
# Patterns are grouped by label; insertion order (positive rules first, then
# negative ones) is the order in which the labelers are applied.
label_ops = {
    **dict.fromkeys(
        (
            r'.*\b(good|excellent|great|cool)',
            r'.*\bi\s+like',
        ),
        'Positive',
    ),
    **dict.fromkeys(
        (
            r'.*\bnot',
            r'.*\bdisappointed',
            r'.*\bjunk',
        ),
        'Negative',
    ),
}

# Preannotations in Label Studio JSON format.
#
# The scores are random placeholders. Drawing them from a dedicated, seeded
# generator makes them identical on every "Restart & Run All" without touching
# the global `random` state.
score_rng = random.Random(0)

# Lower-case every text once up front instead of once per labeler.
lowered_texts = [task['text'].lower() for task in tasks]

predictions = []
for label_regex, label in label_ops.items():
    # The regex itself is used as the model version, so each labeler's
    # predictions can be told apart after import.
    model_version = label_regex
    pattern = re.compile(label_regex)
    for text, task_id in zip(lowered_texts, tasks_ids):
        # `match` anchors at the start of the text; the patterns begin with
        # `.*` so the keyword may appear anywhere on the first line.
        if pattern.match(text):
            predictions.append({
                'task': task_id,
                'result': [{
                    'from_name': 'sentiment',
                    'to_name': 'text',
                    'type': 'choices',
                    'value': {
                        'choices': [label]
                    }
                }],
                'score': score_rng.random(),
                'model_version': model_version
            })

project.import_predictions(predictions)

{}

### Check quality of different models
For each programmatic labeler used, we can get different associated stats, like dataset coverage, conflict and ground truth match

In [6]:
# Model versions known to the project; reused below when creating annotations.
model_versions = project.get_model_versions()

# NOTE(review): other per-version statistics (e.g. agreement between versions)
# could be inspected here before selecting specific model versions -- confirm
# the SDK call to use.

# Coverage per model version, shown as a labelled Series.
coverage_by_version = project.get_predictions_coverage()
pd.Series(coverage_by_version, name='Coverage')

.*\bdisappointed                   0.010
.*\b(good|excellent|great|cool)    0.202
.*\bi\s+like                       0.004
.*\bjunk                           0.008
.*\bnot                            0.120
                                   0.000
Name: Coverage, dtype: float64

### Create annotations from specific model versions
Based on the quality metrics from the previous steps, we can select a subset of labelers and "merge" the corresponding preannotations into a single annotation per task

In [9]:
# Use every model version here; pass a shorter list to keep only the labelers
# that looked reliable in the stats above.
selected_versions = list(model_versions)
creation_summary = project.create_annotations_from_predictions(
    model_versions=selected_versions
)
print(creation_summary)

{'detail': 'Created 344 annotations'}
