# Text Classification

This notebook is for specialized text classification, including some using deep learning via TorchSharp.

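This is kept in its own notebook because its TorchSharp setup is specific to Linux and CUDA 12. Note that the dependency cell below currently references the CPU build of libtorch (`libtorch-cpu-linux-x64`), so swap in the matching CUDA package if you want GPU training.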

## Dependencies

In [1]:
#r "nuget:Microsoft.Data.Analysis"
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.AutoML"
#r "nuget:Microsoft.ML.TorchSharp"
#r "nuget:Newtonsoft.Json"
#r "nuget:Plotly.NET"
#r "nuget:Plotly.NET.Interactive"
#r "nuget:libtorch-cpu-linux-x64, 2.1.0.1"
#r "nuget:TorchSharp,0.101.5"

using Microsoft.DotNet.Interactive.Formatting;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.AutoML.CodeGen;
using Microsoft.ML.SearchSpace;
using Microsoft.ML.SearchSpace.Option;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Text;
using Microsoft.ML.TorchSharp;
using Microsoft.ML.TorchSharp.NasBert;
using TorchSharp;
using Newtonsoft.Json;
using System.Reflection;
using System.IO;

Loading extensions from `/home/matteland/.nuget/packages/plotly.net.interactive/5.0.0/lib/netstandard2.1/Plotly.NET.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.data.analysis/0.21.1/interactive-extensions/dotnet/Microsoft.Data.Analysis.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.ml.automl/0.21.1/interactive-extensions/dotnet/Microsoft.ML.AutoML.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/skiasharp/2.88.6/interactive-extensions/dotnet/SkiaSharp.DotNet.Interactive.dll`

In [2]:
// The published NuGet packages are left commented out below; this notebook instead references
// locally built Debug (net8.0) assemblies of the MattEland.ML libraries.
// NOTE(review): the absolute /home/matteland/... paths only resolve on the author's machine —
// re-enable the nuget references (or adjust the paths) to run this notebook anywhere else.
//#r "nuget:MattEland.ML"
//#r "nuget:MattEland.ML.Charts"
//#r "nuget:MattEland.ML.DataFrames"
//#r "nuget:MattEland.ML.Interactive"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML/bin/Debug/net8.0/MattEland.ML.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.DataFrames/bin/Debug/net8.0/MattEland.ML.DataFrames.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Charts/bin/Debug/net8.0/MattEland.ML.Charts.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Interactive/bin/Debug/net8.0/MattEland.ML.Interactive.dll"

using MattEland.ML;
using MattEland.ML.Charts;
using MattEland.ML.DataFrames;
using MattEland.ML.Interactive;

// Load the library's interactive extensions into the root kernel.
// Presumably this is what registers the #!transformer-vis magic command used in later cells — TODO confirm.
await MattEland.ML.Interactive.InteractiveExtensions.Load(Microsoft.DotNet.Interactive.KernelInvocationContext.Current.HandlingKernel.RootKernel);

## Data Loading

In [3]:
// Read the training data (comma-separated, with the first row holding the column names)
DataFrame df = DataFrame.LoadCsv("data/Training.csv", separator: ',', header: true);

// Drop the columns that should not be used as model inputs
df.Columns.Remove(
    "PredictedLabel", "Reasoning",
    "AuthorId", "AuthorDateUtc", "CommitterId", "CommitterDateUtc",
    "ParentSha", "Parent2Sha",
    "DayOfWeek", "Month", "Quarter", "Year", "Hour", "TimeOfDay", "IsWeekend",
    "Sha", "Source");

// The remaining cells expect the target column to be called "Label"
df["ActualLabel"].SetName("Label");

// Preview a handful of random rows
df.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,False,Update README.md,False,0,1,1,0,0,106,2,2,0,False,False,16,2
1,False,Update readme.md,False,0,1,1,0,0,47,0,1,1,False,False,16,2
2,False,Add debug asserts (#1566),False,1,2,2,0,0,3747,57,57,0,False,False,25,4
3,False,update typescript code for message contract changes,False,0,2,2,0,0,662,1,2,1,False,False,51,7
4,False,Update dependencies from https://github.com/dotnet/arcade build 20200224.3,False,0,2,2,0,0,22,0,3,3,False,False,74,6


In [4]:
// Fixed random seed shared by the MLContext and the train/test split.
// It must be declared before its first use: previously it was declared after the MLContext
// was constructed, so on a fresh kernel the context could not be seeded with this value.
int seed = 42;

// GPU device 0 is requested, falling back to the CPU when it is unavailable
MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};

// We'll be using cross-validation for the actual training process, but we'll keep a 10% validation set for final evaluation
var split = context.Data.TrainTestSplit(df, testFraction: 0.1, seed: seed);

In [5]:

// Group every column except the label by kind (text / numeric / categorical) for use when building pipelines
var colTypes = df.GetColumnTypes(excludedColumns: new string[] { "Label" });
colTypes

Unnamed: 0,Unnamed: 1
Text,[ Message ]
Numeric,"[ WorkItems, TotalFiles, ModifiedFiles, AddedFiles, DeletedFiles, TotalLines, NetLines, AddedLines, DeletedLines, MessageLength, WordCount ]"
Categorical,"[ IsMerge, HasAddedFiles, HasDeletedFiles ]"
Excluded,[ Label ]


In [6]:
// Custom tracker that records the evaluation results of each experiment we run.
// Precision is probably the metric we care about most, but we rank by F1 Score during
// model training to favour models that balance precision and recall.
BinaryClassificationModelTracker modelTracker = new() {
    DefaultMetric = BinaryClassificationMetric.F1Score
};
modelTracker

Unnamed: 0,Unnamed: 1
DefaultMetric,F1Score
Count,0


## Simple Featurizer

In [7]:
var context = new MLContext(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};

// Let AutoML build the featurization step from the data frame's columns
var featurizer = context.Auto().Featurizer(df);

// Choose which trainers AutoML may try. For simplicity we only enable the ones
// that don't require scaled data for now.
var classifier = context.Auto().BinaryClassification(
    useFastForest: true,
    useLgbm: true,
    useFastTree: true,
    useLbfgsLogisticRegression: false,
    useSdcaLogisticRegression: false);

// Configure the experiment: 5-fold cross-validation on the training split,
// optimizing F1 Score, exploring at most 10 models
var experiment = context.Auto()
    .CreateExperiment()
    .SetPipeline(featurizer.Append(classifier))
    .SetDataset(split.TrainSet, fold: 5)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

TrialResult result = await experiment.RunAsync();
Console.WriteLine($"F1 Score during training: {result.Metric}");

// Score the held-out test split with the resulting model
ITransformer model = result.Model;
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(
    model.Transform(split.TestSet),
    labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

F1 Score during training: 0.5893818991011038


In [8]:
// Visualize the model from the AutoML experiment using the transformer-vis magic command.
// NOTE(review): -d looks like a depth limit and -n a display switch — confirm against the MattEland.ML.Interactive docs.
#!transformer-vis model -d 1 -n

// Also emit the model itself as the cell's output
model

In [9]:
// Treat the fitted model as a sequence of transformers and pull out the third one (index 2) for a closer look
IEnumerable<ITransformer> enumTransformer = (IEnumerable<ITransformer>)model;
ITransformer textTransformer = enumTransformer.ElementAt(2);
#!transformer-vis textTransformer -d 2

In [10]:
// Print the settings stored on the trial result, skipping keys that begin with an underscore
var parameter = result.TrialSettings.Parameter;

foreach (var name in parameter.Keys) {
    if (name[0] == '_') {
        continue;
    }

    Console.WriteLine($"{name}: {parameter[name]}");
}

e0: {"OutputColumnNames":["WorkItems","TotalFiles","ModifiedFiles","AddedFiles","DeletedFiles","TotalLines","NetLines","AddedLines","DeletedLines","MessageLength","WordCount"],"InputColumnNames":["WorkItems","TotalFiles","ModifiedFiles","AddedFiles","DeletedFiles","TotalLines","NetLines","AddedLines","DeletedLines","MessageLength","WordCount"]}
e1: {"OutputColumnNames":["IsMerge","HasAddedFiles","HasDeletedFiles"],"InputColumnNames":["IsMerge","HasAddedFiles","HasDeletedFiles"],"TargetType":"Single"}
e2: {"InputColumnName":"Message","OutputColumnName":"Message"}
e3: {"InputColumnNames":["Message","WorkItems","TotalFiles","ModifiedFiles","AddedFiles","DeletedFiles","TotalLines","NetLines","AddedLines","DeletedLines","MessageLength","WordCount","IsMerge","HasAddedFiles","HasDeletedFiles"],"OutputColumnName":"Features"}
e4: {"NumberOfLeaves":4,"MinimumExampleCountPerLeaf":20,"NumberOfTrees":4,"MaximumBinCountPerFeature":255,"FeatureFraction":1,"LearningRate":0.09999999999999998,"LabelColu

In [11]:
// Make sure the output folder exists first: the model is written straight to this path,
// so on a fresh checkout without a models/ folder the save would fail.
// CreateDirectory is a no-op when the folder already exists.
Directory.CreateDirectory("models");

// Save the model together with the schema of the data frame it was trained from
context.Model.Save(model, ((IDataView)df).Schema, "models/TextFeaturizerAuto.zip");

// Record the model's evaluation results and show the tracker contents as a data frame
modelTracker.Register("TextFeaturizerAuto", evalResults).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6399999999999999,0.7857142857142857,1,0.4705882352941176,0.7352941176470589,1,0.76,0.7719278760720389


## Custom Pipeline

The prior model did fine, but we'd like more control over the text transformation, so we'll provide our own complex pipeline for text processing.

In [12]:
var context = new MLContext(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};
ContextMonitor contextMonitor = context.Monitor();

// Standardize our numeric columns: impute missing values, then apply robust scaling (both in place)
MissingValueReplacingEstimator imputer = context.Transforms.ReplaceMissingValues(
    columns: colTypes.Numeric.Select(name => new InputOutputColumnPair(name, name)).ToArray(),
    replacementMode: MissingValueReplacingEstimator.ReplacementMode.DefaultValue);
NormalizingEstimator scaler = context.Transforms.NormalizeRobustScaling(
    columns: colTypes.Numeric.Select(name => new InputOutputColumnPair(name, name)).ToArray());

// Convert our boolean columns to singles so they can join the feature vector
TypeConvertingEstimator boolConverter = context.Transforms.Conversion.ConvertType(
    columns: colTypes.Categorical.Select(name => new InputOutputColumnPair(name, name)).ToArray(),
    outputKind: DataKind.Single);

// Text pre-processing: lower-case the message and strip diacritics, punctuation and numbers
TextNormalizingEstimator textNormalizer = context.Transforms.Text.NormalizeText(
    inputColumnName: "Message",
    outputColumnName: "Message",
    caseMode: TextNormalizingEstimator.CaseMode.Lower,
    keepDiacritics: false,
    keepPunctuations: false,
    keepNumbers: false);

// Word n-grams (unigrams, bigrams and trigrams): tokenize, drop English stop words,
// map the words to keys, extract TF-IDF weighted n-grams and L2-normalize the result
WordTokenizingEstimator wordTokenizer = context.Transforms.Text.TokenizeIntoWords(
    inputColumnName: "Message",
    outputColumnName: "MessageWords");
StopWordsRemovingEstimator stopRemover = context.Transforms.Text.RemoveDefaultStopWords(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    language: StopWordsRemovingEstimator.Language.English);
ValueToKeyMappingEstimator labelConverter = context.Transforms.Conversion.MapValueToKey(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords");
NgramExtractingEstimator ngramExtractor = context.Transforms.Text.ProduceNgrams(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    ngramLength: 3,
    useAllLengths: true,
    weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf);
LpNormNormalizingEstimator wordNorm = context.Transforms.NormalizeLpNorm(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    norm: LpNormNormalizingEstimator.NormFunction.L2);

// Character n-grams built from the same normalized message, also L2-normalized
TokenizingByCharactersEstimator charTokenizer = context.Transforms.Text.TokenizeIntoCharactersAsKeys(
    inputColumnName: "Message",
    outputColumnName: "MessageChars");
NgramExtractingEstimator charNgram = context.Transforms.Text.ProduceNgrams(
    inputColumnName: "MessageChars",
    outputColumnName: "MessageChars",
    ngramLength: 3,
    useAllLengths: true,
    skipLength: 1);
LpNormNormalizingEstimator charNorm = context.Transforms.NormalizeLpNorm(
    inputColumnName: "MessageChars",
    outputColumnName: "MessageChars",
    norm: LpNormNormalizingEstimator.NormFunction.L2);

// Concatenate the numeric and boolean columns with the word and character n-gram features
ColumnConcatenatingEstimator concat = context.Transforms.Concatenate(
    "Features",
    inputColumnNames: colTypes.Numeric
        .Concat(colTypes.Categorical)
        .Concat(new[] { "MessageWords", "MessageChars" })
        .ToArray());

// Since we have scaling in place, let's use all available classifiers
var classifier = context.Auto().BinaryClassification(
    useFastForest: true,
    useLgbm: true,
    useFastTree: true,
    useLbfgsLogisticRegression: true,
    useSdcaLogisticRegression: true);

// Chain every step into a single pipeline that ends with the sweepable classifier
SweepablePipeline pipeline = imputer
    .Append(boolConverter)
    .Append(scaler)
    .Append(textNormalizer)
    .Append(wordTokenizer)
    .Append(stopRemover)
    .Append(labelConverter)
    .Append(ngramExtractor)
    .Append(wordNorm)
    .Append(charTokenizer)
    .Append(charNgram)
    .Append(charNorm)
    .Append(concat)
    .Append(classifier);

// Run the experiment with our custom pipeline: 5-fold cross-validation on the
// training split (90% of the data), optimizing F1 Score over at most 10 models
var experiment = context.Auto()
    .CreateExperiment()
    .SetPipeline(pipeline)
    .SetDataset(split.TrainSet, fold: 5)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

var result = await experiment.RunAsync();

// Chart how the metric improved across trials, as recorded by the context monitor
MLCharts.MetricImprovementWithTrials(contextMonitor)

In [13]:
// Take the model from the custom-pipeline experiment result and visualize it
// with the transformer-vis magic command (here with -d 2).
ITransformer model = result.Model;
#!transformer-vis model -d 2

// Also emit the model itself as the cell's output
model

In [14]:
// Show the settings recorded on the trial result; keys starting with '_' are left out
var parameter = result.TrialSettings.Parameter;

foreach (var key in from k in parameter.Keys where k[0] != '_' select k) {
    Console.WriteLine($"{key}: {parameter[key]}");
}

e0: {}
e1: {"NumberOfLeaves":4,"MinimumExampleCountPerLeaf":20,"NumberOfTrees":4,"MaximumBinCountPerFeature":255,"FeatureFraction":1,"LearningRate":0.09999999999999998,"LabelColumnName":"Label","FeatureColumnName":"Features","DiskTranspose":false}
e2: {"NumberOfTrees":4,"NumberOfLeaves":4,"FeatureFraction":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e3: {"NumberOfLeaves":4,"MinimumExampleCountPerLeaf":20,"LearningRate":1,"NumberOfTrees":4,"SubsampleFraction":1,"MaximumBinCountPerFeature":255,"FeatureFraction":1,"L1Regularization":2E-10,"L2Regularization":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e4: {"L1Regularization":1,"L2Regularization":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e5: {"L1Regularization":1,"L2Regularization":0.1,"LabelColumnName":"Label","FeatureColumnName":"Features"}


In [15]:
Console.WriteLine($"F1 Score during training: {result.Metric}");

// Score the held-out test split. The non-calibrated evaluator is used here;
// if the model supports calibration, Evaluate could be used instead.
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(
    model.Transform(split.TestSet),
    labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

F1 Score during training: 0.6135292511763101


In [16]:
// Make sure the output folder exists first: the model is written straight to this path,
// so on a fresh checkout without a models/ folder the save would fail.
// CreateDirectory is a no-op when the folder already exists.
Directory.CreateDirectory("models");

// Save the model together with the schema of the data frame it was trained from
context.Model.Save(model, ((IDataView)df).Schema, "models/CustomPipelineAuto.zip");

// Record the model's evaluation results and show the tracker contents as a data frame
modelTracker.Register("CustomPipelineAuto", evalResults).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6399999999999999,0.7857142857142857,1.0,0.4705882352941176,0.7352941176470589,1.0,0.76,0.7719278760720389
1,CustomPipelineAuto,0.6666666666666667,0.7857142857142857,0.9,0.5294117647058824,0.75,0.96,0.7823529411764706,0.7880012118161495


## Deep Learning
This section uses TorchSharp to fine-tune a pre-trained RoBERTa model on our commit message text.

In ML.NET this is only available as multi-class classification, so we'll run this as a multi-class classification experiment even though there are only 2 classes.

In [17]:
// Work on a copy of the data frame so the boolean-labelled original is left untouched
var dfMultiClass = df.Clone();

// Replace the boolean label with an unsigned integer column (true -> 1, false -> 0)
dfMultiClass["Label"] = new PrimitiveDataFrameColumn<uint>(
    "Label",
    dfMultiClass["Label"].Cast<bool>().Select(isPositive => isPositive ? 1u : 0u));

// Preview a handful of random rows
dfMultiClass.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,0,Update package.json,False,0,1,1,0,0,275,0,1,1,False,False,19,2
1,0,cleanup public api surface,False,0,9,8,1,0,1859,14,50,36,True,False,26,4
2,1,Temporarily remove parent-pid launch argument for Kusto,False,0,1,1,0,0,84,0,1,1,False,False,55,7
3,0,serialize tabular data resource,False,0,2,2,0,0,66,3,4,1,False,False,31,4
4,1,fix devcontainer dotnet image tag,False,0,2,2,0,0,92,0,3,3,False,False,33,5


In [18]:
// Initialize TorchSharp for the CPU device type.
// NOTE(review): the MLContext below still requests GPU device 0 with CPU fallback — confirm which device is intended.
TorchSharp.torch.InitializeDeviceType(DeviceType.CPU);

MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true
};

// Hold out 20% of the multi-class data frame; it is used both as the trainer's validation set
// and for the evaluation in the following cells.
var split = context.Data.TrainTestSplit(dfMultiClass, testFraction: 0.2, seed: seed);

// Map the 0/1 labels to keys ordered BY VALUE so that class index 0 is always label 0 and
// class index 1 is always label 1. The default ordering follows first occurrence in the
// training rows, which would make the class indices assumed by the confusion-matrix cells
// below depend on row order.
var valueToKey = context.Transforms.Conversion.MapValueToKey(
    outputColumnName: "Label",
    inputColumnName: "Label",
    keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue);

// Text classification trainer using the RoBERTa architecture on the raw commit message
var bert = context.MulticlassClassification.Trainers.TextClassification(
    sentence1ColumnName: "Message",
    architecture: BertArchitecture.Roberta,
    validationSet: split.TestSet);

// Convert the predicted key back to the original label value
var keyToValue = context.Transforms.Conversion.MapKeyToValue(
    outputColumnName: "PredictedLabel",
    inputColumnName: "PredictedLabel");

// Train our model using this pipeline
var pipeline = valueToKey.Append(bert).Append(keyToValue);
ITransformer model = pipeline.Fit(split.TrainSet);

// Get basics about the model
#!transformer-vis model -n
model

In [19]:
// NOTE(review): `result` is not reassigned anywhere in the Deep Learning section (the RoBERTa model
// is fit directly with pipeline.Fit), so this prints the trial settings of the earlier AutoML
// experiment rather than anything about the RoBERTa model. It looks like a leftover copy of the
// earlier parameter-printing cell — confirm and remove.
var parameter = result.TrialSettings.Parameter;

// Keys that begin with an underscore are skipped
foreach (var key in parameter.Keys.Where(k => k[0] != '_')) {
    Console.WriteLine($"{key}: {parameter[key]}");
}

e0: {}
e1: {"NumberOfLeaves":4,"MinimumExampleCountPerLeaf":20,"NumberOfTrees":4,"MaximumBinCountPerFeature":255,"FeatureFraction":1,"LearningRate":0.09999999999999998,"LabelColumnName":"Label","FeatureColumnName":"Features","DiskTranspose":false}
e2: {"NumberOfTrees":4,"NumberOfLeaves":4,"FeatureFraction":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e3: {"NumberOfLeaves":4,"MinimumExampleCountPerLeaf":20,"LearningRate":1,"NumberOfTrees":4,"SubsampleFraction":1,"MaximumBinCountPerFeature":255,"FeatureFraction":1,"L1Regularization":2E-10,"L2Regularization":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e4: {"L1Regularization":1,"L2Regularization":1,"LabelColumnName":"Label","FeatureColumnName":"Features"}
e5: {"L1Regularization":1,"L2Regularization":0.1,"LabelColumnName":"Label","FeatureColumnName":"Features"}


In [20]:
// Score the held-out split with the fitted model and compute the multi-class metrics
var evalResults = context.MulticlassClassification.Evaluate(
    model.Transform(split.TestSet));

evalResults

index,value
LogLoss,11.812996704432416
LogLossReduction,-17.419893242596178
MacroAccuracy,0.6008064516129032
MicroAccuracy,0.7127659574468085
TopKAccuracy,0
TopKPredictionCount,0
TopKAccuracyForAllK,<null>
PerClassLogLoss,"[ 3.9769718408481096, 26.995294877627025 ]"
ConfusionMatrix,"Microsoft.ML.Data.ConfusionMatrixPerClassPrecision[ 0.7108433734939759, 0.7272727272727273 ]PerClassRecall[ 0.9516129032258065, 0.25 ]Countsindexvalue0[ 59, 3 ]1[ 24, 8 ]NumberOfClasses2"
,

index,value
PerClassPrecision,"[ 0.7108433734939759, 0.7272727272727273 ]"
PerClassRecall,"[ 0.9516129032258065, 0.25 ]"
Counts,"indexvalue0[ 59, 3 ]1[ 24, 8 ]"
index,value
0,"[ 59, 3 ]"
1,"[ 24, 8 ]"
NumberOfClasses,2

index,value
0,"[ 59, 3 ]"
1,"[ 24, 8 ]"


In [21]:
// Render the confusion matrix with readable class names
// (assumes the names are given in class-index order: 0 = Non-Bugfix, 1 = Bugfix)
MLCharts.RenderConfusionMatrix(
    evalResults.ConfusionMatrix,
    classNames: new List<string> { "Non-Bugfix", "Bugfix" })

In [22]:
// Make sure the output folder exists first: the model is written straight to this path,
// so on a fresh checkout without a models/ folder the save would fail.
// CreateDirectory is a no-op when the folder already exists.
Directory.CreateDirectory("models");

// Save the model together with the schema of the training split
context.Model.Save(model, split.TrainSet.Schema, "models/Roberta.zip");

// Calculate raw counts from the confusion matrix, treating class index 1 as the positive
// (bugfix) class. GetCountForClassPair takes the PREDICTED class index first, then the ACTUAL one.
double tp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 1); // predicted 1, actually 1
double tn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 0); // predicted 0, actually 0
double fp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 0); // predicted 1, actually 0
double fn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 1); // predicted 0, actually 1

// Record the model from its raw counts so it can be compared with the binary models in the tracker
modelTracker.Register("Roberta", tp, fn, fp, tn).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6399999999999999,0.7857142857142857,1.0,0.4705882352941176,0.7352941176470589,1.0,0.76,0.7719278760720389
1,CustomPipelineAuto,0.6666666666666667,0.7857142857142857,0.9,0.5294117647058824,0.75,0.96,0.7823529411764706,0.7880012118161495
2,Roberta,0.3720930232558139,0.7127659574468085,0.7272727272727273,0.25,0.7108433734939759,0.9516129032258064,0.6008064516129032,0.7190580503833516
