# Text Classification

This notebook is for specialized text classification, including some using deep learning via TorchSharp.

This is kept in its own notebook because it uses CUDA 12 and Linux

## Dependencies

In [1]:
#r "nuget:Microsoft.Data.Analysis"
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.AutoML"
#r "nuget:Microsoft.ML.TorchSharp"
#r "nuget:Newtonsoft.Json"
#r "nuget:Plotly.NET"
#r "nuget:Plotly.NET.Interactive"
#r "nuget:libtorch-cpu-linux-x64, 2.1.0.1"
#r "nuget:TorchSharp,0.101.5"

using Microsoft.DotNet.Interactive.Formatting;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.AutoML.CodeGen;
using Microsoft.ML.SearchSpace;
using Microsoft.ML.SearchSpace.Option;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Text;
using Microsoft.ML.TorchSharp;
using Microsoft.ML.TorchSharp.NasBert;
using TorchSharp;
using Newtonsoft.Json;
using System.Reflection;
using System.IO;

Loading extensions from `/home/matteland/.nuget/packages/plotly.net.interactive/5.0.0/lib/netstandard2.1/Plotly.NET.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.data.analysis/0.21.1/interactive-extensions/dotnet/Microsoft.Data.Analysis.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.ml.automl/0.21.1/interactive-extensions/dotnet/Microsoft.ML.AutoML.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/skiasharp/2.88.6/interactive-extensions/dotnet/SkiaSharp.DotNet.Interactive.dll`

In [2]:
//#r "nuget:MattEland.ML"
//#r "nuget:MattEland.ML.Charts"
//#r "nuget:MattEland.ML.DataFrames"
//#r "nuget:MattEland.ML.Interactive"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML/bin/Debug/net8.0/MattEland.ML.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.DataFrames/bin/Debug/net8.0/MattEland.ML.DataFrames.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Charts/bin/Debug/net8.0/MattEland.ML.Charts.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Interactive/bin/Debug/net8.0/MattEland.ML.Interactive.dll"

using MattEland.ML;
using MattEland.ML.Charts;
using MattEland.ML.DataFrames;
using MattEland.ML.Interactive;

await MattEland.ML.Interactive.InteractiveExtensions.Load(Microsoft.DotNet.Interactive.KernelInvocationContext.Current.HandlingKernel.RootKernel);

## Data Loading

In [3]:
DataFrame df = DataFrame.LoadCsv("data/Training.csv", separator: ',', header: true);
df.Columns.Remove("PredictedLabel", "Reasoning", "AuthorId", "AuthorDateUtc", "CommitterId", "CommitterDateUtc", "ParentSha", "Parent2Sha", "DayOfWeek", "Month", "Quarter", "Year", "Hour", "TimeOfDay", "IsWeekend", "Sha", "Source");
df["ActualLabel"].SetName("Label");
df.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,False,Lone wolf peek is now fully implemented,False,0,10,7,2,1,387,91,104,13,True,True,39,7
1,False,Updated documentation,False,0,1,1,0,0,45,1,2,1,False,False,21,2
2,True,fix find kernel on composite kernel (#2400),False,1,5,5,0,0,1030,50,52,2,False,False,43,7
3,True,Simplified encounter generation code,False,0,1,1,0,0,180,-6,21,27,False,False,36,4
4,True,Fix bug with order of pixels in the Interleave=true case (#2130),False,1,2,2,0,0,1124,0,4,4,False,False,64,11


In [4]:
int seed = 42;
var colTypes = df.GetColumnTypes(excludedColumns: new[] { "Label" });
colTypes

Unnamed: 0,Unnamed: 1
Text,[ Message ]
Numeric,"[ WorkItems, TotalFiles, ModifiedFiles, AddedFiles, DeletedFiles, TotalLines, NetLines, AddedLines, DeletedLines, MessageLength, WordCount ]"
Categorical,"[ IsMerge, HasAddedFiles, HasDeletedFiles ]"
Excluded,[ Label ]


In [5]:
// Create a custom model tracker to record the various experiments we run
BinaryClassificationModelTracker modelTracker = new();

// Although the metric we probably care the most about is the Precision, we're going to focus on F1 Score during model training in order to encourage discovering the most balanced models between precision and recall
modelTracker.DefaultMetric = BinaryClassificationMetric.F1Score;
modelTracker

Unnamed: 0,Unnamed: 1
DefaultMetric,F1Score
Count,0


## Simple Featurizer

In [7]:
MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};
var featurizer = context.Auto().Featurizer(df);

// The classifier step tells AutoML what model trainers are enabled. We'll focus on those that don't require scaled data for simplicity at the moment
var classifier = context.Auto().BinaryClassification(
    useFastForest: true, 
    useLgbm: true, 
    useFastTree: true, 
    useLbfgsLogisticRegression: false, 
    useSdcaLogisticRegression: false);

// Now let's run our experiment using our custom pipeline
var experiment = context.Auto().CreateExperiment()
    .SetPipeline(featurizer.Append(classifier))
    .SetDataset(df, fold: 5)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

TrialResult result = await experiment.RunAsync();
ITransformer model = result.Model;

var scorer = model.Transform(df);

// If the model supports calibration, we could use Evaluate instead
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(model.Transform(df), labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

In [8]:
#!transformer-vis model -d 1 -n

model

In [9]:
var enumTransformer = ((IEnumerable<Microsoft.ML.ITransformer>) model);
var textTransformer = enumTransformer.ToList()[2]; 
#!transformer-vis textTransformer -d 2

In [10]:
// Save the model
context.Model.Save(model, ((IDataView)df).Schema, $"models/TextFeaturizerAuto.zip");

// Record the model
modelTracker.Register("TextFeaturizerAuto", metrics).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6725663716814161,0.8517034068136272,0.9382716049382716,0.5241379310344828,0.8349282296650717,0.9858757062146892,0.879972725501656,0.8034930448327863


## Custom Pipeline

The prior model did fine, but we'd like more control over the text transformation, so we'll provide our own complex pipeline for text processing.

In [12]:
MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};
ContextMonitor contextMonitor = context.Monitor();

// Standardize our numeric colums via scaling and imputing missing values
MissingValueReplacingEstimator imputer = context.Transforms.ReplaceMissingValues(columns: colTypes.Numeric.Select(c => new InputOutputColumnPair(c, c)).ToArray(), replacementMode: MissingValueReplacingEstimator.ReplacementMode.DefaultValue);
NormalizingEstimator scaler = context.Transforms.NormalizeRobustScaling(columns: colTypes.Numeric.Select(c => new InputOutputColumnPair(c, c)).ToArray());

// Standardize our boolean columns as singles
TypeConvertingEstimator boolConverter = context.Transforms.Conversion.ConvertType(columns: colTypes.Categorical.Select(c => new InputOutputColumnPair(c, c)).ToArray(), outputKind: DataKind.Single);
    
// Text pre-processing
TextNormalizingEstimator textNormalizer = context.Transforms.Text.NormalizeText(inputColumnName: "Message", outputColumnName: "Message", caseMode: TextNormalizingEstimator.CaseMode.Lower, keepDiacritics: false, keepPunctuations: false, keepNumbers: false);

// Word trigrams / bigrams / unigrams
WordTokenizingEstimator wordTokenizer = context.Transforms.Text.TokenizeIntoWords(inputColumnName: "Message", outputColumnName: "MessageWords");
StopWordsRemovingEstimator stopRemover = context.Transforms.Text.RemoveDefaultStopWords(inputColumnName: "MessageWords", outputColumnName: "MessageWords", language: StopWordsRemovingEstimator.Language.English);
ValueToKeyMappingEstimator labelConverter = context.Transforms.Conversion.MapValueToKey(inputColumnName: "MessageWords", outputColumnName: "MessageWords");
NgramExtractingEstimator ngramExtractor = context.Transforms.Text.ProduceNgrams(inputColumnName: "MessageWords", outputColumnName: "MessageWords", ngramLength: 3, useAllLengths: true, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf);
LpNormNormalizingEstimator wordNorm = context.Transforms.NormalizeLpNorm(inputColumnName: "MessageWords", outputColumnName: "MessageWords", norm: LpNormNormalizingEstimator.NormFunction.L2);

// Character ngrams
TokenizingByCharactersEstimator charTokenizer = context.Transforms.Text.TokenizeIntoCharactersAsKeys(inputColumnName: "Message", outputColumnName: "MessageChars");
NgramExtractingEstimator charNgram = context.Transforms.Text.ProduceNgrams(inputColumnName: "MessageChars", outputColumnName: "MessageChars", ngramLength: 3, useAllLengths: true, skipLength: 1);
LpNormNormalizingEstimator charNorm = context.Transforms.NormalizeLpNorm(inputColumnName: "MessageChars", outputColumnName: "MessageChars", norm: LpNormNormalizingEstimator.NormFunction.L2);

// We'll concatenate the word and Ngram features together, along with all of our numeric and boolean columns
ColumnConcatenatingEstimator concat = context.Transforms.Concatenate("Features", inputColumnNames: colTypes.Numeric.Concat(colTypes.Categorical).Concat(new[] { "MessageWords", "MessageChars"}).ToArray());

// Since we have scaling in place, let's use all available classifiers
var classifier = context.Auto().BinaryClassification(
    useFastForest: true, 
    useLgbm: true, 
    useFastTree: true, 
    useLbfgsLogisticRegression: true, 
    useSdcaLogisticRegression: true);

// Build a pipeline
SweepablePipeline pipeline = imputer
    .Append(boolConverter)
    .Append(scaler)
    .Append(textNormalizer)
    .Append(wordTokenizer)
    .Append(stopRemover)
    .Append(labelConverter)
    .Append(ngramExtractor)
    .Append(wordNorm)
    .Append(charTokenizer)
    .Append(charNgram)
    .Append(charNorm)
    .Append(concat)
    .Append(classifier);

// Now let's run our experiment using our custom pipeline
var experiment = context.Auto().CreateExperiment()
    .SetPipeline(pipeline)
    .SetDataset(df, fold: 5)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

TrialResult result = await experiment.RunAsync();
ITransformer model = result.Model;

MLCharts.MetricImprovementWithTrials(contextMonitor)

In [13]:
ITransformer model = result.Model;
#!transformer-vis model -d 2

model

In [22]:
var scorer = model.Transform(df);

// If the model supports calibration, we could use Evaluate instead
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(model.Transform(df), labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

Error: System.InvalidOperationException: Could not apply a map over type 'UInt32' to column 'Label' since it has type 'Boolean'
   at Microsoft.ML.Transforms.ValueToKeyMappingTransformer.Mapper..ctor(ValueToKeyMappingTransformer parent, DataViewSchema inputSchema)
   at Microsoft.ML.Transforms.ValueToKeyMappingTransformer.MakeRowMapper(DataViewSchema schema)
   at Microsoft.ML.Data.RowToRowTransformerBase.GetOutputSchema(DataViewSchema inputSchema)
   at Microsoft.ML.Data.TransformerChain`1.GetOutputSchema(DataViewSchema inputSchema, TransformerScope scope)
   at Microsoft.ML.Data.TransformerChain`1.Transform(IDataView input, TransformerScope scope)
   at Submission#27.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

In [15]:
// Save the model
context.Model.Save(model, ((IDataView)df).Schema, $"models/CustomPipelineAuto.zip");

// Record the model
modelTracker.Register("CustomPipelineAuto", evalResults).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6725663716814161,0.8517034068136272,0.9382716049382716,0.5241379310344828,0.8349282296650717,0.9858757062146892,0.879972725501656,0.8034930448327863
1,CustomPipelineAuto,0.6725663716814161,0.8517034068136272,0.9382716049382716,0.5241379310344828,0.8349282296650717,0.9858757062146892,0.879972725501656,0.8034930448327863


## Deep Learning
This uses TorchSharp and Roberta to fine-tune a model onto our text examples.

In ML.NET this is only available as multi-class classification, so we'll run this as a multi-class classification experiment even though there are only 2 classes.

In [16]:
var dfMultiClass = df.Clone();
dfMultiClass["Label"] = new PrimitiveDataFrameColumn<uint>("Label", dfMultiClass["Label"].Cast<bool>().Select(x => x ? (uint)1 : (uint)0));
dfMultiClass.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,1,rework connect api,False,0,24,16,4,4,2135,61,317,256,True,True,18,3
1,0,"deferred command have token starting with label ""deferredCommand::""",False,0,4,4,0,0,1544,1,8,7,False,False,67,8
2,1,Address assembly reference issue and add test,False,0,2,2,0,0,632,23,25,2,False,False,45,7
3,0,enable #!share on js for vscode,False,0,1,1,0,0,58,2,2,0,False,False,31,6
4,0,Update dependencies from https://github.com/dotnet/arcade build 20230411.2 (#2908),False,1,3,3,0,0,169,0,6,6,False,False,82,7


In [17]:
TorchSharp.torch.InitializeDeviceType(DeviceType.CPU);

MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true
};

var split = context.Data.TrainTestSplit(dfMultiClass, testFraction: 0.3, seed: seed);

var valueToKey = context.Transforms.Conversion.MapValueToKey(outputColumnName: "Label", inputColumnName: "Label");
var bert = context.MulticlassClassification.Trainers.TextClassification(sentence1ColumnName: "Message", architecture: BertArchitecture.Roberta, validationSet: split.TestSet);
var keyToValue = context.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel");

// Train our model using this pipeline
var pipeline = valueToKey.Append(bert).Append(keyToValue);
ITransformer model = pipeline.Fit(split.TrainSet);

// Get basics about the model
#!transformer-vis model -n
model

In [18]:
var scorer = model.Transform(split.TestSet);
var evalResults = context.MulticlassClassification.Evaluate(model.Transform(split.TestSet));

evalResults

index,value
LogLoss,12.313935586506844
LogLossReduction,-18.598609164838216
MacroAccuracy,0.6662368364496024
MicroAccuracy,0.7671232876712328
TopKAccuracy,0
TopKPredictionCount,0
TopKAccuracyForAllK,<null>
PerClassLogLoss,"[ 6.883503274328664, 23.752505775988542 ]"
ConfusionMatrix,"Microsoft.ML.Data.ConfusionMatrixPerClassPrecision[ 0.7642276422764228, 0.782608695652174 ]PerClassRecall[ 0.9494949494949495, 0.3829787234042553 ]Countsindexvalue0[ 94, 5 ]1[ 29, 18 ]NumberOfClasses2"
,

index,value
PerClassPrecision,"[ 0.7642276422764228, 0.782608695652174 ]"
PerClassRecall,"[ 0.9494949494949495, 0.3829787234042553 ]"
Counts,"indexvalue0[ 94, 5 ]1[ 29, 18 ]"
index,value
0,"[ 94, 5 ]"
1,"[ 29, 18 ]"
NumberOfClasses,2

index,value
0,"[ 94, 5 ]"
1,"[ 29, 18 ]"


In [19]:
MLCharts.RenderConfusionMatrix(evalResults.ConfusionMatrix, classNames: new List<string> {"Non-Bugfix", "Bugfix"})

In [20]:
double tp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 1);
double tn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 0);
double fp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 0);
double fn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 1);

In [21]:
// Save the model
context.Model.Save(model, split.TrainSet.Schema, "models/Roberta.zip");

// Calculate raw counts
double tp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 1);
double tn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 0);
double fp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 0);
double fn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 1);

// Record the model
modelTracker.Register("Roberta", tp, fn, fp, tn).ToDataFrame()

index,Model,F1 Score,Accuracy,Positive Precision,Positive Recall,Negative Precision,Negative Recall,AUC,AUCPR
0,TextFeaturizerAuto,0.6725663716814161,0.8517034068136272,0.9382716049382716,0.5241379310344828,0.8349282296650717,0.9858757062146892,0.879972725501656,0.8034930448327863
1,CustomPipelineAuto,0.6725663716814161,0.8517034068136272,0.9382716049382716,0.5241379310344828,0.8349282296650717,0.9858757062146892,0.879972725501656,0.8034930448327863
2,Roberta,0.5142857142857142,0.7671232876712328,0.782608695652174,0.3829787234042553,0.7642276422764228,0.9494949494949496,0.6662368364496024,0.7734181689642984
