# Text Classification

This notebook is for specialized text classification, including some using deep learning via TorchSharp.

It is kept in its own notebook because it depends on CUDA 12 and Linux.

## Dependencies

In [1]:
#r "nuget:Microsoft.Data.Analysis"
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.AutoML"
#r "nuget:Microsoft.ML.TorchSharp"
#r "nuget:Newtonsoft.Json"
#r "nuget:Plotly.NET"
#r "nuget:Plotly.NET.Interactive"
#r "nuget:libtorch-cpu-linux-x64, 2.1.0.1"
#r "nuget:TorchSharp,0.101.5"

using Microsoft.DotNet.Interactive.Formatting;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.AutoML.CodeGen;
using Microsoft.ML.SearchSpace;
using Microsoft.ML.SearchSpace.Option;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Text;
using Microsoft.ML.TorchSharp;
using Microsoft.ML.TorchSharp.NasBert;
using TorchSharp;
using Newtonsoft.Json;
using System.Reflection;

Loading extensions from `/home/matteland/.nuget/packages/plotly.net.interactive/5.0.0/lib/netstandard2.1/Plotly.NET.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.data.analysis/0.21.1/interactive-extensions/dotnet/Microsoft.Data.Analysis.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/microsoft.ml.automl/0.21.1/interactive-extensions/dotnet/Microsoft.ML.AutoML.Interactive.dll`

Loading extensions from `/home/matteland/.nuget/packages/skiasharp/2.88.6/interactive-extensions/dotnet/SkiaSharp.DotNet.Interactive.dll`

In [3]:
// Published NuGet packages for the MattEland.ML helper libraries (currently disabled
// in favor of the local Debug builds referenced below).
//#r "nuget:MattEland.ML"
//#r "nuget:MattEland.ML.Charts"
//#r "nuget:MattEland.ML.DataFrames"
//#r "nuget:MattEland.ML.Interactive"
// Local Debug builds of the same libraries. These absolute paths only exist on the
// author's machine; adjust them (or switch to the NuGet references above) elsewhere.
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML/bin/Debug/net8.0/MattEland.ML.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.DataFrames/bin/Debug/net8.0/MattEland.ML.DataFrames.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Charts/bin/Debug/net8.0/MattEland.ML.Charts.dll"
#r "/home/matteland/Documents/MattEland.ML/MattEland.ML/MattEland.ML.Interactive/bin/Debug/net8.0/MattEland.ML.Interactive.dll"

using MattEland.ML;
using MattEland.ML.Charts;
using MattEland.ML.DataFrames;
using MattEland.ML.Interactive;

// Directly referenced DLLs are loaded manually here by passing the notebook's root kernel.
// NOTE(review): presumably this registers the library's formatters and magic commands
// (such as #!transformer-vis) — confirm against MattEland.ML.Interactive.
await MattEland.ML.Interactive.InteractiveExtensions.Load(Microsoft.DotNet.Interactive.KernelInvocationContext.Current.HandlingKernel.RootKernel);

## Data Loading / Splitting

In [4]:
// Read the training data; the first row of the CSV holds the column names
DataFrame df = DataFrame.LoadCsv(
    "data/Training.csv",
    separator: ',',
    header: true);

// Drop the columns that the rest of this notebook does not use
df.Columns.Remove(
    "PredictedLabel", "Reasoning",
    "AuthorId", "AuthorDateUtc", "CommitterId", "CommitterDateUtc",
    "ParentSha", "Parent2Sha",
    "DayOfWeek", "Month", "Quarter", "Year", "Hour", "TimeOfDay", "IsWeekend",
    "Sha", "Source");

// Rename the ground-truth column to "Label", the name the later cells refer to
df["ActualLabel"].SetName("Label");

// Preview five rows
df.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,True,revert dependency on sdk acquisition and cache location of dotnet tool (#1094),False,1,6,6,0,0,1703,42,64,22,False,False,78,12
1,False,update system drawing,False,0,1,1,0,0,47,0,1,1,False,False,21,3
2,False,compute cursor start for replacement,False,0,1,1,0,0,238,1,2,1,False,False,36,5
3,False,Updated branch version (#5400),False,1,1,1,0,0,40,0,2,2,False,False,30,4
4,False,Merged PR 1684342: support for utf8 encoding and emoji,False,0,1,1,0,0,1,0,1,1,False,False,54,9


In [5]:
// One seed, reused for the MLContext and for the train/test split
int seed = 42;
MLContext context = new MLContext(seed: seed);

// Hold out 30% of the rows as the test set
DataOperationsCatalog.TrainTestData split = context.Data.TrainTestSplit(
    df,
    testFraction: 0.3,
    seed: seed);

// Group the columns by kind, keeping "Label" out of the feature groups
var colTypes = df.GetColumnTypes(excludedColumns: new[] { "Label" });
colTypes

Unnamed: 0,Unnamed: 1
Text,[ Message ]
Numeric,"[ WorkItems, TotalFiles, ModifiedFiles, AddedFiles, DeletedFiles, TotalLines, NetLines, AddedLines, DeletedLines, MessageLength, WordCount ]"
Categorical,"[ IsMerge, HasAddedFiles, HasDeletedFiles ]"
Excluded,[ Label ]


## Simple Featurizer

In [11]:
// Fresh context for this experiment: GPU 0 is requested, with fallback to the CPU enabled
MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true,
};

// Let AutoML build the featurization steps from the data frame's columns
var featurizer = context.Auto().Featurizer(df);

// The classifier step tells AutoML what model trainers are enabled. We'll focus on those that don't require scaled data for simplicity at the moment
var classifier = context.Auto().BinaryClassification(
    useFastForest: true, 
    useLgbm: true, 
    useFastTree: true, 
    useLbfgsLogisticRegression: false, 
    useSdcaLogisticRegression: false);

// Now let's run our experiment using our custom pipeline, optimizing for F1 over at most 10 models
var experiment = context.Auto().CreateExperiment()
    .SetPipeline(featurizer.Append(classifier))
    .SetDataset(split)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

TrialResult result = await experiment.RunAsync();
ITransformer model = result.Model;

// Score the held-out test set once and reuse the result for evaluation
// (previously the test set was transformed a second time and `scorer` went unused)
var scorer = model.Transform(split.TestSet);

// If the model supports calibration, we could use Evaluate instead
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(scorer, labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

In [12]:
// Display the model returned by the AutoML experiment
model

In [13]:
// Visualize the trained model with the transformer-vis magic command.
// NOTE(review): -d presumably sets the depth to display and -n toggles a display option — confirm against the extension that provides this command
#!transformer-vis model -d 1 -n

In [14]:
// Treat the trained model as a sequence of its component transformers
IEnumerable<ITransformer> enumTransformer = (IEnumerable<ITransformer>)model;

// Take the third transformer in the sequence and visualize it on its own
// NOTE(review): index 2 is presumably the text featurization step — confirm against the model visualization
ITransformer textTransformer = enumTransformer.ElementAt(2);
#!transformer-vis textTransformer -d 2

## Custom Pipeline

The prior model did fine, but we'd like more control over the text transformation, so we'll provide our own complex pipeline for text processing.

In [15]:
// New context for the custom pipeline: GPU 0 is requested, with fallback to the CPU enabled
MLContext context = new(seed: seed)
{
    GpuDeviceId = 0,
    FallbackToCpu = true,
};

// Monitor attached to the context; it is handed to the chart at the end of this cell
ContextMonitor contextMonitor = context.Monitor();

// Numeric columns: replace missing values with the default value, then apply robust scaling (both in place)
MissingValueReplacingEstimator imputer = context.Transforms.ReplaceMissingValues(
    columns: colTypes.Numeric.Select(name => new InputOutputColumnPair(name, name)).ToArray(),
    replacementMode: MissingValueReplacingEstimator.ReplacementMode.DefaultValue);
NormalizingEstimator scaler = context.Transforms.NormalizeRobustScaling(
    columns: colTypes.Numeric.Select(name => new InputOutputColumnPair(name, name)).ToArray());

// Boolean columns: convert to Single in place
TypeConvertingEstimator boolConverter = context.Transforms.Conversion.ConvertType(
    columns: colTypes.Categorical.Select(name => new InputOutputColumnPair(name, name)).ToArray(),
    outputKind: DataKind.Single);

// Message text: lower-case it and drop diacritics, punctuation and numbers
TextNormalizingEstimator textNormalizer = context.Transforms.Text.NormalizeText(
    inputColumnName: "Message",
    outputColumnName: "Message",
    caseMode: TextNormalizingEstimator.CaseMode.Lower,
    keepDiacritics: false,
    keepPunctuations: false,
    keepNumbers: false);

// Word features: tokenize, remove English stop words, map tokens to keys,
// then produce TF-IDF weighted n-grams of length 1 to 3 and L2-normalize them
WordTokenizingEstimator wordTokenizer = context.Transforms.Text.TokenizeIntoWords(
    inputColumnName: "Message",
    outputColumnName: "MessageWords");
StopWordsRemovingEstimator stopRemover = context.Transforms.Text.RemoveDefaultStopWords(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    language: StopWordsRemovingEstimator.Language.English);
// Despite its name, labelConverter maps the word tokens (not the label) to keys
ValueToKeyMappingEstimator labelConverter = context.Transforms.Conversion.MapValueToKey(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords");
NgramExtractingEstimator ngramExtractor = context.Transforms.Text.ProduceNgrams(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    ngramLength: 3,
    useAllLengths: true,
    weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf);
LpNormNormalizingEstimator wordNorm = context.Transforms.NormalizeLpNorm(
    inputColumnName: "MessageWords",
    outputColumnName: "MessageWords",
    norm: LpNormNormalizingEstimator.NormFunction.L2);

// Character features: tokenize into character keys, then produce n-grams of length 1 to 3
// (skip length 1) and L2-normalize them
TokenizingByCharactersEstimator charTokenizer = context.Transforms.Text.TokenizeIntoCharactersAsKeys(
    inputColumnName: "Message",
    outputColumnName: "MessageChars");
NgramExtractingEstimator charNgram = context.Transforms.Text.ProduceNgrams(
    inputColumnName: "MessageChars",
    outputColumnName: "MessageChars",
    ngramLength: 3,
    useAllLengths: true,
    skipLength: 1);
LpNormNormalizingEstimator charNorm = context.Transforms.NormalizeLpNorm(
    inputColumnName: "MessageChars",
    outputColumnName: "MessageChars",
    norm: LpNormNormalizingEstimator.NormFunction.L2);

// Gather everything into a single "Features" column: numeric, boolean, word n-gram and character n-gram features
ColumnConcatenatingEstimator concat = context.Transforms.Concatenate(
    "Features",
    inputColumnNames: colTypes.Numeric
        .Concat(colTypes.Categorical)
        .Concat(new[] { "MessageWords", "MessageChars"})
        .ToArray());

// With scaling in place, every available classifier is enabled
var classifier = context.Auto().BinaryClassification(
    useFastForest: true,
    useLgbm: true,
    useFastTree: true,
    useLbfgsLogisticRegression: true,
    useSdcaLogisticRegression: true);

// Chain the steps: numeric/boolean preparation, word features, character features, concatenation, classifier
SweepablePipeline pipeline = imputer
    .Append(boolConverter)
    .Append(scaler)
    .Append(textNormalizer)
    .Append(wordTokenizer)
    .Append(stopRemover)
    .Append(labelConverter)
    .Append(ngramExtractor)
    .Append(wordNorm)
    .Append(charTokenizer)
    .Append(charNgram)
    .Append(charNorm)
    .Append(concat)
    .Append(classifier);

// Run the experiment on the custom pipeline, optimizing for F1 over at most 10 models
var experiment = context.Auto().CreateExperiment()
    .SetPipeline(pipeline)
    .SetDataset(split)
    .SetBinaryClassificationMetric(BinaryClassificationMetric.F1Score, labelColumn: "Label")
    .SetMaxModelToExplore(10);

TrialResult result = await experiment.RunAsync();
ITransformer model = result.Model;

// Chart the monitored trials
MLCharts.MetricImprovementWithTrials(contextMonitor)

In [16]:
// Take the model produced by the custom-pipeline experiment
ITransformer model = result.Model;

// Score the held-out test set once and reuse the result for evaluation
// (previously the test set was transformed a second time and `scorer` went unused)
var scorer = model.Transform(split.TestSet);

// If the model supports calibration, we could use Evaluate instead
var evalResults = context.BinaryClassification.EvaluateNonCalibrated(scorer, labelColumnName: "Label");

// Let's see how it performed
MLCharts.ClassificationReport(evalResults)

In [17]:
// Display the model produced by the custom-pipeline experiment
model

In [22]:
// Visualize the custom-pipeline model with the transformer-vis magic command.
// NOTE(review): -d presumably sets the depth to display and -n toggles a display option — confirm against the extension that provides this command
#!transformer-vis model -d 2 -n

## Deep Learning
This uses TorchSharp and RoBERTa to fine-tune a model on our text examples.

In ML.NET this is only available as multi-class classification, so we'll run this as a multi-class classification experiment even though there are only 2 classes.

In [46]:
// Work on a copy so the original data frame keeps its boolean Label column
DataFrame dfMultiClass = df.Clone();

// Replace the boolean Label column with an unsigned-integer one: true becomes 1, false becomes 0
dfMultiClass["Label"] = new PrimitiveDataFrameColumn<uint>(
    "Label",
    dfMultiClass["Label"].Cast<bool>().Select(isPositive => isPositive ? 1u : 0u));

// Preview five rows
dfMultiClass.Sample(5)

index,Label,Message,IsMerge,WorkItems,TotalFiles,ModifiedFiles,AddedFiles,DeletedFiles,TotalLines,NetLines,AddedLines,DeletedLines,HasAddedFiles,HasDeletedFiles,MessageLength,WordCount
0,0,add copyright header,False,0,2,2,0,0,148,6,6,0,False,False,20,3
1,1,Better author testing,False,0,7,5,2,0,356,55,122,67,True,False,21,3
2,0,Mark Microsoft.Extensions.ML as a stable package. (#4356),False,1,2,2,0,0,57,2,2,0,False,False,57,7
3,0,Update dependencies from https://github.com/dotnet/arcade build 20200805.7,False,0,5,5,0,0,1010,36,43,7,False,False,74,6
4,0,Update readme.md,False,0,1,1,0,0,9,0,1,1,False,False,16,2


In [47]:
// Initialize TorchSharp for CPU execution before the trainer is created
TorchSharp.torch.InitializeDeviceType(DeviceType.CPU);

// NOTE(review): the context still requests GPU 0 (with CPU fallback) although TorchSharp
// is initialized for the CPU above — confirm which device is intended
MLContext context = new(seed: seed) {
    GpuDeviceId = 0,
    FallbackToCpu = true
};

// Hold out 30% of the rows as the test set
var split = context.Data.TrainTestSplit(dfMultiClass, testFraction: 0.3, seed: seed);

// Convert the numeric label to a key, as the multi-class trainer works on key-typed labels.
// Keys are ordered by value rather than by the default order of first occurrence, so
// label 0 always becomes class index 0 and label 1 class index 1, regardless of which
// label the first training row happens to carry. Code that reads the confusion matrix
// by class index relies on that order.
var valueToKey = context.Transforms.Conversion.MapValueToKey(
    outputColumnName: "Label",
    inputColumnName: "Label",
    keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue);

// Fine-tune a RoBERTa text classifier on the Message column
// NOTE(review): the test set doubles as the validation set here; if validation influences
// training, the later evaluation is not on fully unseen data — consider a separate validation split
var bert = context.MulticlassClassification.Trainers.TextClassification(sentence1ColumnName: "Message", architecture: BertArchitecture.Roberta, validationSet: split.TestSet);

// Map the predicted key back to the original label value
var keyToValue = context.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel");

// Train our model using this pipeline
var pipeline = valueToKey.Append(bert).Append(keyToValue);
ITransformer model = pipeline.Fit(split.TrainSet);

// Get basics about the model
#!transformer-vis model -n
model

In [48]:
// Score the held-out test set once and evaluate those predictions
// (previously the test set was transformed a second time and `scorer` went unused)
var scorer = model.Transform(split.TestSet);
var evalResults = context.MulticlassClassification.Evaluate(scorer);

// Display the multi-class metrics
evalResults

index,value
LogLoss,13.648578322890906
LogLossReduction,-20.72279937042476
MacroAccuracy,0.5989684074790458
MicroAccuracy,0.6986301369863014
TopKAccuracy,0
TopKPredictionCount,0
TopKAccuracyForAllK,<null>
PerClassLogLoss,"[ 6.748459395132777, 28.182871383487818 ]"
ConfusionMatrix,"Microsoft.ML.Data.ConfusionMatrixPerClassPrecision[ 0.7310924369747899, 0.5555555555555556 ]PerClassRecall[ 0.8787878787878788, 0.3191489361702128 ]Countsindexvalue0[ 87, 12 ]1[ 32, 15 ]NumberOfClasses2"
,

index,value
PerClassPrecision,"[ 0.7310924369747899, 0.5555555555555556 ]"
PerClassRecall,"[ 0.8787878787878788, 0.3191489361702128 ]"
Counts,"indexvalue0[ 87, 12 ]1[ 32, 15 ]"
index,value
0,"[ 87, 12 ]"
1,"[ 32, 15 ]"
NumberOfClasses,2

index,value
0,"[ 87, 12 ]"
1,"[ 32, 15 ]"


In [49]:
// Render the confusion matrix with readable class names.
// NOTE(review): assumes class index 0 is the non-bugfix label and index 1 the bugfix label —
// this depends on the key order produced by MapValueToKey during training; confirm
MLCharts.RenderConfusionMatrix(evalResults.ConfusionMatrix, classNames: new List<string> {"Non-Bugfix", "Bugfix"})

In [50]:
// Derive binary metrics from the multi-class confusion matrix, treating class index 1 as the positive class.
// GetCountForClassPair is called as (predicted class index, actual class index).
// NOTE(review): class index 1 is assumed to be the bugfix label — this depends on the key order used in training; confirm
double tp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 1); // predicted 1, actual 1
double tn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 0); // predicted 0, actual 0 (not used by the metrics below)
double fp = evalResults.ConfusionMatrix.GetCountForClassPair(1, 0); // predicted 1, actual 0
double fn = evalResults.ConfusionMatrix.GetCountForClassPair(0, 1); // predicted 0, actual 1

// These are double divisions: a zero denominator yields NaN rather than throwing
double precision = tp / (tp + fp);
double recall = tp / (tp + fn);
// F1 is the harmonic mean of precision and recall
double f1Score = 2 * (precision * recall) / (precision + recall);

// Display the three metrics; the property names come from the variable names
new {precision, recall, f1Score}

Unnamed: 0,Unnamed: 1
precision,0.5555555555555556
recall,0.3191489361702128
f1Score,0.4054054054054054
