Skip to content

Commit

Permalink
Fixing bugs in the finetuning functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
TommiNieminen committed Dec 21, 2020
1 parent f2e3f15 commit 473ca3f
Show file tree
Hide file tree
Showing 18 changed files with 131 additions and 282 deletions.
21 changes: 14 additions & 7 deletions FiskmoTranslationProvider/FileReader.cs
@@ -1,6 +1,4 @@


using Sdl.Core.Globalization;
using Sdl.Core.Globalization;
using Sdl.FileTypeSupport.Framework.BilingualApi;
using Sdl.LanguagePlatform.TranslationMemory;
using Sdl.LanguagePlatform.TranslationMemoryApi;
Expand Down Expand Up @@ -42,16 +40,25 @@ public override void ProcessParagraphUnit(IParagraphUnit paragraphUnit)
{
if (segmentPair.Properties.ConfirmationLevel == ConfirmationLevel.Translated ||
segmentPair.Properties.ConfirmationLevel == ConfirmationLevel.ApprovedTranslation ||
segmentPair.Properties.ConfirmationLevel == ConfirmationLevel.ApprovedSignOff)
segmentPair.Properties.ConfirmationLevel == ConfirmationLevel.ApprovedSignOff ||
(segmentPair.Properties.ConfirmationLevel == ConfirmationLevel.Draft && segmentPair.Properties.TranslationOrigin.MatchPercent == 100))
{
this.sourceVisitor.Reset();
segmentPair.Source.AcceptVisitor(this.sourceVisitor);
this.targetVisitor.Reset(this.sourceVisitor.TagStarts);
segmentPair.Target.AcceptVisitor(this.targetVisitor);

FileTranslations.Add(new Tuple<string, string>(
this.sourceVisitor.PlainText,
this.targetVisitor.PlainText));
//Add translation only if there's actual text content on both sides (not just tags)
if (this.sourceVisitor.SegmentContainsText && this.targetVisitor.SegmentContainsText)
{
FileTranslations.Add(new Tuple<string, string>(
this.sourceVisitor.PlainText,
this.targetVisitor.PlainText));
}
else
{

}
}
else
{
Expand Down
72 changes: 42 additions & 30 deletions FiskmoTranslationProvider/FinetuneBatchTask.cs
Expand Up @@ -16,9 +16,9 @@
namespace FiskmoTranslationProvider
{

[AutomaticTask("FiskmoBatchTask",
"Fiskmo fine-tune and translate",
"Task for fine-tuning Fiskmo models with project data, also support batch translation. IMPORTANT: Segment the files before running this task by opening them in the editor and saving, or by running Pretranslate or Pseudotranslate tasks.",
[AutomaticTask("OPUSCATBatchTask",
"OPUS-CAT finetune and preorder machine translation",
"Task for finetuning OPUS MT models with project data, with optional preordering of MT for new segments in project (makes fetching MT much quicker during translation). IMPORTANT: Segment the files before running this task by opening them in the editor and saving, or by running Pretranslate or Pseudotranslate tasks.",
GeneratedFileType = AutomaticTaskFileType.None)]
//[TODO] You can change the file type according to your needs
[AutomaticTaskSupportedFileType(AutomaticTaskFileType.BilingualTarget)]
Expand Down Expand Up @@ -173,6 +173,33 @@ private void BatchTranslate()
}
}

/// <summary>
/// Runs a <c>FinetuneTransUnitExtractor</c> over the given translation memories
/// and returns the translations it extracted.
/// </summary>
/// <param name="tms">Translation memories passed to the extractor.</param>
/// <param name="uniqueNewSegments">New segments passed to the extractor.</param>
/// <returns>The extractor's <c>AllExtractedTranslations</c> after <c>Extract</c> has run.</returns>
private List<Tuple<string, string>> ExtractFromTm(
    List<ITranslationMemoryLanguageDirection> tms,
    List<string> uniqueNewSegments)
{
    // Fuzzy bands: the configured minimum percentage itself, plus every
    // percentage divisible by ten from there up to 100.
    // Enumerable.Range takes (start, count), so the candidate sequence runs past 100;
    // the upper bound is enforced by the filter below.
    var minFuzzyPercentage = settings.FuzzyMinPercentage;
    var candidatePercentages = Enumerable.Range(minFuzzyPercentage, 100);
    var bands = candidatePercentages
        .Where(percent => percent == minFuzzyPercentage || (percent <= 100 && percent % 10 == 0))
        .ToList();

    // Extraction limits all come from the task settings.
    var extractor = new FinetuneTransUnitExtractor(
        tms,
        uniqueNewSegments,
        bands,
        this.settings.MaxFinetuningSentences,
        this.settings.ConcordanceMaxResults,
        this.settings.FuzzyMaxResults,
        this.settings.MaxConcordanceWindow);

    // The settings flags select which kinds of units are extracted.
    extractor.Extract(
        this.settings.ExtractFuzzies,
        this.settings.ExtractConcordanceMatches,
        this.settings.ExtractFillerUnits);

    return extractor.AllExtractedTranslations;
}

private void Finetune()
{
var projectInfo = this.Project.GetProjectInfo();
Expand All @@ -187,26 +214,18 @@ private void Finetune()
var uniqueProjectTranslations = this.ProjectTranslations[targetLang].Distinct().ToList();
List<string> uniqueNewSegments = this.ProjectNewSegments[targetLang].Distinct().ToList();

//assign fuzzy min and all above percentage divisible by ten as fuzzybands
var fuzzyBands = Enumerable.Range(settings.FuzzyMinPercentage, 100).Where(
x => (x % 10 == 0 && x <= 100) || x == settings.FuzzyMinPercentage).ToList();

var transUnitExtractor =
new FinetuneTransUnitExtractor(
this.tms[targetLang],
uniqueNewSegments,
fuzzyBands,
settings.MaxFinetuningSentences,
settings.ConcordanceMaxResults,
settings.FuzzyMaxResults,
settings.MaxConcordanceWindow);

transUnitExtractor.Extract(
this.settings.ExtractFuzzies,
this.settings.ExtractConcordanceMatches,
this.settings.ExtractFillerUnits);
List<Tuple<string, string>> finetuneSet;
if (this.tms[targetLang].Any())
{
var tmExtracts = this.ExtractFromTm(this.tms[targetLang], uniqueNewSegments);
finetuneSet = uniqueProjectTranslations.Union(tmExtracts).ToList();
}
else
{
finetuneSet = uniqueProjectTranslations;
}

var finetuneSet = uniqueProjectTranslations.Union(transUnitExtractor.AllExtractedTranslations).ToList();


if (finetuneSet.Count() < FiskmoTpSettings.Default.FinetuningMinSentencePairs)
{
Expand All @@ -225,14 +244,7 @@ private void Finetune()
this.fiskmoOptions.modelTag,
this.settings.IncludePlaceholderTags,
this.settings.IncludeTagPairs);

switch (result)
{
case "fine-tuning already in process":
throw new Exception("MT engine is currently batch translating or fine-tuning, wait for previous job to finish (or cancel it by restarting MT engine).");
default:
break;
}

}
}

Expand Down
23 changes: 16 additions & 7 deletions FiskmoTranslationProvider/FiskmoMarkupDataVisitor.cs
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace FiskmoTranslationProvider
Expand All @@ -12,39 +13,42 @@ public class FiskmoMarkupDataVisitor : IMarkupDataVisitor
{
private StringBuilder plainText;
private Dictionary<string, ITagPair> sourceTagStarts;
private bool _segmentContainsText = false;

public Dictionary<string, IPlaceholderTag> Placeholders { get; set; }
public string PlainText { get => plainText.ToString(); }
public Dictionary<string, ITagPair> TagStarts { get; set; }
public Dictionary<string, ITagPair> TagEnds { get; set; }
public bool SegmentContainsText { get => _segmentContainsText; internal set => _segmentContainsText = value; }

public void Reset()
{
this.SegmentContainsText = false;
this.plainText = new StringBuilder();
this.Placeholders = new Dictionary<string, IPlaceholderTag>();
this.TagStarts = new Dictionary<string, ITagPair>();
this.TagEnds = new Dictionary<string, ITagPair>();
}
}


public void VisitCommentMarker(ICommentMarker commentMarker)
{

}

public void VisitLocationMarker(ILocationMarker location)
{

}

public void VisitLockedContent(ILockedContent lockedContent)
{

}

public void VisitOtherMarker(IOtherMarker marker)
{

}

public void VisitPlaceholderTag(IPlaceholderTag tag)
Expand All @@ -58,7 +62,7 @@ public void VisitPlaceholderTag(IPlaceholderTag tag)

public void VisitRevisionMarker(IRevisionMarker revisionMarker)
{

}

private void VisitChildren(IAbstractMarkupDataContainer container)
Expand Down Expand Up @@ -116,7 +120,12 @@ public void VisitTagPair(ITagPair tagPair)

public void VisitText(IText text)
{
this.plainText.Append(text.ToString());
string textString = text.ToString();
if (Regex.IsMatch(textString, @"[^\s]"))
{
this.SegmentContainsText = true;
}
this.plainText.Append(textString);
}

internal void Reset(Dictionary<string, ITagPair> tagStarts)
Expand Down
22 changes: 19 additions & 3 deletions FiskmoTranslationProvider/FiskmoProvider.cs
Expand Up @@ -198,6 +198,13 @@ private static void DocChanged(object sender, DocumentEventArgs e)
private static void segmentChanged(FiskmoOptions options, LanguageDirection langDir, object sender, EventArgs e)
{
var doc = (Document)sender;

//There are some "segments" in the Trados editor view which are not proper segments, such as
//the start-of-document tag
if (doc.ActiveSegmentPair == null)
{
return;
}
var visitor = new FiskmoMarkupDataVisitor();

var activeFiskmoOptions = FiskmoProvider.GetProjectFiskmoOptions(doc.Project, langDir);
Expand All @@ -218,11 +225,20 @@ private static void segmentChanged(FiskmoOptions options, LanguageDirection lang
//TESTED: doesn't seem slow at all, probably the translation part later that causes delay.
var nextSegmentPairs = doc.SegmentPairs.SkipWhile(x =>
!(x.Properties.Id == doc.ActiveSegmentPair.Properties.Id &&
x.GetParagraphUnitProperties().ParagraphUnitId == doc.ActiveSegmentPair.GetParagraphUnitProperties().ParagraphUnitId)).Take(options.pregenerateSegmentCount);
x.GetParagraphUnitProperties().ParagraphUnitId == doc.ActiveSegmentPair.GetParagraphUnitProperties().ParagraphUnitId));

var segmentsNeeded = options.pregenerateSegmentCount;
foreach (var segmentPair in nextSegmentPairs)
{
if (segmentPair.Properties.ConfirmationLevel == Sdl.Core.Globalization.ConfirmationLevel.Unspecified)
if (segmentsNeeded == 0)
{
break;
}

//Also preorder translations for Draft segments, since quite often there will be draft content
//provided in segments where having MT is still desirable. This could also be an option.
if (segmentPair.Properties.ConfirmationLevel == Sdl.Core.Globalization.ConfirmationLevel.Unspecified ||
segmentPair.Properties.ConfirmationLevel == Sdl.Core.Globalization.ConfirmationLevel.Draft)
{
visitor.Reset();
segmentPair.Source.AcceptVisitor(visitor);
Expand All @@ -234,7 +250,7 @@ private static void segmentChanged(FiskmoOptions options, LanguageDirection lang

//The preorder method doesn't wait for the translation, so the requests return quicker
FiskmöMTServiceHelper.PreOrder(options, sourceText, sourceCode, targetCode, options.modelTag);

segmentsNeeded -= 1;
}
}
}
Expand Down

0 comments on commit 473ca3f

Please sign in to comment.