Skip to content

[RegexDiff X64] [danmoseley] Improve regex optimizer through investigation o ... #1826

@MihuBot

Description

@MihuBot

Job completed in 14 minutes 16 seconds (remote runner delay: 1 minute 17 seconds).
dotnet/runtime#125289
Using arguments: regexdiff
Main commit: dotnet/runtime@fdbedda
PR commit: danmoseley/runtime@1c43564

214 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\b(prima(\\s+(di|del(l['aoei])?|degli|dei)) ..." (212 uses)
[GeneratedRegex("\\b(prima(\\s+(di|del(l['aoei])?|degli|dei))?|entro\\s*(l['aoe]|il?|gli|i)?|(non\\s+dopo\\s+(il?|l[oae']|gli)|non\\s+più\\s+tardi\\s+(di|del(l['aoei])?|degli|dei)|termina(no)?(\\s+con)?(\\s+(il?|l[oae']|gli))?|precedente\\s+a((l(l['aoe])?)|gli|i)?|fino\\s+a((l(l['aoe])?)|gli|i)?))\\b", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
  ///                 ○ Match 'i'.<br/>
  ///     ○ Match a sequence of expressions.<br/>
  ///         ○ Match the string "non".<br/>
+   ///         ○ Match a whitespace character atomically at least once.<br/>
  ///         ○ Match with 2 alternative expressions.<br/>
  ///             ○ Match a sequence of expressions.<br/>
-   ///                 ○ Match a whitespace character atomically at least once.<br/>
  ///                 ○ Match the string "dopo".<br/>
  ///                 ○ Match a whitespace character atomically at least once.<br/>
  ///                 ○ Match with 3 alternative expressions.<br/>
  ///                         ○ Match a character in the set ['aeo].<br/>
  ///                     ○ Match the string "gli".<br/>
  ///             ○ Match a sequence of expressions.<br/>
-   ///                 ○ Match a whitespace character atomically at least once.<br/>
  ///                 ○ Match the string "più".<br/>
  ///                 ○ Match a whitespace character atomically at least once.<br/>
  ///                 ○ Match the string "tardi".<br/>
                              goto AlternationBranch8;
                          }
                          
+                           // Match a whitespace character atomically at least once.
+                           {
+                               pos += 3;
+                               slice = inputSpan.Slice(pos);
+                               int iteration2 = 0;
+                               while ((uint)iteration2 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration2]))
+                               {
+                                   iteration2++;
+                               }
+                               
+                               if (iteration2 == 0)
+                               {
+                                   goto AlternationBranch8;
+                               }
+                               
+                               slice = slice.Slice(iteration2);
+                               pos += iteration2;
+                           }
+                           
                          // Match with 2 alternative expressions.
                          //{
                              alternation_starting_pos4 = pos;
                              
                              // Branch 0
                              //{
-                                   // Match a whitespace character atomically at least once.
-                                   {
-                                       pos += 3;
-                                       slice = inputSpan.Slice(pos);
-                                       int iteration2 = 0;
-                                       while ((uint)iteration2 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration2]))
-                                       {
-                                           iteration2++;
-                                       }
-                                       
-                                       if (iteration2 == 0)
-                                       {
-                                           goto AlternationBranch9;
-                                       }
-                                       
-                                       slice = slice.Slice(iteration2);
-                                       pos += iteration2;
-                                   }
-                                   
                                  // Match the string "dopo".
                                  if (!slice.StartsWith("dopo"))
                                  {
                              
                              // Branch 1
                              //{
+                                   // Match the string "più".
+                                   if (!slice.StartsWith("più"))
+                                   {
+                                       goto AlternationBranch8;
+                                   }
+                                   
                                  // Match a whitespace character atomically at least once.
                                  {
                                      pos += 3;
                                      pos += iteration4;
                                  }
                                  
-                                   // Match the string "più".
-                                   if (!slice.StartsWith("più"))
+                                   // Match the string "tardi".
+                                   if (!slice.StartsWith("tardi"))
                                  {
                                      goto AlternationBranch8;
                                  }
                                  
                                  // Match a whitespace character atomically at least once.
                                  {
-                                       pos += 3;
+                                       pos += 5;
                                      slice = inputSpan.Slice(pos);
                                      int iteration5 = 0;
                                      while ((uint)iteration5 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration5]))
                                      pos += iteration5;
                                  }
                                  
-                                   // Match the string "tardi".
-                                   if (!slice.StartsWith("tardi"))
-                                   {
-                                       goto AlternationBranch8;
-                                   }
-                                   
-                                   // Match a whitespace character atomically at least once.
-                                   {
-                                       pos += 5;
-                                       slice = inputSpan.Slice(pos);
-                                       int iteration6 = 0;
-                                       while ((uint)iteration6 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration6]))
-                                       {
-                                           iteration6++;
-                                       }
-                                       
-                                       if (iteration6 == 0)
-                                       {
-                                           goto AlternationBranch8;
-                                       }
-                                       
-                                       slice = slice.Slice(iteration6);
-                                       pos += iteration6;
-                                   }
-                                   
                                  // Match 'd'.
                                  if (slice.IsEmpty || slice[0] != 'd')
                                  {
                              
                              // Match a whitespace character atomically at least once.
                              {
-                                   int iteration7 = 0;
-                                   while ((uint)iteration7 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration7]))
+                                   int iteration6 = 0;
+                                   while ((uint)iteration6 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration6]))
                                  {
-                                       iteration7++;
+                                       iteration6++;
                                  }
                                  
-                                   if (iteration7 == 0)
+                                   if (iteration6 == 0)
                                  {
                                      goto LoopIterationNoMatch5;
                                  }
                                  
-                                   slice = slice.Slice(iteration7);
-                                   pos += iteration7;
+                                   slice = slice.Slice(iteration6);
+                                   pos += iteration6;
                              }
                              
                              // Match the string "con".
                              
                              // Match a whitespace character atomically at least once.
                              {
-                                   int iteration8 = 0;
-                                   while ((uint)iteration8 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration8]))
+                                   int iteration7 = 0;
+                                   while ((uint)iteration7 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration7]))
                                  {
-                                       iteration8++;
+                                       iteration7++;
                                  }
                                  
-                                   if (iteration8 == 0)
+                                   if (iteration7 == 0)
                                  {
                                      goto LoopIterationNoMatch6;
                                  }
                                  
-                                   slice = slice.Slice(iteration8);
-                                   pos += iteration8;
+                                   slice = slice.Slice(iteration7);
+                                   pos += iteration7;
                              }
                              
                              // Match with 3 alternative expressions.
                          {
                              pos += 10;
                              slice = inputSpan.Slice(pos);
-                               int iteration9 = 0;
-                               while ((uint)iteration9 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration9]))
+                               int iteration8 = 0;
+                               while ((uint)iteration8 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration8]))
                              {
-                                   iteration9++;
+                                   iteration8++;
                              }
                              
-                               if (iteration9 == 0)
+                               if (iteration8 == 0)
                              {
                                  goto AlternationBranch18;
                              }
                              
-                               slice = slice.Slice(iteration9);
-                               pos += iteration9;
+                               slice = slice.Slice(iteration8);
+                               pos += iteration8;
                          }
                          
                          // Match 'a'.
                          {
                              pos += 4;
                              slice = inputSpan.Slice(pos);
-                               int iteration10 = 0;
-                               while ((uint)iteration10 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration10]))
+                               int iteration9 = 0;
+                               while ((uint)iteration9 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration9]))
                              {
-                                   iteration10++;
+                                   iteration9++;
                              }
                              
-                               if (iteration10 == 0)
+                               if (iteration9 == 0)
                              {
                                  return false; // The input didn't match.
                              }
                              
-                               slice = slice.Slice(iteration10);
-                               pos += iteration10;
+                               slice = slice.Slice(iteration9);
+                               pos += iteration9;
                          }
                          
                          // Match 'a'.
"\\b(1\\s*:\\s*1)|(one (on )?one|one\\s*-\\s* ..." (182 uses)
[GeneratedRegex("\\b(1\\s*:\\s*1)|(one (on )?one|one\\s*-\\s*one|one\\s*:\\s*one)\\b", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
  ///             ○ Match a character in the set [Oo].<br/>
  ///             ○ Match a character in the set [Nn].<br/>
  ///             ○ Match a character in the set [Ee].<br/>
-   ///             ○ Match with 3 alternative expressions.<br/>
+   ///             ○ Match with 2 alternative expressions.<br/>
  ///                 ○ Match a sequence of expressions.<br/>
  ///                     ○ Match ' '.<br/>
  ///                     ○ Optional (greedy).<br/>
  ///                     ○ Match a character in the set [Ee].<br/>
  ///                 ○ Match a sequence of expressions.<br/>
  ///                     ○ Match a whitespace character atomically any number of times.<br/>
-   ///                     ○ Match '-'.<br/>
-   ///                     ○ Match a whitespace character atomically any number of times.<br/>
-   ///                     ○ Match a character in the set [Oo].<br/>
-   ///                     ○ Match a character in the set [Nn].<br/>
-   ///                     ○ Match a character in the set [Ee].<br/>
-   ///                 ○ Match a sequence of expressions.<br/>
-   ///                     ○ Match a whitespace character atomically any number of times.<br/>
-   ///                     ○ Match ':'.<br/>
-   ///                     ○ Match a whitespace character atomically any number of times.<br/>
-   ///                     ○ Match a character in the set [Oo].<br/>
-   ///                     ○ Match a character in the set [Nn].<br/>
-   ///                     ○ Match a character in the set [Ee].<br/>
+   ///                     ○ Match with 2 alternative expressions.<br/>
+   ///                         ○ Match a sequence of expressions.<br/>
+   ///                             ○ Match '-'.<br/>
+   ///                             ○ Match a whitespace character atomically any number of times.<br/>
+   ///                             ○ Match a character in the set [Oo].<br/>
+   ///                             ○ Match a character in the set [Nn].<br/>
+   ///                             ○ Match a character in the set [Ee].<br/>
+   ///                         ○ Match a sequence of expressions.<br/>
+   ///                             ○ Match ':'.<br/>
+   ///                             ○ Match a whitespace character atomically any number of times.<br/>
+   ///                             ○ Match a character in the set [Oo].<br/>
+   ///                             ○ Match a character in the set [Nn].<br/>
+   ///                             ○ Match a character in the set [Ee].<br/>
  ///         ○ Match if at a word boundary.<br/>
  /// </code>
  /// </remarks>
                                      return false; // The input didn't match.
                                  }
                                  
-                                   // Match with 3 alternative expressions.
+                                   // Match with 2 alternative expressions.
                                  //{
                                      alternation_starting_pos1 = pos;
                                      alternation_starting_capturepos1 = base.Crawlpos();
                                              pos += iteration2;
                                          }
                                          
-                                           // Match '-'.
-                                           if (slice.IsEmpty || slice[0] != '-')
-                                           {
-                                               goto AlternationBranch2;
-                                           }
-                                           
-                                           // Match a whitespace character atomically any number of times.
-                                           {
-                                               int iteration3 = 1;
-                                               while ((uint)iteration3 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration3]))
+                                           // Match with 2 alternative expressions.
+                                           //{
+                                               if (slice.IsEmpty)
                                              {
-                                                   iteration3++;
+                                                   UncaptureUntil(0);
+                                                   return false; // The input didn't match.
                                              }
                                              
-                                               slice = slice.Slice(iteration3);
-                                               pos += iteration3;
-                                           }
-                                           
-                                           if ((uint)slice.Length < 3 ||
-                                               !slice.StartsWith("one", StringComparison.OrdinalIgnoreCase)) // Match the string "one" (ordinal case-insensitive)
-                                           {
-                                               goto AlternationBranch2;
-                                           }
+                                               switch (slice[0])
+                                               {
+                                                   case '-':
+                                                       
+                                                       // Match a whitespace character atomically any number of times.
+                                                       {
+                                                           int iteration3 = 1;
+                                                           while ((uint)iteration3 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration3]))
+                                                           {
+                                                               iteration3++;
+                                                           }
+                                                           
+                                                           slice = slice.Slice(iteration3);
+                                                           pos += iteration3;
+                                                       }
+                                                       
+                                                       if ((uint)slice.Length < 3 ||
+                                                           !slice.StartsWith("one", StringComparison.OrdinalIgnoreCase)) // Match the string "one" (ordinal case-insensitive)
+                                                       {
+                                                           UncaptureUntil(0);
+                                                           return false; // The input didn't match.
+                                                       }
+                                                       
+                                                       pos += 3;
+                                                       slice = inputSpan.Slice(pos);
+                                                       break;
+                                                       
+                                                   case ':':
+                                                       
+                                                       // Match a whitespace character atomically any number of times.
+                                                       {
+                                                           int iteration4 = 1;
+                                                           while ((uint)iteration4 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration4]))
+                                                           {
+                                                               iteration4++;
+                                                           }
+                                                           
+                                                           slice = slice.Slice(iteration4);
+                                                           pos += iteration4;
+                                                       }
+                                                       
+                                                       if ((uint)slice.Length < 3 ||
+                                                           !slice.StartsWith("one", StringComparison.OrdinalIgnoreCase)) // Match the string "one" (ordinal case-insensitive)
+                                                       {
+                                                           UncaptureUntil(0);
+                                                           return false; // The input didn't match.
+                                                       }
+                                                       
+                                                       pos += 3;
+                                                       slice = inputSpan.Slice(pos);
+                                                       break;
+                                                       
+                                                   default:
+                                                       UncaptureUntil(0);
+                                                       return false; // The input didn't match.
+                                               }
+                                           //}
                                          
                                          alternation_branch = 1;
-                                           pos += 3;
-                                           slice = inputSpan.Slice(pos);
-                                           goto AlternationMatch1;
-                                           
-                                           AlternationBranch2:
-                                           pos = alternation_starting_pos1;
-                                           slice = inputSpan.Slice(pos);
-                                           UncaptureUntil(alternation_starting_capturepos1);
-                                       //}
-                                       
-                                       // Branch 2
-                                       //{
-                                           // Match a whitespace character atomically any number of times.
-                                           {
-                                               int iteration4 = 3;
-                                               while ((uint)iteration4 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration4]))
-                                               {
-                                                   iteration4++;
-                                               }
-                                               
-                                               slice = slice.Slice(iteration4);
-                                               pos += iteration4;
-                                           }
-                                           
-                                           // Match ':'.
-                                           if (slice.IsEmpty || slice[0] != ':')
-                                           {
-                                               UncaptureUntil(0);
-                                               return false; // The input didn't match.
-                                           }
-                                           
-                                           // Match a whitespace character atomically any number of times.
-                                           {
-                                               int iteration5 = 1;
-                                               while ((uint)iteration5 < (uint)slice.Length && char.IsWhiteSpace(slice[iteration5]))
-                                               {
-                                                   iteration5++;
-                                               }
-                                               
-                                               slice = slice.Slice(iteration5);
-                                               pos += iteration5;
-                                           }
-                                           
-                                           if ((uint)slice.Length < 3 ||
-                                               !slice.StartsWith("one", StringComparison.OrdinalIgnoreCase)) // Match the string "one" (ordinal case-insensitive)
-                                           {
-                                               UncaptureUntil(0);
-                                               return false; // The input didn't match.
-                                           }
-                                           
-                                           alternation_branch = 2;
-                                           pos += 3;
-                                           slice = inputSpan.Slice(pos);
                                          goto AlternationMatch1;
                                      //}
                                      
                                          case 0:
                                              goto LoopIterationNoMatch;
                                          case 1:
-                                               goto AlternationBranch2;
-                                           case 2:
                                              UncaptureUntil(0);
                                              return false; // The input didn't match.
                                      }
"(?<till>zu|bis\\s*zum|zum|bis|bis\\s*hin(\\s ..." (136 uses)
[GeneratedRegex("(?<till>zu|bis\\s*zum|zum|bis|bis\\s*hin(\\s*zum)?|--|-|—|——)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
  ///             ○ Match a character in the set [Bb].<br/>
  ///             ○ Match a character in the set [Ii].<br/>
  ///             ○ Match a character in the set [Ss].<br/>
-   ///             ○ Match an empty string.<br/>
  ///         ○ Match the string "--".<br/>
  ///         ○ Match a character in the set [\-\u2014].<br/>
  ///         ○ Match the string "——".<br/>
                                  goto AlternationBranch3;
                              }
                              
-                               
                              pos += 3;
                              slice = inputSpan.Slice(pos);
                              goto AlternationMatch;

For more diff examples, see https://gist.github.com/MihuBot/08ae4323ca7f212b9eeeaea972deb8fd

Sample source code for further analysis
// Local cache file for the results of this run; the archive is only downloaded when it is missing.
const string JsonPath = "RegexResults-1826.json";
if (!File.Exists(JsonPath))
{
    // Own the HttpClient with a using declaration so its handler/connection is released
    // once the archive has been read, instead of being leaked by an inline `new HttpClient()`.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/FJerkLNA");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // Extract only the "Results.json" entry; First throws InvalidOperationException if the archive has no such entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);

// IncludeFields is enabled so that field-based members are (de)serialized as well;
// RegexEntry exposes value-tuple arrays, whose elements are public fields rather than properties.
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// A regex pattern paired with the options it is constructed with.
/// </summary>
/// <param name="Pattern">The regex pattern text.</param>
/// <param name="Options">The <see cref="RegexOptions"/> associated with the pattern.</param>
/// <param name="Count">
/// NOTE(review): presumably the number of known uses of this pattern (the report lists "N uses"
/// next to each pattern) — confirm against the producer of Results.json.
/// </param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the array deserialized from the results JSON file.
/// </summary>
/// <remarks>
/// NOTE(review): the member descriptions below are inferred from the member names and the
/// surrounding report (main commit vs. PR commit, generated-source diffs); verify against
/// the tool that writes Results.json.
/// </remarks>
sealed class RegexEntry
{
    /// <summary>The pattern, options and count this entry belongs to.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Presumably the generated source produced by the main (baseline) commit — confirm.</summary>
    public required string MainSource { get; set; }

    /// <summary>Presumably the generated source produced by the PR commit — confirm.</summary>
    public required string PrSource { get; set; }

    /// <summary>Presumably the complete diff between the two sources; may be <see langword="null"/>.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Presumably an abbreviated form of the diff; may be <see langword="null"/>.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>
    /// Name/values pairs; presumably the character-based SearchValues instances referenced by the
    /// generated code — confirm. May be <see langword="null"/>.
    /// </summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>
    /// String sets together with the <see cref="StringComparison"/> they are used with; presumably the
    /// string-based SearchValues instances referenced by the generated code — confirm. May be <see langword="null"/>.
    /// </summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions