Skip to content

Commit b4254cf

Browse files
committed
Cambios en extractos
Función nueva para obtener el extracto manualmente desde el wikicódigo de la página.
1 parent a6a32d5 commit b4254cf

File tree

2 files changed

+110
-4
lines changed

2 files changed

+110
-4
lines changed

Bot.vb

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,112 @@ Namespace WikiBot
601601
End Function
602602

603603

604+
''' <summary>
''' Builds the wikitext extract of every existing page named in <paramref name="pages"/>.
''' </summary>
''' <param name="pages">Titles of the pages to process. Titles whose page does not exist are skipped.</param>
''' <param name="charLimit">Character limit forwarded to <c>GetWikiExtractFromPage</c>.</param>
''' <returns>
''' A list keyed by the requested page name whose values are the extract contents.
''' A title that appears more than once in <paramref name="pages"/> is included only once.
''' </returns>
Function GetWikiExtractFromPageNames(ByVal pages As String(), ByVal charLimit As Integer) As SortedList(Of String, String)
    Dim extractsByName As New SortedList(Of String, String)
    If pages Is Nothing Then
        Return extractsByName
    End If

    For Each pageName As String In pages
        ' Nothing cannot be used as a SortedList key, and SortedList.Add throws on a
        ' repeated key, so such entries are skipped before the page is even loaded.
        If pageName Is Nothing OrElse extractsByName.ContainsKey(pageName) Then
            Continue For
        End If

        Dim currentPage As Page = Getpage(pageName)
        If currentPage Is Nothing OrElse Not currentPage.Exists Then
            Continue For
        End If

        ' GetWikiExtractFromPage can return Nothing; never dereference it blindly.
        Dim extract As WikiExtract = GetWikiExtractFromPage(currentPage, charLimit)
        If extract IsNot Nothing Then
            extractsByName.Add(pageName, extract.ExtractContent)
        End If
    Next

    Return extractsByName
End Function
615+
616+
''' <summary>
''' Gets the lead extract of several pages, given by title, keeping the wikitext
''' but removing templates and references.
''' </summary>
''' <param name="pages">Titles of the pages to load. Titles whose page does not exist are ignored.</param>
''' <param name="charLimit">Character limit forwarded to the <c>Page()</c> overload.</param>
''' <returns>The set of extracts produced by the <c>Page()</c> overload for the existing pages.</returns>
Function GetWikiExtractFromPages(ByVal pages As String(), ByVal charLimit As Integer) As HashSet(Of WikiExtract)
    Dim existingPages As New List(Of Page)
    For Each pageTitle As String In pages
        Dim candidate As Page = Getpage(pageTitle)
        If Not candidate.Exists Then
            Continue For
        End If
        existingPages.Add(candidate)
    Next

    ' Delegate the actual extraction to the overload that works on Page objects.
    Dim pageArray As Page() = existingPages.ToArray()
    Return GetWikiExtractFromPages(pageArray, charLimit)
End Function
630+
631+
''' <summary>
''' Gets the lead extract of several pages, keeping the wikitext but removing
''' templates and references.
''' </summary>
''' <param name="pages">Pages to extract from.</param>
''' <param name="charLimit">Character limit forwarded to <c>GetWikiExtractFromPage</c>.</param>
''' <returns>A set with one extract per page for which <c>GetWikiExtractFromPage</c> returned a value.</returns>
Function GetWikiExtractFromPages(ByVal pages As Page(), ByVal charLimit As Integer) As HashSet(Of WikiExtract)
    Dim extracts As New HashSet(Of WikiExtract)
    For index As Integer = 0 To pages.Length - 1
        Dim current As WikiExtract = GetWikiExtractFromPage(pages(index), charLimit)
        ' Pages that yield no extract are left out of the result.
        If current Is Nothing Then
            Continue For
        End If
        extracts.Add(current)
    Next
    Return extracts
End Function
645+
646+
''' <summary>
''' Gets the lead extract of a page, keeping the wikitext but removing threads,
''' templates (except a few inline text templates), tables, references,
''' categories, note markers and file links.
''' </summary>
''' <param name="page">Page to extract from.</param>
''' <param name="charLimit">
''' Maximum extract length. Longer text is cut to <c>charLimit + 1</c> characters and
''' handed to <c>SafeTrimExtract</c>.
''' </param>
''' <returns>
''' A <c>WikiExtract</c> holding the page title and the treated text, or Nothing when
''' <paramref name="page"/> is Nothing or does not exist.
''' </returns>
Function GetWikiExtractFromPage(ByVal page As Page, ByVal charLimit As Integer) As WikiExtract
    If page Is Nothing OrElse Not page.Exists Then
        Return Nothing
    End If

    Dim pageThreads As String() = page.Threads
    Dim treatedExtract As String = page.Content

    ' Drop every thread from the content. String.Replace throws on an empty
    ' search string, so empty entries are skipped.
    If pageThreads IsNot Nothing Then
        For Each thread As String In pageThreads
            If Not String.IsNullOrEmpty(thread) Then
                treatedExtract = treatedExtract.Replace(thread, "")
            End If
        Next
    End If

    ' Common inline text templates that are kept in the extract.
    Dim keptTemplatePrefixes As String() = {"{{IPA|", "{{NR|", "{{MP|", "{{NIHONGO|"}
    Dim templates As String() = Template.GetTemplateTextArray(treatedExtract).ToArray()
    For Each temp As String In templates
        If String.IsNullOrEmpty(temp) Then
            Continue For
        End If

        ' Ordinal, case-insensitive prefix test: the result must not depend on the
        ' current culture's casing rules (e.g. Turkish dotted/dotless i).
        Dim keepTemplate As Boolean = False
        For Each prefix As String In keptTemplatePrefixes
            If temp.StartsWith(prefix, System.StringComparison.OrdinalIgnoreCase) Then
                keepTemplate = True
                Exit For
            End If
        Next

        If Not keepTemplate Then
            treatedExtract = treatedExtract.Replace(temp, "").Trim()
        End If
    Next

    ' Wiki tables: from a line starting with "{|" to the line starting with "|}".
    treatedExtract = Regex.Replace(treatedExtract, "(\n\{\|)([\s\S]+?)(\n\|\})", "")
    ' Self-closing reference tags go first so they can never be taken for the opening
    ' tag of a paired reference. [^>]* keeps the match inside a single tag.
    treatedExtract = Regex.Replace(treatedExtract, "<[Rr]ef[^>]*?/>", "")
    ' Paired references. Attributes are matched with [^>]* instead of a greedy ".+",
    ' which could swallow the text between two references on the same line.
    treatedExtract = Regex.Replace(treatedExtract, "<[rR]ef(\s[^>]*)?>[\s\S]*?</[rR]ef>", "")
    ' Category links.
    treatedExtract = Regex.Replace(treatedExtract, "(\[\[[Cc]ategoría:)(.+?)(\]\])", "")
    ' Rendered note markers such as "[nota 3]".
    treatedExtract = Regex.Replace(treatedExtract, "\[nota\ [0-9]+\]", "")
    treatedExtract = Utils.RemoveExcessOfSpaces(treatedExtract)
    treatedExtract = Removefiles(treatedExtract)
    treatedExtract = treatedExtract.Trim()

    ' Length > charLimit guarantees at least charLimit + 1 characters are available.
    If treatedExtract.Length > charLimit Then
        treatedExtract = SafeTrimExtract(treatedExtract.Substring(0, charLimit + 1), charLimit)
    End If

    Dim extract As New WikiExtract With {
        .ExtractContent = treatedExtract,
        .PageName = page.Title}
    Return extract
End Function
688+
689+
690+
''' <summary>
''' Removes file links (<c>[[Archivo:...]]</c> / <c>[[File:...]]</c>) from the text,
''' including links whose caption contains nested wikilinks.
''' </summary>
''' <param name="str">Wikitext to clean.</param>
''' <returns>The text without file links. Nothing or empty input is returned unchanged.</returns>
Private Function Removefiles(ByVal str As String) As String
    If String.IsNullOrEmpty(str) Then
        Return str
    End If

    ' The namespace colon is required for both prefixes, so ordinary links whose
    ' target merely starts with "File" are left alone.
    Const fileLinkPattern As String = "\[\[(?:[Aa]rchivo|[Ff]ile):.+?\]\]"

    Dim result As String = str
    Do
        Dim fileMatch As Match = Regex.Match(result, fileLinkPattern)
        If Not fileMatch.Success Then
            ' Nothing (more) to remove. Checking this on every pass also prevents
            ' calling Replace with an empty search string, which throws.
            Exit Do
        End If

        Dim found As String = fileMatch.Value
        If Utils.CountOccurrences(found, "[[") = Utils.CountOccurrences(found, "]]") Then
            ' Balanced brackets: the match is a complete file link.
            result = result.Replace(found, "")
        Else
            ' The lazy match stopped at the "]]" of a nested link. Strip that inner
            ' link's brackets so the next pass can reach the real closing "]]".
            ' NOTE(review): assumes Utils.ReplaceLast replaces only the last occurrence - confirm.
            Dim unwrapped As String = Utils.ReplaceLast(Utils.ReplaceLast(found, "[[", ""), "]]", "")
            result = result.Replace(found, unwrapped)
        End If
    Loop

    Return result
End Function
709+
604710
''' <summary>
605711
''' Retorna los resúmenes de las páginas indicadas en el array de entrada como SortedList (con el formato {Página,Resumen}), los nombres de página deben ser distintos.
606712
''' En caso de no existir la página o el resumen, no lo retorna.

Utils.vb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -508,13 +508,13 @@ Public NotInheritable Class Utils
508508
''' <summary>
509509
''' Cuenta las veces que se repite una cadena de texto en otra cadena de texto.
510510
''' </summary>
511-
''' <param name="StToSerach"></param>
511+
''' <param name="StToSearch"></param>
512512
''' <param name="StToLookFor"></param>
513513
''' <returns></returns>
514-
Public Shared Function CountOccurrences(ByVal StToSerach As String, StToLookFor As String) As Integer
515-
Dim txtlen As Integer = StToSerach.Length
514+
Public Shared Function CountOccurrences(ByVal StToSearch As String, StToLookFor As String) As Integer
515+
Dim txtlen As Integer = StToSearch.Length
516516
Dim strlen As Integer = StToLookFor.Length
517-
Dim newstring As String = StToSerach.Replace(StToLookFor, String.Empty)
517+
Dim newstring As String = StToSearch.Replace(StToLookFor, String.Empty)
518518
Dim newtxtlen As Integer = newstring.Length
519519
Dim lenghtdiff As Integer = txtlen - newtxtlen
520520
Dim occurences As Integer = CInt(lenghtdiff / strlen)

0 commit comments

Comments
 (0)