# Welche Textsorten stützen sich besonders auf Diskursmarker?

In [7]:
import ipywidgets as widgets
from IPython.display import display

from plots import CreatePlots as cp
from helpers import Helpers as hp
from datasets import CorpusData as cd
from datasets import GenreData as gd
from datasets import DiscourseTypeData as dtd

## Diskursarten

In [4]:
# Input files for the four corpora, listed as (scores CSV, dictionary JSON)
# pairs in the order Spotify, TED, NYTimes, Gigaword. They are handed to
# CorpusData positionally, so the order of this tuple must not change.
corpus_files = (
    "../../bigData/listenability-tools/datasets/scores/spotify-scores_short.csv",
    "../../bigData/listenability-tools/datasets/dict/spotify-dict.json",
    "../../bigData/listenability-tools/datasets/scores/ted-scores_short.csv",
    "../../bigData/listenability-tools/datasets/dict/ted-dict.json",
    "../../bigData/listenability-tools/datasets/scores/nytimes-scores_short.csv",
    "../../bigData/listenability-tools/datasets/dict/nytimes-dict.json",
    "../../bigData/listenability-tools/datasets/scores/gigaword-scores_short.csv",
    "../../bigData/listenability-tools/datasets/dict/gigaword-dict.json",
)

# `data` is the shared corpus object every later cell reads from.
data = cd.CorpusData(*corpus_files)

### Percent of DM per Text

In [13]:
%matplotlib widget
dm_count_perc = [data.spotify.get_percent_dm_count_statistics(),
                 data.ted.get_percent_dm_count_statistics(),
                 data.ny.get_percent_dm_count_statistics(),
                 data.gig.get_percent_dm_count_statistics()]

cp.plot_vertical_barchart("Percent Discourse Markers per Text",
                          dm_count_perc,
                          ["Min", "Mean", "Mode", "Max"],
                          "Marker Occurrences in %",
                          label_1="Spotify", label_2="TED", label_3="NYTimes", label_4="Gigaword",
                          color_1=data.spotify_color, color_2=data.ted_color,
                          color_3=data.ny_color, color_4=data.gig_color)

hp.show_dataframe("Percent Discourse Markers per Text",
                  ['Min', 'Mean', 'Mode', 'Max'],
                  dm_count_perc[0], data2=dm_count_perc[1], data3=dm_count_perc[2],
                  data4=dm_count_perc[3],
                  label1="Spotify", label2="TED",
                  label3="NYTimes", label4="Gigaword")

hp.effectsize_and_significance("Percent Discourse Markers per Text",
                               [dm_count_perc[0] + dm_count_perc[1], dm_count_perc[2] + dm_count_perc[3]],
                               ["Spoken", "Written"])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Percent Discourse Markers per Text
               Min      Mean      Mode         Max
names                                             
Spotify   3.200000  8.028662  8.000000   13.071895
TED       4.663212  8.559530  9.090909   17.985612
NYTimes   0.000000  5.675527  0.000000   33.333333
Gigaword  0.000000  5.428603  0.000000  100.000000
Percent Discourse Markers per Text
                 Effectsize  T-Statistic   P-Value
Data                                              
Spoken, Written    -0.35987    -0.719739  0.494234


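Der p-Wert liegt bei etwa 0,49 und damit weit über dem üblichen Signifikanzniveau von 0,05; das Ergebnis ist also nicht statistisch signifikant. Ein Unterschied in der Häufigkeit der Nutzung von DM zwischen den verschiedenen Diskurstypen (gesprochen vs. geschrieben) lässt sich mit diesem Test somit nicht nachweisen - das bedeutet jedoch nicht, dass kein Unterschied besteht. Zu beachten ist außerdem, dass der Test hier nur auf den vier Kennwerten (Min, Mean, Mode, Max) je Korpus beruht und nicht auf den Werten der einzelnen Texte.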

### Empirical Distribution Function

In [12]:
%matplotlib widget
dm_percents = [data.spotify.get_percent_dm_per_text(),
                   data.ted.get_percent_dm_per_text(),
                   data.ny.get_percent_dm_per_text(),
                   data.gig.get_percent_dm_per_text()]

cp.plot_ecdf(dm_percents,
                 "ECDF for % of Discourse Markers per Text", "% DM per Text", "ECDF (% of Texts)",
                 ["Spotify", "TED", "New York Times", "Gigaword"],
                 [data.spotify_color, data.ted_color, data.ny_color,
                  data.gig_color])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Genres

### Percent of DM per Text

## Konversationstypen

### Percent of DM per Text