# 20 newsgroups - dataset analysis

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import plotly.graph_objects as go
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the corpus twice: once verbatim and once with quoted text removed.
# NOTE: despite the `_train` suffix, both calls request subset='all',
# not only the training split.
newsgroups_train = fetch_20newsgroups(subset='all')
newsgroups_train_noq = fetch_20newsgroups(
    subset='all',
    remove=('quotes',),
)

In [3]:
# List the fields available on the returned dataset object.
newsgroups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Number of labelled examples in the loaded corpus.
len(newsgroups_train['target'])

18846

In [5]:
# Human-readable names of the classes (newsgroups).
newsgroups_train['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Distinct class ids (`cats`) and how many examples carry each id (`counts`).
cats, counts = np.unique(newsgroups_train['target'], return_counts=True)

In [7]:
# Per-class example counts, in the same order as the sorted class ids.
counts

array([799, 973, 985, 982, 963, 988, 975, 990, 996, 994, 999, 991, 984,
       990, 987, 997, 910, 940, 775, 628], dtype=int64)

In [8]:
# Bar chart: number of examples per class.
class_bars = go.Bar(
    x=newsgroups_train['target_names'][:20],
    y=counts[:20],
    marker_color='gray',
)
fig = go.Figure(data=[class_bars])

# White background, black axis lines, and a black horizontal grid.
fig.update_layout(
    title='Liczność klas w zbiorze 20 newsgroups',
    xaxis_title='Klasa',
    yaxis_title='Liczba przykładów',
    font={
        'family': "Arial, sans-serif",
        'size': 16,
        'color': "black",
    },
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis={'linecolor': 'black', 'linewidth': 2},
    yaxis={
        'linecolor': 'black',
        'linewidth': 1,
        'showgrid': True,
        'gridcolor': 'black',
        'gridwidth': 1,
    },
)

fig.show()

In [9]:
# Total character count summed over every text, for both corpus variants.
print(f"With quotes: {sum(len(doc) for doc in newsgroups_train['data'])}")
print(f"No quotes: {sum(len(doc) for doc in newsgroups_train_noq['data'])}")

With quotes: 35855003
No quotes: 28333600


In [10]:
# Mean number of characters per text (each element of 'data' is one whole
# text, not a single sentence), for both corpus variants.
print(f"With quotes: {np.mean([len(doc) for doc in newsgroups_train['data']])}")
print(f"No quotes: {np.mean([len(doc) for doc in newsgroups_train_noq['data']])}")

With quotes: 1902.5258940889314
No quotes: 1503.4277830839435


In [11]:
# Token count of every text in the corpus with quotes kept.
# Each text is lower-cased before being split with NLTK's word_tokenize.
token_lens = [
    len(word_tokenize(doc.lower()))
    for doc in newsgroups_train['data']
]
# Corpus-wide token total.
all_tokens = sum(token_lens)
all_tokens

7644249

In [12]:
# Token count of every text in the quote-stripped corpus.
# Each text is lower-cased before being split with NLTK's word_tokenize.
token_lens_noq = [
    len(word_tokenize(doc.lower()))
    for doc in newsgroups_train_noq['data']
]
# Corpus-wide token total.
all_tokens_noq = sum(token_lens_noq)
all_tokens_noq

6014264

In [13]:
# Mean number of tokens per text, for both corpus variants.
print(f"With quotes: {np.mean(token_lens)}")
print(f"No quotes: {np.mean(token_lens_noq)}")

With quotes: 405.616523400191
No quotes: 319.1268173617744


In [14]:
# Length of the shortest text, in tokens, for both corpus variants.
print(f"With quotes: {min(token_lens)}")
print(f"No quotes: {min(token_lens_noq)}")

With quotes: 17
No quotes: 17


In [15]:
# Length of the longest text, in tokens, for both corpus variants.
print(f"With quotes: {max(token_lens)}")
print(f"No quotes: {max(token_lens_noq)}")

With quotes: 79266
No quotes: 78909


In [16]:
# Order both length lists ascending. The slices taken in later cells
# ([-15:], [:18000], [:-300]) rely on this ordering.
# After this point the lists no longer line up index-for-index with
# the texts in newsgroups_train['data'] / newsgroups_train_noq['data'].
token_lens = sorted(token_lens)
token_lens_noq = sorted(token_lens_noq)

In [17]:
# Token counts of the 15 longest texts (the list was sorted ascending above).
token_lens[-15:]

[28477,
 28659,
 29116,
 29519,
 29530,
 29653,
 29870,
 29902,
 29997,
 30214,
 30417,
 30670,
 34300,
 35987,
 79266]

In [18]:
# Histogram of token counts for the 18000 shortest texts (quotes kept).
# token_lens is already sorted ascending, so [:18000] leaves out the longest ones.
length_hist = go.Histogram(x=token_lens[:18000])
fig = go.Figure(data=[length_hist])

# Fixed axis ranges, white background, black axis lines and grid.
fig.update_layout(
    title='Histogram długości 18000 najkrótszych tekstów w zbiorze 20 newsgroups (z cytatami)',
    xaxis_title='Długość tekstu (liczba tokenów)',
    yaxis_title='Liczność',
    font={
        'family': "Arial, sans-serif",
        'size': 16,
        'color': "black",
    },
    xaxis_range=[0, 1100],
    yaxis_range=[0, 1000],
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis={'linecolor': 'black', 'linewidth': 1},
    yaxis={
        'linecolor': 'black',
        'linewidth': 1,
        'showgrid': True,
        'gridcolor': 'black',
        'gridwidth': 1,
    },
)

fig.show()

In [35]:
# Histogram of token counts for the 18000 shortest texts (quotes removed).
# token_lens_noq is already sorted ascending, so [:18000] leaves out the longest ones.
length_hist_noq = go.Histogram(x=token_lens_noq[:18000], marker_color='green')
fig = go.Figure(data=[length_hist_noq])

# Fixed axis ranges, white background, black axis lines and grid.
fig.update_layout(
    title='Histogram długości 18000 najkrótszych tekstów w zbiorze 20 newsgroups (bez cytatów)',
    xaxis_title='Długość tekstu (liczba tokenów)',
    yaxis_title='Liczność',
    font={
        'family': "Arial, sans-serif",
        'size': 16,
        'color': "black",
    },
    xaxis_range=[0, 1100],
    yaxis_range=[0, 1000],
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis={'linecolor': 'black', 'linewidth': 1},
    yaxis={
        'linecolor': 'black',
        'linewidth': 1,
        'showgrid': True,
        'gridcolor': 'black',
        'gridwidth': 1,
    },
)

fig.show()

In [20]:
# # Boxplot
# fig = go.Figure(data=[go.Box(x=token_lens[:-100])])

# # Update layout
# fig.update_layout(
#     title='Boxplot',
#     xaxis_title='Długość tekstu (liczba tokenów)',
#         font=dict(
#         family="Arial, sans-serif",
#         size=16,
#         color="black"
#     ),
# )

# # Show plot
# fig.show()

In [37]:
# Side-by-side box plots of per-text token counts, with and without quotes.
# Both lists were sorted ascending in an earlier cell, so [:-300] drops the
# 300 longest texts from each — presumably to keep extreme outliers from
# dominating the plot.
fig = go.Figure()

# Corpus with quotes kept.
fig.add_trace(go.Box(x=token_lens[:-300], name='Z cytatami'))

# Corpus with quotes removed.
fig.add_trace(go.Box(x=token_lens_noq[:-300], name='Bez cytatów', marker_color='green'))

# White background, black axis lines and grid, shared font settings.
fig.update_layout(
    title='Wykresy pudełkowe długości sentencji zbioru 20 newsgroups',
    xaxis_title='Długość (w tokenach)',
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(linecolor='black', linewidth=1),
    yaxis=dict(linecolor='black', linewidth=1, showgrid=True, gridcolor='black', gridwidth=1),
    font=dict(
        family="Arial, sans-serif",
        size=16,
        color="black"
    ),
)

# Show plot
fig.show()

### 20newsgroups to TF-IDF and LSA 
(save to files)

In [27]:
# from experiments.preprocess import TextPrep
# from torch import save
# from tqdm import tqdm

In [28]:
# cats = ['comp.graphics', 'sci.med', 'rec.sport.hockey']
# newsgroups_train = fetch_20newsgroups(subset='all', categories=cats, remove=('headers', 'footers', 'quotes'))

In [29]:
# for i in tqdm([500, 1000, 2000, 5000]):
#     preprocessor = TextPrep(svd_components=i, max_features=None)
#     datafile = f"data/data_lsa_03-cats_{i}-comp.pt"
#     targetfile = f"data/target_lsa_03-cats_{i}-comp.pt"

#     lsa_data_x, lsa_data_y = preprocessor.preprocess_dataset(newsgroups_train, lsa=True, spikes=False)
#     save(lsa_data_x, datafile)
#     save(lsa_data_y, targetfile)

In [30]:
# newsgroups_train = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [31]:
# for i in tqdm([500, 1000, 2000, 5000, 8000]):
#     preprocessor = TextPrep(svd_components=i, max_features=None)
#     datafile = f"data/data_lsa_20-cats_{i}-comp.pt"
#     targetfile = f"data/target_lsa_20-cats_{i}-comp.pt"

#     lsa_data_x, lsa_data_y = preprocessor.preprocess_dataset(newsgroups_train, lsa=True, spikes=False)
#     save(lsa_data_x, datafile)
#     save(lsa_data_y, targetfile)