# 20 Newsgroups

## Comp 5 dataset

In [8]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Load the five comp.* newsgroups with headers, footers and quoted replies removed.
comp_categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
]
data = fetch_20newsgroups(
    subset='all',
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
    categories=comp_categories,
)
X, y = data.data, data.target
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

# Unigram + bigram counts, then TF-IDF weighting, fitted on the whole corpus.
vectorizer_params = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'stop_words': 'english',
}
vectorizer = Pipeline(steps=[
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
])
X = vectorizer.fit_transform(X)

# Binarise the targets in place: class index 2 becomes 1, every other class 0.
# NOTE(review): index 2 is presumably comp.sys.ibm.pc.hardware (if target_names
# is sorted alphabetically) -- confirm.
msq = y == 2
y[:] = np.where(msq, 1, 0)
np.unique(y, return_counts=True), X.shape

4891 documents
5 categories


((array([0, 1]), array([3909,  982])), (4891, 13446))

In [None]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [49]:
# Print the class counts among the first 100 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:100]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([88, 12]))
(array([0, 1]), array([85, 15]))
(array([0, 1]), array([75, 25]))
(array([0, 1]), array([79, 21]))
(array([0, 1]), array([80, 20]))
(array([0, 1]), array([84, 16]))
(array([0, 1]), array([80, 20]))
(array([0, 1]), array([84, 16]))
(array([0, 1]), array([88, 12]))
(array([0, 1]), array([85, 15]))
(array([0, 1]), array([81, 19]))
(array([0, 1]), array([80, 20]))
(array([0, 1]), array([83, 17]))
(array([0, 1]), array([80, 20]))
(array([0, 1]), array([83, 17]))
(array([0, 1]), array([84, 16]))
(array([0, 1]), array([78, 22]))
(array([0, 1]), array([77, 23]))
(array([0, 1]), array([84, 16]))
(array([0, 1]), array([88, 12]))


In [50]:
# Persist the comp5 partitions. The lists are passed positionally, so each
# archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitcomp5', ids_train)
np.savez_compressed('splits/tstsplitcomp5', ids_test)

## baseball-hockey

In [15]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Load the two sports newsgroups with headers, footers and quoted replies removed.
sport_categories = ['rec.sport.baseball', 'rec.sport.hockey']
data = fetch_20newsgroups(
    subset='all',
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
    categories=sport_categories,
)
X, y = data.data, data.target
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

# Unigram + bigram counts, then TF-IDF weighting, fitted on the whole corpus.
vectorizer_params = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'stop_words': 'english',
}
vectorizer = Pipeline(steps=[
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
])
X = vectorizer.fit_transform(X)

# The task is already binary, so the targets are used as loaded.
np.unique(y, return_counts=True), X.shape

1993 documents
2 categories


((array([0, 1]), array([994, 999])), (1993, 5724))

In [60]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [61]:
# Print the class counts among the first 10 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:10]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([5, 5]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([2, 8]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([5, 5]))


In [62]:
# Persist the baseball-hockey partitions. The lists are passed positionally,
# so each archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitbase_hock', ids_train)
np.savez_compressed('splits/tstsplitbase_hock', ids_test)

## pc-mac

In [23]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Load the PC and Mac hardware newsgroups with headers, footers and quoted
# replies removed.
hardware_categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
data = fetch_20newsgroups(
    subset='all',
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
    categories=hardware_categories,
)
X, y = data.data, data.target
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

# Unigram + bigram counts, then TF-IDF weighting, fitted on the whole corpus.
vectorizer_params = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'stop_words': 'english',
}
vectorizer = Pipeline(steps=[
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
])
X = vectorizer.fit_transform(X)

# The task is already binary, so the targets are used as loaded.
np.unique(y, return_counts=True), X.shape

1945 documents
2 categories


((array([0, 1]), array([982, 963])), (1945, 3868))

In [68]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [69]:
# Print the class counts among the first 10 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:10]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([2, 8]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([7, 3]))


In [70]:
# Persist the pc-mac partitions. The lists are passed positionally, so each
# archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitpc_mac', ids_train)
np.savez_compressed('splits/tstsplitpc_mac', ids_test)

## religion-atheism

In [26]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Load the two religion newsgroups plus alt.atheism with headers, footers and
# quoted replies removed.
religion_categories = ['soc.religion.christian', 'talk.religion.misc', 'alt.atheism']
data = fetch_20newsgroups(
    subset='all',
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
    categories=religion_categories,
)
X, y = data.data, data.target
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

# Unigram + bigram counts, then TF-IDF weighting, fitted on the whole corpus.
vectorizer_params = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'stop_words': 'english',
}
vectorizer = Pipeline(steps=[
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
])
X = vectorizer.fit_transform(X)

# Binarise the targets in place: class index 2 becomes 1, every other class 0.
# NOTE(review): index 2 is presumably talk.religion.misc (if target_names is
# sorted alphabetically), which would group alt.atheism with
# soc.religion.christian in class 0 -- confirm this is the intended
# religion-vs-atheism labelling.
msq = y == 2
y[:] = np.where(msq, 1, 0)
np.unique(y, return_counts=True), X.shape

2424 documents
3 categories


((array([0, 1]), array([1796,  628])), (2424, 7829))

In [74]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [75]:
# Print the class counts among the first 10 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:10]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))


In [76]:
# Persist the religion-atheism partitions. The lists are passed positionally,
# so each archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitrel_ath', ids_train)
np.savez_compressed('splits/tstsplitrel_ath', ids_test)

# Digits dataset

## one_two

In [29]:
from sklearn.datasets import load_digits
import numpy as np

# Keep only the digits 1 and 2 and turn them into a binary task.
data = load_digits()
X, y = data.data, data.target
msq = (y == 1) | (y == 2)
X = X[msq]
y = y[msq]  # boolean indexing copies, so data.target itself is not modified
# Digit 1 already carries label 1; only digit 2 has to be mapped to 0.
y[y == 2] = 0
np.unique(y, return_counts=True), X.shape

((array([0, 1]), array([177, 182])), (359, 64))

In [30]:
# One throwaway 70/30 split, used only to display the resulting partition sizes.
sample_ids = np.arange(len(y))
id_train, id_test = train_test_split(sample_ids, test_size=.3)
len(id_train), len(id_test)

(251, 108)

In [3]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [4]:
# Print the class counts among the first 10 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:10]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([8, 2]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([8, 2]))


In [5]:
# Persist the one-vs-two partitions. The lists are passed positionally, so
# each archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitone_two', ids_train)
np.savez_compressed('splits/tstsplitone_two', ids_test)

## odd_even

In [91]:
from sklearn.datasets import load_digits

# Binary task over all digits: even digits (0, 2, 4, 6, 8) -> 1, odd digits -> 0.
data = load_digits()
X, y = data.data, data.target
msq = np.isin(y, (0, 2, 4, 6, 8))
# Relabel in place, so data.target is updated as well.
y[:] = np.where(msq, 1, 0)
np.unique(y, return_counts=True)

(array([0, 1]), array([906, 891]))

In [92]:
from sklearn.model_selection import train_test_split

# Draw 20 independent 70/30 train/test partitions over the sample indices.
# Each repetition is seeded with its own index so that re-running this cell
# regenerates exactly the same partitions; without a seed every run produced
# different splits and silently changed the files saved from them.
ids_train = []
ids_test = []

all_ids = np.arange(len(y))
for i in range(20):
    id_train, id_test = train_test_split(all_ids, test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [93]:
# Print the class counts among the first 10 training indices of each split.
for i in range(20):
    first_ids = ids_train[i][:10]
    classes, counts = np.unique(y[first_ids], return_counts=True)
    print((classes, counts))

(array([0, 1]), array([5, 5]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([4, 6]))


In [94]:
# Persist the odd-even partitions. The lists are passed positionally, so each
# archive stores its data under NumPy's default key 'arr_0'.
# savez_compressed does not create missing parent directories, so make sure
# the output folder exists instead of failing with FileNotFoundError.
os.makedirs('splits', exist_ok=True)
np.savez_compressed('splits/trsplitodd_even', ids_train)
np.savez_compressed('splits/tstsplitodd_even', ids_test)