In [17]:
%reload_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
import requests
import sklearn.datasets
import sklearn.model_selection

from sarpu.data_processing import *
from sarpu.paths_and_names import *

# Input Data

In [18]:
# Names and locations
# Both values are passed to the path helpers used throughout this notebook.
data_name = "20ng"           # identifier of the dataset being prepared
data_folder = "../../Data/"  # root folder, relative to this notebook

In [19]:
# Creation information
# Settings consumed by the stratified shuffle split further down.
test_size = 0.2  # fraction of the instances held out as test partition
nb_splits = 5    # number of train/test partitions to generate

# Prepare folders

In [20]:
# Prepare folders
# Resolve the three dataset folders (original download, processed data,
# train/test partitions) through the project path helpers.
data_folder_original = original_data_path(data_folder, data_name)
data_folder_processed = processed_data_path(data_folder, data_name)
data_folder_partitions = partitions_data_path(data_folder, data_name)

# Create them in pure Python instead of `!mkdir -p $path`: the shell escape
# only works on POSIX systems and interpolates the path unquoted, so a path
# containing whitespace would be split into several arguments.
# NOTE(review): assumes the path helpers return str / os.PathLike values
# (they were previously interpolated into a shell command) - confirm.
for folder in (data_folder_original, data_folder_processed, data_folder_partitions):
    os.makedirs(folder, exist_ok=True)  # like `mkdir -p`: parents created, existing dir is fine

# Download

In [21]:
from sklearn.datasets import fetch_20newsgroups

# Substrings identifying the positive and the negative newsgroup family.
pos = "comp"
neg = "rec"

# Only the "comp.*" and "rec.*" groups are fetched. Left out on purpose:
# alt.atheism, misc.forsale, sci.crypt, sci.electronics, sci.med, sci.space,
# soc.religion.christian, talk.politics.guns, talk.politics.mideast,
# talk.politics.misc, talk.religion.misc
categories = [
    # positive family
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    # negative family
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
]

# Fetch train and test subsets together, caching under the "original" data
# folder, with headers, footers and quoted replies stripped from every post.
original_data = fetch_20newsgroups(
    data_home=data_folder_original,
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    download_if_missing=True,
)

In [22]:
# Make binary bag-of-words dataset
# (binary=True records word presence/absence; no tf-idf weighting is applied)
from sklearn.feature_extraction.text import CountVectorizer

n_words = 200  # size of the vocabulary that is kept
vectorizer = CountVectorizer(
    analyzer="word",
    binary=True,
    max_features=n_words,
    stop_words="english",
    strip_accents="ascii",
    # A token is two or more ASCII letters: words containing digits are
    # ignored and single-letter words are dropped (length >= 2).
    token_pattern=r"(?u)\b[A-Za-z][A-Za-z]+\b",
)

# Learn the vocabulary and vectorize every document in one pass.
vectors = vectorizer.fit_transform(original_data.data)


In [23]:
# Dense (documents x words) matrix of the vectorized documents.
instances = vectors.toarray()

# Label every document from its newsgroup name: 1 when it belongs to the
# positive family, 0 for the negative family, NaN otherwise (dropped below).
# The newsgroup name is used instead of the file path because the path also
# contains the data folder, so a folder name containing `pos` or `neg` would
# silently mislabel documents.
document_categories = [original_data.target_names[idx] for idx in original_data.target]
classes = np.asarray(
    [
        1 if pos in category else 0 if neg in category else np.nan  # np.NaN alias is gone in NumPy 2.0
        for category in document_categories
    ]
).reshape(-1, 1)
print(instances.shape, classes.shape)

# Append the label as the last column.
data = np.concatenate([instances, classes], axis=1)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Rows whose label could not be determined (NaN) are discarded.
df = pd.DataFrame(data, columns=feature_names + ["class"]).dropna()
df

(8870, 200) (8870, 1)


Unnamed: 0,able,access,actually,add,address,advance,ago,anybody,apple,application,...,won,work,working,works,world,wrong,year,years,yes,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
7,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
9,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [24]:
# Class distribution: number of documents per label value
class_column = df["class"]
class_column.value_counts()


1    4891
0    3979
Name: class, dtype: int64

In [25]:
#normalize
# Make sure every column is numeric before doing arithmetic on it.
df = df.apply(pd.to_numeric)

# Min-max scale every column to [-1, 1].
column_min = df.min()
# A constant column has max == min; dividing by that zero range would give
# 0/0 = NaN and end up in the saved data file. Using 1 as the range instead
# maps such a column to -1 everywhere.
column_range = (df.max() - column_min).replace(0, 1)
normalized_df = (df.astype(float) - column_min) / column_range * 2 - 1

# The label is not a feature: restore its original (unscaled) values.
normalized_df["class"] = df["class"]
df = normalized_df

df.head()

Unnamed: 0,able,access,actually,add,address,advance,ago,anybody,apple,application,...,won,work,working,works,world,wrong,year,years,yes,class
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1


In [26]:
# Move the label column to the last position

cols = df.columns.tolist()  # every column name, in current order
cols.remove("class")        # feature columns only (ValueError if "class" is missing)
df = df.loc[:, cols + ["class"]]

df.head()

Unnamed: 0,able,access,actually,add,address,advance,ago,anybody,apple,application,...,won,work,working,works,world,wrong,year,years,yes,class
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1


In [27]:
# Convert to numpy arrays (numpy is already imported at the top of the notebook)

xy = df.values

# All columns but the last are features; the last column is the label.
x, y = xy[:, :-1].astype(float), xy[:, -1].astype(int)

# Feature rows split by label.
positive_mask = y == 1
negative_mask = y == 0
x_pos = x[positive_mask]
x_neg = x[negative_mask]

In [28]:
# Save the feature matrix and the true class labels as text files
features_file = data_path(data_folder, data_name)
labels_file = classlabels_path(data_folder, data_name)

np.savetxt(features_file, x)
np.savetxt(labels_file, y, fmt="%d")  # labels are written as integers

# Different dataset partitions (train/test and class prior)

In [29]:
# Stratified shuffle splits with a fixed seed, so the partitions are reproducible.
sss = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=nb_splits,
    test_size=test_size,
    random_state=0,
)
# Materialize the (train indices, test indices) pairs so they can be reused.
splits = [(train_idx, test_idx) for train_idx, test_idx in sss.split(x, y)]

In [30]:
# Save one file per split. Encoding per instance:
# 0 = not in data, 1 = in train partition, 2 = in test partition

for split_nr, (train_idx, test_idx) in enumerate(splits):
    partition = np.zeros(y.shape, dtype=int)  # start with every instance marked 0
    partition[train_idx] = 1
    partition[test_idx] = 2
    np.savetxt(partition_path(data_folder, data_name, split_nr), partition, fmt="%d")
   