In [1]:
# Convert the csv to many parquet files, each containing 10000 rows
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

csv_file_path = 'cleaned_file.csv'
parquet_file_prefix = 'cleaned_file.parquet/file_'
parquet_file_suffix = '.parquet'

chunk_size = 10000  # number of rows per chunk

# The parquet writer is handed a plain file path and will fail if the target
# folder is missing, so create it up front (no-op when it already exists).
os.makedirs(os.path.dirname(parquet_file_prefix) or '.', exist_ok=True)

# Stream the CSV in fixed-size chunks and write each chunk to its own Parquet file.
# NOTE(review): pandas infers dtypes separately for every chunk, so the schema of
# individual files may differ if a column is e.g. all-empty in one chunk.
for i, chunk in enumerate(pd.read_csv(csv_file_path, chunksize=chunk_size)):
    # Zero-padded chunk number so the files sort in write order (file_0000, file_0001, ...)
    ident = str(i).zfill(4)
    parquet_file_path = parquet_file_prefix + ident + parquet_file_suffix

    # Convert the chunk to an Arrow table and write it out
    table = pa.Table.from_pandas(chunk)
    pq.write_table(table, parquet_file_path)

    # '\r' keeps the progress indicator on a single line
    print("\rchunk", ident, end='')

chunk 0852

In [None]:
def wikileaks_problem(colnum, path='cleaned_file.parquet/'):
    """Count how many wikileaks.org rows consist solely of the site's Tor/Tails notice.

    Every ``*.parquet`` file directly inside ``path`` is read with a filter that
    keeps only rows whose ``domain`` column equals ``'wikileaks.org'``.  The value
    in column ``colnum`` of each such row is then compared against the fixed
    boilerplate text below.

    Args:
        colnum: Column selector passed to ``table[...]`` (the notebook calls it
            with an integer position).
        path: Directory containing the parquet files.

    Returns:
        A ``(matches, articles)`` tuple: the number of rows whose value equals the
        boilerplate text, and the total number of wikileaks.org rows inspected.
    """
    import pyarrow.parquet as pq
    from os import listdir
    from os.path import isfile, join

    # Boilerplate that appears in place of the real article body.
    text = (
        'Tor\n\nTor is an encrypted anonymising network that makes it harder to intercept '
        'internet communications, or see where communications are coming from or going '
        'to.\n\nIn order to use the WikiLeaks public submission system as detailed above '
        'you can download the Tor Browser Bundle, which is a Firefox-like browser available '
        'for Windows, Mac OS X and GNU/Linux and pre-configured to connect using the anonymising '
        'system Tor.\n\nTails\n\nIf you are at high risk and you have the capacity to do so, '
        'you can also access the submission system through a secure operating system called '
        'Tails. Tails is an operating system launched from a USB stick or a DVD that aim to '
        'leaves no traces when the computer is shut down after use and automatically routes '
        'your internet traffic through Tor. Tails will require you to have either a USB stick '
        'or a DVD at least 4GB big and a laptop or desktop computer.'
    )

    # listdir() returns entries in arbitrary order; sort so progress output is reproducible.
    onlyparquets = sorted(
        f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.parquet')
    )

    column_name = 'domain'
    search_value = 'wikileaks.org'
    matches = articles = 0

    # Filter handed to the parquet reader so only wikileaks.org rows are returned.
    filter_condition = (column_name, '==', search_value)

    for file in onlyparquets:
        # Running totals shown here are those accumulated *before* this file is processed.
        print(('checking '+file).ljust(27), f'{matches}/{articles} = ', round(100*matches/max(articles,1),2),'%','        \r', end='')
        # join() works whether or not `path` ends with a separator.
        table = pq.read_table(join(path, file), filters=[filter_condition])
        for news in table[colnum]:
            # as_py() unwraps the Arrow scalar; nulls become None and never match.
            if news.as_py() == text:
                matches += 1
            articles += 1
    return (matches, articles)

In [None]:
# Run the boilerplate check over the parquet files in 'joshdata/', comparing column 1.
# NOTE(review): wikileaks_problem returns a (matches, articles) tuple, so despite its
# name `articles` holds both counts.
articles = wikileaks_problem(1, path='joshdata/')

output: `checking chunk_684.parquet  160880/199030 =  80.83 %`

Count the number of articles per label:

In [18]:
def count_labels(path='cleaned_file.parquet/', column_name='type'):
    """Tally how often each value occurs in one column across a folder of parquet files.

    Args:
        path: Directory whose ``*.parquet`` files (non-recursive) are scanned.
        column_name: Name of the column to count; defaults to ``'type'``, the
            column this function has always read.

    Returns:
        A dict mapping each distinct value to its number of occurrences.  Missing
        or otherwise falsy values (null, empty string) are grouped under the key
        ``'empty'``.
    """
    import pyarrow.parquet as pq
    from os import listdir
    from os.path import isfile, join

    # listdir() returns entries in arbitrary order; sort so runs are reproducible.
    onlyparquets = sorted(
        f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.parquet')
    )

    labelcount = {}

    for file in onlyparquets:
        print('checking '+file,'\r', end='')
        # Only the counted column is needed, so avoid loading the rest of the file.
        # join() works whether or not `path` ends with a separator.
        table = pq.read_table(join(path, file), columns=[column_name])
        for row in table[column_name]:
            # as_py() unwraps the Arrow scalar; falsy values are bucketed as 'empty'.
            this_type = row.as_py() or 'empty'
            labelcount[this_type] = labelcount.get(this_type, 0) + 1
    # Finish the '\r'-overwritten progress line.
    print()
    return labelcount
        

In [19]:
# Tally the labels over the default parquet folder, then print one aligned
# "name: count" line per label (name left-justified, count right-justified).
labels = count_labels()
for key, value in labels.items():
    print(f"{key:<12}: {value:>10}")

checking file_0509.parquet 
unknown     :     371518
bias        :    1138998
fake        :     894746
political   :    1657224
empty       :     403211
rumor       :     481158
conspiracy  :     831235
clickbait   :     231949
reliable    :    1913222
satire      :     112948
unreliable  :     298784
junksci     :     117467
hate        :      76496
