**This is the simplified Opensantions data in csv format**

In [1]:
#import packages
import requests
import pandas as pd
import numpy as np
import datetime
import os
import re

# Retrieval of the data

In [2]:
#test
url = "https://data.opensanctions.org/datasets/20231224/peps/targets.simple.csv"
response = requests.get(url)
print(response.status_code)

200


In [3]:
#create urls for all relevant dates
date_list = pd.date_range(start='20230101',end='20231231',freq='D').strftime('%Y%m%d')
date_list

Index(['20220101', '20220102', '20220103', '20220104', '20220105', '20220106',
       '20220107', '20220108', '20220109', '20220110',
       ...
       '20221222', '20221223', '20221224', '20221225', '20221226', '20221227',
       '20221228', '20221229', '20221230', '20221231'],
      dtype='object', length=365)

In [4]:
#get all the urls for all the dates we need the data for
#make each day an individual list so we can compare and match them
websites = []

for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/peps/targets.simple.csv'
    websites.append(test)
print(websites)

['https://data.opensanctions.org/datasets/20220101/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220102/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220103/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220104/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220105/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220106/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220107/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220108/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220109/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220110/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220111/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220112/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20220113/peps/targets.simple.csv', 'https://da

#first check for missing days
for site in websites:
    try:
        response = requests.get(site)
        response.raise_for_status()  # Raises an HTTPError for bad response
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {site}: {e}")
        continue

In [5]:
#now loop over the batches to get all the data and append the dataframes to one another
#this gives us a list of dataframes
#the loop must contain a date identifyer as a new column so each dataframe is marked with the retrieval date that is then needed to match the different days against each other
peps_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = pd.read_csv(site, low_memory=False)
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.group(1)
        data['date_stamp'] = date #set an individual date stamp for each dataframe matching the url date
        #ata =  data.set_index('date_stamp') #set as an index
    peps_list.append(data)
res = pd.concat(peps_list)  # concatenate list of dataframes

In [6]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp
0,NK-232XviNCU8o2PiPCQC3Cg2,Person,AMASSOUMOU NDIFOANE,,1951-10-24,cm,,,,,,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101
1,NK-285BXbQUroosx6WdAP44YD,Person,ZONDOL HERSSESSE,,1967-01-01,cm,,,,,Zondol@assnat.cm,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101
2,NK-288EBmCdPXppFYtfmcgyqg,Person,Fethiye Özver,,,tr,,,,,,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101
3,NK-2AaUhJdZeBxnxVKVF7fTVv,Person,Marica Montemaggi,"Montemaggi, Marica",1982-07-20,sm,,,,,,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101
4,NK-2Nhqi8R9tL2Hz9P9FpsR2W,Person,BANLOG Polycarpe,Polycarpe BANLOG,1957-06-17,cm,,,,,polycarpebanlog@assnat.cm,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221089,us-cia-zimbabwe-kembo-mohadi-vice-pres,Person,Kembo MOHADI,,,zw,,,,,,CIA World Leaders,2022-02-01 01:29:40,2021-07-26 11:55:45,20221231
221090,us-cia-zimbabwe-kirsty-coventry-min-of-youth-s...,Person,Kirsty COVENTRY,,,zw,,,,,,CIA World Leaders,2022-02-01 01:29:40,2021-07-26 11:55:45,20221231
221091,us-cia-zimbabwe-mthuli-ncube-min-of-finance-ec...,Person,Mthuli NCUBE,,,zw,,,,,,CIA World Leaders,2022-02-01 01:29:40,2021-07-26 11:55:45,20221231
221092,us-cia-zimbabwe-sekai-nzenza-min-of-industry-c...,Person,Sekai NZENZA,,,zw,,,,,,CIA World Leaders,2022-02-01 01:29:40,2021-07-26 11:55:45,20221231


# alternatively, get the names as txt file

import requests
import re


websites = []

# Construct list of URLs based on date_list
for date in date_list:
    test = f'https://data.opensanctions.org/datasets/{date}/peps/names.txt'
    websites.append(test)

peps_data = []  # List to store the text data and date identifier

# Regex pattern to extract date from the URL
date_pattern = r'/datasets/(\d{8})/'

# Iterate over each website URL
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    
    data = response.text
    
    # Extract date from the URL
    match = re.search(date_pattern, site)
    if match:
        date_identifier = match.group(1)  # Extract the date from the URL
        peps_data.append({'date': date_identifier, 'text': data})  # Append data along with the date identifier

print(peps_data)  # Output the list of text data along with the corresponding date identifier


#url_txt = "https://data.opensanctions.org/datasets/20240411/peps/names.txt"
websites = []
data = []
date = []
for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/peps/names.txt'
    websites.append(test)
data = []
peps_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = response.text
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.Group(1)
        date.append(data)
        #data_full = dict(zip(date,data)) #set an individual date stamp for each dataframe matching the url date
        #data =  data.set_index('date_stamp') #set as an index
    peps_list.append(data)
peps_list

def compress_file(input_file, output_zip):
    with zipfile.ZipFile(output_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(input_file)

# Example usage:
input_file = 'peps_list.txt'
output_zip = 'peps_text.zip'
compress_file(input_file, output_zip)


# Match the dates against each other

In [7]:
#now match the different dates against each other and return an indicator per row for a new listing or delisting
# Sort the DataFrame based on date_stamp and id
res.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res.duplicated(subset=['id'], keep='first')
deletions = ~res.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Set the values of new_entry, deletion, and unchanged columns
res['new_entry'] = new_entries
res['deletion'] = deletions
res['unchanged'] = unchanged_rows

# Reset index for the final result
res.reset_index(drop=True, inplace=True)

In [8]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged
0,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-19 01:27:13,20220119,True,False,False
1,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-20 01:33:01,20220120,False,False,True
2,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-21 01:28:08,20220121,False,False,True
3,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-22 01:26:45,20220122,False,False,True
4,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-23 01:28:03,20220123,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55493126,us-cia-zimbabwe-winston-chitando-min-of-mines-...,Person,Winston CHITANDO,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-15 08:55:59,20220115,False,False,True
55493127,us-cia-zimbabwe-winston-chitando-min-of-mines-...,Person,Winston CHITANDO,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-17 07:40:14,20220117,False,False,True
55493128,us-cia-zimbabwe-winston-chitando-min-of-mines-...,Person,Winston CHITANDO,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-18 01:30:55,20220118,False,False,True
55493129,us-cia-zimbabwe-winston-chitando-min-of-mines-...,Person,Winston CHITANDO,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-19 01:27:13,20220119,False,False,True


In [9]:
#test thematching function
test = res.query('new_entry == True')
test

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged
0,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-19 01:27:13,20220119,True,False,False
347,NK-232XviNCU8o2PiPCQC3Cg2,Person,AMASSOUMOU NDIFOANE,,1951-10-24,cm,,,,,,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False
440,NK-24isa6tapAYCqtEnqbUdzv,Person,DEMIDOVICH VASILIJ,Demidovich Vasili Nikolaevich;Demidovich Vasil...,1961-12-24,by,,,,,demidovich@house.gov.by,Every Politician;RuPEP Public Database of PEPs...,2022-06-22 18:15:31,2021-07-26 11:55:45,20220622,True,False,False
633,NK-25Z3H4pijwnkYMmpybVYzQ,Person,Мария Владимировна Пирогова,Mariya Vladimirovna Pirogova;Pirogova Mariia V...,1993-05-13,ru;ua,,3410105889,,,,RuPEP Public Database of PEPs in Russia and Be...,2022-11-13 18:15:42,2022-11-12 06:20:00,20221113,True,False,False
665,NK-26h6Ukv23GDmJUyhgmK5ML,Person,Aleksandr Aleksandrovich Zhuravlev,Александр Александрович Журавлёв,,,,,,,,RuPEP Public Database of PEPs in Russia and Be...,2022-05-21 18:14:04,2022-05-21 06:15:45,20220521,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55492105,us-cia-zimbabwe-obadiah-moyo-min-of-health-chi...,Person,Obadiah MOYO,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False
55492186,us-cia-zimbabwe-oppah-muchinguri-kashiri-min-o...,Person,Oppah MUCHINGURI-KASHIRI,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False
55492384,us-cia-zimbabwe-sekai-nzenza-min-of-industry-c...,Person,Sekai NZENZA,,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False
55492748,us-cia-zimbabwe-sibusiso-moyo-lt-gen-ret-min-o...,Person,"Sibusiso MOYO, Lt. Gen. (Ret.)",,,zw,,,,,,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False


In [10]:
# Add a 'month' column
res['month'] = pd.to_datetime(res['date_stamp']).dt.to_period('M')
#get a count of all listings per month
#res['listing_count'] = 0  # Initialize the 'listing_count' column with zeros
#res['listing_count'] = res.groupby('month')['id'].transform('nunique')
#res

# Deduplicate

In [11]:
res_deduplicated = res.copy()
res_deduplicated.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res_deduplicated.duplicated(subset=['id'], keep='first')
deletions = ~res_deduplicated.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Create new columns for first seen and last seen dates
res_deduplicated['listing_date'] = res_deduplicated['date_stamp'].where(new_entries)
res_deduplicated['delisting_date'] = res_deduplicated['date_stamp'].where(deletions)

# Deduplicate the entries (keep the first occurrence for each entity)
res_deduplicated = res_deduplicated.drop_duplicates(subset=['id'], keep='first')

# Reset index for the final result
res_deduplicated.reset_index(drop=True, inplace=True)

# Print or further analyze the deduplicated DataFrame 'res_deduplicated'
print(res_deduplicated)

                                                       id  schema  \
0                               NK-22qMyyFP4bD3QhZ7BAFXs6  Person   
1                               NK-232XviNCU8o2PiPCQC3Cg2  Person   
2                               NK-24isa6tapAYCqtEnqbUdzv  Person   
3                               NK-25Z3H4pijwnkYMmpybVYzQ  Person   
4                               NK-26h6Ukv23GDmJUyhgmK5ML  Person   
...                                                   ...     ...   
237243  us-cia-zimbabwe-obadiah-moyo-min-of-health-chi...  Person   
237244  us-cia-zimbabwe-oppah-muchinguri-kashiri-min-o...  Person   
237245  us-cia-zimbabwe-sekai-nzenza-min-of-industry-c...  Person   
237246  us-cia-zimbabwe-sibusiso-moyo-lt-gen-ret-min-o...  Person   
237247  us-cia-zimbabwe-winston-chitando-min-of-mines-...  Person   

                                      name  \
0                        SAKAMOTO Tetsushi   
1                      AMASSOUMOU NDIFOANE   
2                       DEMIDOVIC

In [12]:
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged,month,listing_date,delisting_date
0,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,...,CIA World Leaders,2021-07-26 11:55:45,2022-01-19 01:27:13,20220119,True,False,False,2022-01,20220119,
1,NK-232XviNCU8o2PiPCQC3Cg2,Person,AMASSOUMOU NDIFOANE,,1951-10-24,cm,,,,,...,Every Politician,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,
2,NK-24isa6tapAYCqtEnqbUdzv,Person,DEMIDOVICH VASILIJ,Demidovich Vasili Nikolaevich;Demidovich Vasil...,1961-12-24,by,,,,,...,Every Politician;RuPEP Public Database of PEPs...,2022-06-22 18:15:31,2021-07-26 11:55:45,20220622,True,False,False,2022-06,20220622,
3,NK-25Z3H4pijwnkYMmpybVYzQ,Person,Мария Владимировна Пирогова,Mariya Vladimirovna Pirogova;Pirogova Mariia V...,1993-05-13,ru;ua,,3410105889,,,...,RuPEP Public Database of PEPs in Russia and Be...,2022-11-13 18:15:42,2022-11-12 06:20:00,20221113,True,False,False,2022-11,20221113,
4,NK-26h6Ukv23GDmJUyhgmK5ML,Person,Aleksandr Aleksandrovich Zhuravlev,Александр Александрович Журавлёв,,,,,,,...,RuPEP Public Database of PEPs in Russia and Be...,2022-05-21 18:14:04,2022-05-21 06:15:45,20220521,True,False,False,2022-05,20220521,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237243,us-cia-zimbabwe-obadiah-moyo-min-of-health-chi...,Person,Obadiah MOYO,,,zw,,,,,...,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,
237244,us-cia-zimbabwe-oppah-muchinguri-kashiri-min-o...,Person,Oppah MUCHINGURI-KASHIRI,,,zw,,,,,...,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,
237245,us-cia-zimbabwe-sekai-nzenza-min-of-industry-c...,Person,Sekai NZENZA,,,zw,,,,,...,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,
237246,us-cia-zimbabwe-sibusiso-moyo-lt-gen-ret-min-o...,Person,"Sibusiso MOYO, Lt. Gen. (Ret.)",,,zw,,,,,...,CIA World Leaders,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,


# Descriptives and clean up

In [13]:
#the countries are coded as iso2 codes -> transform to iso 3
#keep both countries? could search for "ru" over regex functions
print(res_deduplicated.countries.unique())
print(res_deduplicated.schema.unique())
print(res_deduplicated.listing_date.unique())

['jp' 'cm' 'by' 'ru;ua' nan 'ru' 'tr' 'sm' 'il' 'de' 'kh' 'kp' 'al' 'sy'
 'mm' 'nz' 'be' 'bb' 'ls' 'sg' 'ir' 'bi' 'fj' 'pl' 'lk' 'me' 'bm' 'ua'
 'ck' 'x-so' 'az;ru' 'ao' 'ht' 'ae' 'cn' 'li' 'vu' 'ge' 'tm' 'tt' 'lv'
 've' 'xk' 'bo' 'cy' 'ni' 'bg' 'cd' 'in' 'tl' 'ws' 'de;fr;ru' 'om' 'se'
 'mr' 'ie' 'lu' 'kz;ru' 'vc' 'tn' 'ad' 'at' 'zw' 'au' 'sa' 'ag' 'si'
 'cy;ru' 'cv' 'az' 'hr' 'vg' 'gr' 'na' 'sc' 'cg' 'ml' 'by;de' 'to' 'ss'
 'gg' 'fi' 'pg' 'ru;us' 'la' 'ru;uz' 'gd' 'ba' 'rs' 'dk' 'bn' 'pk' 'dj'
 'fm' 'by;ru' 'kw' 'th' 'is' 'nu' 'nr' 'dm' 'sk' 'ly' 'uz' 'mn' 'fr' 'ne'
 'mv' 'ki' 'mu' 'nl' 'de;ru' 'bz' 'bt' 'cy-trnc' 'hn' 'ca' 'tw' 'dz' 'gn'
 'kr' 'jm' 'gt' 'qa' 'jo' 'ma' 'gw' 'mg' 'es' 'ro' 'ad;fr' 'bs' 'cu' 'ga'
 'hu' 'vn' 'pw' 'cz;ru;sk' 'co' 'tv' 'by;ua' 'gq' 'ar;ru;us' 'mz' 'np'
 'mt' 'bd' 'sn' 'sb' 'gh' 'lv;ru' 'mk' 'ci' 'mc' 'pt' 'ge;ru' 'mh' 'bf'
 'bw' 'kg;ru' 'lt' 'dd;ru' 'sz' 'gy' 'kg' 'km' 'et;za' 'py' 'ye' 'ng' 'mx'
 'so-som' 'md;ru' 'tg' 'pm' 'rw' 'pa' 'st' 'tz' 'sd' 'ee' 'z

In [14]:
#get missingness
is_null = res_deduplicated.isnull().sum()
display(is_null)

id                     0
schema                 0
name                   0
aliases           111668
birth_date         90596
countries           5812
addresses         236623
identifiers        67049
sanctions         237248
phones            232548
emails            227548
dataset                0
last_seen              0
first_seen             0
date_stamp             0
new_entry              0
deletion               0
unchanged              0
month                  0
listing_date           0
delisting_date    232113
dtype: int64

In [15]:
res_deduplicated.columns

Index(['id', 'schema', 'name', 'aliases', 'birth_date', 'countries',
       'addresses', 'identifiers', 'sanctions', 'phones', 'emails', 'dataset',
       'last_seen', 'first_seen', 'date_stamp', 'new_entry', 'deletion',
       'unchanged', 'month', 'listing_date', 'delisting_date'],
      dtype='object')

In [16]:
res_deduplicated['new_listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res_deduplicated['new_listing_count'] = res_deduplicated.groupby('month')['new_entry'].transform('nunique')
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged,month,listing_date,delisting_date,new_listing_count
0,NK-22qMyyFP4bD3QhZ7BAFXs6,Person,SAKAMOTO Tetsushi,,,jp,,,,,...,2021-07-26 11:55:45,2022-01-19 01:27:13,20220119,True,False,False,2022-01,20220119,,1
1,NK-232XviNCU8o2PiPCQC3Cg2,Person,AMASSOUMOU NDIFOANE,,1951-10-24,cm,,,,,...,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,,1
2,NK-24isa6tapAYCqtEnqbUdzv,Person,DEMIDOVICH VASILIJ,Demidovich Vasili Nikolaevich;Demidovich Vasil...,1961-12-24,by,,,,,...,2022-06-22 18:15:31,2021-07-26 11:55:45,20220622,True,False,False,2022-06,20220622,,1
3,NK-25Z3H4pijwnkYMmpybVYzQ,Person,Мария Владимировна Пирогова,Mariya Vladimirovna Pirogova;Pirogova Mariia V...,1993-05-13,ru;ua,,3410105889,,,...,2022-11-13 18:15:42,2022-11-12 06:20:00,20221113,True,False,False,2022-11,20221113,,1
4,NK-26h6Ukv23GDmJUyhgmK5ML,Person,Aleksandr Aleksandrovich Zhuravlev,Александр Александрович Журавлёв,,,,,,,...,2022-05-21 18:14:04,2022-05-21 06:15:45,20220521,True,False,False,2022-05,20220521,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237243,us-cia-zimbabwe-obadiah-moyo-min-of-health-chi...,Person,Obadiah MOYO,,,zw,,,,,...,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,,1
237244,us-cia-zimbabwe-oppah-muchinguri-kashiri-min-o...,Person,Oppah MUCHINGURI-KASHIRI,,,zw,,,,,...,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,,1
237245,us-cia-zimbabwe-sekai-nzenza-min-of-industry-c...,Person,Sekai NZENZA,,,zw,,,,,...,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,,1
237246,us-cia-zimbabwe-sibusiso-moyo-lt-gen-ret-min-o...,Person,"Sibusiso MOYO, Lt. Gen. (Ret.)",,,zw,,,,,...,2021-07-26 11:55:45,2022-01-01 03:04:33,20220101,True,False,False,2022-01,20220101,,1


In [17]:
#res_deduplicated = res_deduplicated.drop(['last_seen', 'first_seen','new_entry', 'deletion','unchanged'],axis = 1)

save and zip the file

In [22]:
import zipfile
import io
import pickle

def compress_df_to_zip(df, zip_filename, df_filename):
    # Create an in-memory binary stream
    buffer = io.BytesIO()

    # Serialize the DataFrame using pickle
    pickle.dump(df, buffer)

    # Reset the buffer position to the beginning
    buffer.seek(0)

    # Create a zip file and add the serialized DataFrame to it
    with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        zip_file.writestr(df_filename, buffer.getvalue())

# Example usage:
# Assuming df is your Pandas DataFrame
df = res

# Specify the filenames
zip_filename = 'data_pep3.zip'
df_filename = 'data_pep3.pkl'

# Compress the DataFrame and save it to a zip file
compress_df_to_zip(df, zip_filename, df_filename)


OSError: [Errno 28] No space left on device

In [21]:
#res.to_pickle('pep3.zip')

OSError: [Errno 28] No space left on device

In [19]:
res.to_csv("res.csv")

OSError: [Errno 28] No space left on device

In [None]:
#import zipfile

#def compress_csv(input_csv, output_zip):
   # with zipfile.ZipFile(output_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        #zipf.write(input_csv)
#input_csv = 'res.csv'
#output_zip = 'peps_csv2.zip'
#compress_csv(input_csv, output_zip)