In [None]:
import cudf
import numpy as np

# Pipeline

In [None]:
''' 
+++++ UTILITY ATTRIBUTES +++++
    
    user_code --> String --> (Anonymized) code for the customer that owns this utility
    customer_code --> String --> Combined with user_code provides a unique identifier for the utility. Even this field is anonymized
    city --> String --> City where the utility is located
    address --> String --> (Anonymized) address of the utility location
'''

In [None]:
''' 
+++++ CUSTOMER ATTRIBUTES +++++
    
    user_code --> String --> (Anonymized) code that identifies the customer
    nominative --> String --> (Anonymized) customer name
    sex --> String --> Sex of the customer. It could be ‘M’, ‘F’, ‘P’, with ‘P’ denoting that the customer is a commercial activity (VAT number)
    age --> Int --> Age of the customer, set to null for commercial activities (sex = ‘P’). Its value must be >= 18
'''

In [None]:
''' 
+++++ INVOICE ATTRIBUTES +++++
    
    bill_id --> Int --> Invoice identifier
    F1_kWh --> Float --> kWh of electricity consumed in the F1 time slot
    F2_kWh --> Float --> kWh of electricity consumed in the F2 time slot
    F3_kWh --> Float --> kWh of electricity consumed in the F3 time slot 
    date --> Date --> Start date
    light_start_date --> Date --> Start date of electricity invoice
    light_end_date --> Date --> End date of electricity invoice
    tv --> Float --> Television fee to pay
    gas_amount --> Float --> Gas fee to pay
    gas_average_cost --> Float --> Average cost of gas
    light_average_cost --> Float --> Average cost of electricity
    emission_date --> Date --> Emission date
    supply_type --> String --> Supply type (‘light’, ‘gas’, ‘gas and light’)
    gas_start_date --> Date --> Start date of gas invoice
    gas_end_date --> Date --> End date of gas invoice
    extra_fees --> Float --> Extra fees to pay
    gas_consumption --> Float --> Consumed gas
    light_consumption --> Float --> Consumed electricity
    gas_offer --> Float --> Name of the subscribed gas plan (anonymized)
    light_offer_type --> String --> Kind of plan for the electricity (‘single zone’, ‘bizone’, etc.)
    light_offer --> String --> Name of the subscribed electricity plan (anonymized)
    total_amount --> Float --> gas_amount + light_amount + extra_fees
    howmuch_pay --> Float --> Overall amount to pay, computed as total_amount + tv
    light_amount --> Float --> Amount to pay for the electricity
    average_unit_light_cost --> Float --> Average cost for electricity
    average_light_bill_cost --> Float --> Average cost for the electricity invoice
    average_unit_gas_cost --> Float --> Average cost for gas
    average_gas_bill_cost --> Float --> Average cost for the gas invoice
    billing_frequency --> String --> Billing frequency (‘monthly’, ‘quarterly’, etc.)
    bill_type --> String --> Kind of invoice (False means a “standard bill”)
    gas_system_charges --> Float --> Extra gas fees
    light_system_charges --> Float --> Extra electricity fees
    gas_material_cost --> Float --> Costs for gas
    light_transport_cost --> Float --> Extra electricity fees
    gas_transport_cost --> Float --> Extra gas fees
    light_material_cost --> Float --> Costs for electricity
'''

In [None]:
''' 
+++++ DATA INGESTION +++++
    • Read data from its source
    • Study how to deal with data that doesn’t fit in memory (!) --> Da chiede a Gagliardelli
    • Locate missing values
    • Locate outliers
    • Sort data
'''

# Loading Dataset

In [None]:
from base import *
base = BaseDfBench()

In [5]:
# READ DATA FROM ITS SOURCE
# --- remove _sample one day

# We can use bill_id as the index for our dataset --> meglio di no
# base.load_dataset('/data/invoices_sample.csv', 'csv', index_col='bill_id')

base.load_dataset('/data/invoices_sample.csv', 'csv')
df = base.get_df()
columns = base.get_columns()
df

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
0,0,0.0,0.0,0.0,CIPRESSA,C23B8FC235DF5479FB28D81E827A59B819BB0748,0298E5E66B5653CDAACADD156261A2A916A56C7E,F,81,63D286C61D20D76E9C9317BEB8644D5EE45134E4,...,,5AE3987CBA311BCAADC6274D28A7FB14B13B53C3,,False,-0.06,,0.06,,-0.2,
1,1,81.0,62.0,76.0,ROCCAVIVARA,CBE6B021F41D589269FBC724C131CBFC9905D950,FDAE2B1E0934BB76255A727CC9F3F00AA39982D9,M,38,8053C7982DE8A18746A8F9F0D1DC4B0C33CAC0FA,...,,5F88D510670063B20E789E6453BC2F3FB0BB67F7,,False,,9.15,,5.68,,23.13
2,2,0.0,0.0,0.0,VIDRACCO,BFAEB566B3BB78B9ABD4F86DB3A78BFD8489013B,FEBE7F29FB854C477AFA7073C70F0E6EA81C6B00,M,53,4F053D54B9F1B6FE565498E7FA726501FD8FDD18,...,,D4B8928760E729127EB4EB532C86127325FBA468,,False,1.24,,-10.13,,-7.13,
3,3,0.0,0.0,0.0,MEZZOLOMBARDO,CBD30C382C72ED4A3D9DC9D11C13C155930E7C66,9220A3E83BDCE622F70A1456C7898687FD3EDAC8,M,50,115C7D34A79C66AFCDB4EB65262595B197B0F861,...,"1,08 €/smc",BD4DEF66D7EF2D3D683D9C381D2359AD24B41D9C,,False,-0.88,,23.68,,17.66,
4,4,0.0,0.0,0.0,PINETO,4D17C711CADED6C1E9BD459088B4B80F24575FE7,642C21354EF6017D13220517E2A25D4A0442D4C5,M,65,C7734B97A212004CEA25956D8FDFEF068CD2B8F6,...,"0,86 €/smc",C0622AFD22384A4B54CD1ACAF7D988B89853A5AC,,False,14.71,,141.57,,63.59,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,26.0,25.0,33.0,CINTANO,2D244E5D90065985546AEA4D52F54C9705B8038B,102CADC9FC8C745FE3B8C27F7267E07C9691728A,M,55,6D3FDD0A2EEEE1A2C1AC23E6C316048DD2CCE859,...,,53D29ACDEB11F45D4F305A4D00E5F03C1F2CB674,monthly,False,,13.08,,7.62,,15.65
1996,1996,71.0,61.0,88.0,TREZZONE,A01243282C24F561D6B54F1C1FCB45AE80F75B0A,9C3B5B4093FC99569E5BBA918A977681A46F52DF,M,48,52315237533E1A6504E3795EAC1D5586744C62FD,...,,2B8D02A2AD117EC401BCEF3C0651FEEBB6609D1C,monthly,False,,9.2,,8.76,,25.8
1997,1997,37.0,71.0,100.0,RIPATRANSONE,469AB252FF0A2C60B32D0337C933E3565E88C0C2,1B451B6AA312EFF93372D44EC6C24C0110A628A1,F,32,2777855F19667FBFEDEAEF181E7ABC8584262127,...,,A1BFA0E4833E27AB398F4420414E037D250F0A7E,monthly,False,,8.7,,8.66,,24.14
1998,1998,74.0,78.0,92.0,MONSELICE,9E24BB26322BD95D11E26DFB80AA1168C9661FDC,F6BADD2456E283A8DCA6F3FF640724072108BAEF,F,54,8DC8820FB50005ED9CC4E7C187B9703460941856,...,,DED078288677E4E8A15874405E79C26AC78747B0,monthly,False,,10.21,,8.96,,30.43


In [None]:
#verica di user_code+customer_code = PK utility
df.groupby(by=['user_code', 'customer_code']).count()

#PK customer = user_code

#PK ivoice = bill_id


## Load big Dataset

In [None]:
# STUDY HOW TO DEAL WITH DATA THAT DON'T FIT IN MEMORY (!)
# Bella storia
!du -sh '/data/invoices_sample.csv'

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
from dask.utils import parse_bytes
import cudf
import dask_cudf

In [None]:
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES="0",
    rmm_pool_size=parse_bytes("15GB"), # This GPU has 16GB of memory
    device_memory_limit=parse_bytes("10GB"),
)
client = Client(cluster)
client

In [None]:
%%time
ddf = dask_cudf.read_csv("/data/invoices.csv", chunksize="1 GiB")
print(f"DF partitions: {ddf.npartitions}")
print(f"DF: {ddf}")

# Funzioni utili

In [10]:
def col_type(df, find=['numeric']):
    """
    Return a list containing all columns of the specified type
    df : dataframe
    type : list or single value with the specified value/s
    """
    tipo = []
    if type(find) == list:
        tipo.extend(find)
    else:
        tipo.append(find)
    
    col = set() #no duplicate column name
    
    for i in tipo:
        if i == 'numeric':
            col.update(df.select_dtypes(include=np.number).columns)
        elif i == 'float':
            col.update(df.select_dtypes(include=float).columns)
        elif i == 'int':
            col.update(df.select_dtypes(include=int).columns)
        elif i == 'date':
            col.update(df.select_dtypes(include=np.datetime64).columns)
        elif i == 'string':
            col.update(df.select_dtypes(include=object).columns)
            
    return list(col)

In [11]:
#search by pattern
def search_by_pattern(pattern):
    """
    Return a list of cloumns containing the desired pattern
    """
    import re, itertools
    pattern_list = df.columns.str.contains(re.compile(pattern)).tolist()
    #print(gas_list)
    #df.loc[:, itertools.compress(df.columns.tolist(), gas_list)].drop('gas_offer', axis=1)
    return list(itertools.compress(df.columns.tolist(), pattern_list))

In [12]:
# Dovrebbe accorgersi di colonne con similarità > soglia
# TODO --> FUNZIONE DA RIFARE PERCHé USA I SET E QUINDI NON CONFRONTA RIGA-RIGA 
#          MA SET(ELEMENTI_COL1) CON SET(eLEMENTI_COL2)
def get_duplicate_col(df, soglia=0.9):
        """
        Return a list of duplicate columns, if exists.
        Duplicate columns are those which have same values for each row.
        """  
        cols = df.columns.values
        #return [(cols[i], cols[j]) for i in range(0, len(cols)) for j in range(i+1, len(cols)) if self.df[cols[i]].equals(self.df[cols[j]])]
        
        return [(cols[i], cols[j]) for i in range(0, len(cols)) for j in range(i+1, len(cols)) if len(set(cols[i]) & set(cols[j])) / float(len(set(cols[i]) | set(cols[j]))) > soglia ]

In [None]:
df['gas_average_cost'].count()

In [123]:
def get_duplicate_col_V2(df, soglia=0.9, nan=False):
    """
    General versione of get_duplicate_columns
    nan : if True the divisor doesn't consider nan value
    """
    cols = df
    duplicate_col = []
        
    for i in range(cols.shape[1]):
        for j in range(i+1, cols.shape[1]):
            divisore = cols.shape[0]
            if nan == True:
                if (cols.iloc[:,i].count() > cols.iloc[:,j].count()):
                    divisore = cols.iloc[:,i].count()
                else:
                    divisore = cols.iloc[:,j].count()
            if (cols.iloc[:,i].dtype == cols.iloc[:,j].dtype) and ((cols.iloc[:,i] == cols.iloc[:,j]).sum() / float(divisore) >= soglia):
                duplicate_col.append((cols.iloc[:,i].name, cols.iloc[:,j].name))
                
    return duplicate_col

In [14]:
def get_duplicate_columnss(df):
    cols = df
    duplicate_col = []

    for i in range(cols.shape[1]):
        for j in range(i+1, cols.shape[1]):
            if cols.iloc[:,i].equals(cols.iloc[:,j]):
                duplicate_col.append((cols.iloc[:,i].name, cols.iloc[:,j].name))

    return duplicate_col

In [16]:
mesi = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo' : '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}

def convert_to_datetime(col): 
    if col.dtype == object:
        a = cudf.DataFrame()
        a[['day', 'month', 'year']] = col.str.split(expand=True)
        a['month'] = a.month.str.lower().map(mesi)
        col = cudf.to_datetime(a)
    return col

# 1. INGESTION & DISCOVERY

## Analysis

In [None]:
#tipi di dato delle colonne
base.get_columns_types()

In [6]:
import re
#df['light_amount'].str.replace(',', '.').str.extract('-?([0-9]*\.[0-9]+)') 
print(cudf.to_numeric(df['light_amount'].str.replace(',', '.'), errors='coerce').isna().sum()  )
df['light_amount'].isna().sum()

752


744

In [132]:
# Estrazione unità di misura. Unirla a average_gas_bill_cost
avg_bill_cost_um = df['average_gas_bill_cost'].str.replace(',', '.').str.split(expand=True).iloc[:,-1].value_counts().index[0]
avg_bill_cost_um

'€/smc'

In [133]:
# Rimozione unità di misura
df['average_gas_bill_cost'] = df['average_gas_bill_cost'].str.replace(',', '.').str.extract('-?([0-9]*\.[0-9]+)').astype(float)
df['average_gas_bill_cost']

0       <NA>
1       <NA>
2       <NA>
3       1.08
4       0.86
        ... 
1995    <NA>
1996    <NA>
1997    <NA>
1998    <NA>
1999    <NA>
Name: average_gas_bill_cost, Length: 2000, dtype: float64

In [30]:
# Trovare le tipologie di colonne a seconda della tipologia
int_cols = col_type(df, find=['int'])
string_cols = col_type(df, find=['string']) #object
float_cols = col_type(df, find=['float'])
numeric_cols = col_type(df, find=['numeric'])
date_cols = col_type(df, find=['date'])

In [None]:
numeric_cols

In [34]:
df['bill_type'].value_counts()

False    2000
Name: bill_type, dtype: int32

In [None]:
#esempio
base.locate_null_values('address')

## Locate Missing Values

In [50]:
# LOCATE MISSING VALUES
rows = df.shape[0]
#wait(ddf.shape[0])
print("Total rows", rows)

# These are the colums with at least one missing value
column_count_nan = []

for c in columns:
    
    if (not base.locate_null_values(c).empty):
        count = base.locate_null_values(c).shape[0]
        #print(c, "\t Affected rows:", count)
        p = {
            'column_name' : c,
            'nr_nan': count
        }
    else:
        #print(c, "\t Qui no missing values")
        p = {
            'column_name' : c,
            'nr_nan': 0
        }
    column_count_nan.append(p)

Total rows 2000


In [66]:
column_with_nan = []

perc = 100
soglia = (perc * df.shape[0]) / 100
print(soglia)

for c in column_count_nan:
    if c['nr_nan'] > soglia:
        #print(c['column_name'])
        column_with_nan.append(c['column_name'])
        
column_with_nan

# Si può notare come nessuna colonna abbia il 100% di valori nulli, ma ci sono altresì tante 
# colonne con un numero elevato di nan. Poche colonne hanno un numero basso di nan.

2000.0


[]

In [67]:
#Visto che address e nominative hanno lo stesso numero di NA, vale la 
#pena controllare ed eventualmente eliminare le righe?

base.locate_null_values('nominative')

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
1611,1611,23311.0,2219.0,467.0,VIETRI DI POTENZA,,,P,0,8CE314D06FEFFD54C0F1497C20C479F376102747,...,,F631ED0EF8EC71428366F15C369AA53792A6F20D,monthly,False,,1176.31,,481.81,,2067.89
1627,1627,210.0,149.0,226.0,VIETRI DI POTENZA,,,P,45,A1E7C992D46A2D6D911C3595DF603D3F2C6BEB37,...,,1E9BC1B19300B6E472980894714A4C8382C565EA,monthly,False,,76.08,,50.45,,85.57
1639,1639,95.0,52.0,105.0,VIETRI DI POTENZA,,,P,0,3CE203296BE19C1257020DA7EFEB84A0D3F4F7B7,...,,CC5C0DB0AA0034008B3398775B1EC89FC21A9DDB,monthly,False,,40.8,,29.47,,45.65
1858,1858,65.0,34.0,60.0,VIETRI DI POTENZA,,,P,66,D3117D65827932EB6A7F54F73EB1700C0D9F289C,...,,EB24C1C62642D39E65FB5BA50044153D2BF6EEC9,monthly,False,,44.17,,36.34,,47.11


In [None]:
base.locate_null_values('address')

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
1611,1611,23311.0,2219.0,467.0,VIETRI DI POTENZA,,,P,0,8CE314D06FEFFD54C0F1497C20C479F376102747,...,,F631ED0EF8EC71428366F15C369AA53792A6F20D,monthly,False,,1176.31,,481.81,,2067.89
1627,1627,210.0,149.0,226.0,VIETRI DI POTENZA,,,P,45,A1E7C992D46A2D6D911C3595DF603D3F2C6BEB37,...,,1E9BC1B19300B6E472980894714A4C8382C565EA,monthly,False,,76.08,,50.45,,85.57
1639,1639,95.0,52.0,105.0,VIETRI DI POTENZA,,,P,0,3CE203296BE19C1257020DA7EFEB84A0D3F4F7B7,...,,CC5C0DB0AA0034008B3398775B1EC89FC21A9DDB,monthly,False,,40.8,,29.47,,45.65
1858,1858,65.0,34.0,60.0,VIETRI DI POTENZA,,,P,66,D3117D65827932EB6A7F54F73EB1700C0D9F289C,...,,EB24C1C62642D39E65FB5BA50044153D2BF6EEC9,monthly,False,,44.17,,36.34,,47.11


In [89]:
df[df['city'] == 'VIETRI DI POTENZA']

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
1611,1611,23311.0,2219.0,467.0,VIETRI DI POTENZA,,,P,0,8CE314D06FEFFD54C0F1497C20C479F376102747,...,,F631ED0EF8EC71428366F15C369AA53792A6F20D,monthly,False,,1176.31,,481.81,,2067.89
1627,1627,210.0,149.0,226.0,VIETRI DI POTENZA,,,P,45,A1E7C992D46A2D6D911C3595DF603D3F2C6BEB37,...,,1E9BC1B19300B6E472980894714A4C8382C565EA,monthly,False,,76.08,,50.45,,85.57
1639,1639,95.0,52.0,105.0,VIETRI DI POTENZA,,,P,0,3CE203296BE19C1257020DA7EFEB84A0D3F4F7B7,...,,CC5C0DB0AA0034008B3398775B1EC89FC21A9DDB,monthly,False,,40.8,,29.47,,45.65
1858,1858,65.0,34.0,60.0,VIETRI DI POTENZA,,,P,66,D3117D65827932EB6A7F54F73EB1700C0D9F289C,...,,EB24C1C62642D39E65FB5BA50044153D2BF6EEC9,monthly,False,,44.17,,36.34,,47.11


In [None]:
# PROVO A VEDERE SE SONO PRESENTI ALTRI RECORDS DEGLI UTENTI A CUI MANCANO NOMINATIVE E ADDRESS
v = df['user_code'].value_counts()
v.index[v.gt(1)]

StringIndex(['3C9F70DE0664CBFE7B11F547A2A6B63BA05888D2'
 '17CB3BA9FE2A2B515718DA60309B9B37A37D0442'
 '00106F4E6FEEFB8C81BADE0E7E985D85DFA3B9D9'
 '00C7D7B84EF973F1D48AFAD0DDE706A611F0D60F'
 '05ED97BCA5EB36FB909C6F37E3D62FB4AC897A1B'
 '0E3AB3C811F1849F661F3E673F11633F9CEA9C30'
 '20E9745680DF187F6A42D25B339CD2A9AC4C9CF1'
 '3D8C8FAE543F4F8DA352F5D629287B14A19793FF'
 '67D7167747A9300607A82FF48C605C7DF34B1293'
 '6F92ACB7982545A6A0B78E36EABDFF7342AAFEC5'
 '7033149E8D91EA1C227D5C8A2512C4835F4D4BF1'
 '76CCEAE2291D3465A0760FF4F310AA7FB3303E27'
 '7C560D5D7B0FB5A7E872DC16F0577082A6682642'
 '80C4DBC8FACBA3F46D1C679939CEFEB9ADFA33B1'
 '948667A12DC803CD8C28CE665F1F44C2B94E02FF'
 'A79F6889906F9B805D96630CCE22828BACE87307'
 'B0BFC479591E9A72E67D5C1946799A006296BC7D'
 'B44D4954D2A583E874A621067A7FEE43C97525B3'
 'CBF9728EE95EBD7AE46ECC84C17AE406F856EC7C'
 'CD2E8CE8D79B2C4D440B0810C63315292D648CC8'
 'EF53E5E1DE668A7ABB88DE623D25E01FA5EA7A02'
 'FBEB4C6D7D829632C3A01129B4B23F335C5C144F'], dtype='object')

In [87]:
# DATASET CON GLI user_code CHE COMPAIONO PIù DI UNA VOLTA
df[df['user_code'].isin(v.index[v.gt(1)])]

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
61,61,-30.0,20.0,26.0,VERONA,9918823F6ACABC8DBE5309F71A8219AE7C4ADE9C,BDD7B13FF222D07687318CDFEBBC1F5D0CD21692,F,65,05ED97BCA5EB36FB909C6F37E3D62FB4AC897A1B,...,,B67BEAA8E45A4A1C8633B2C21E0ADCC2EF5976BC,,False,,7.57,,5.12,,6.5
64,64,22.0,0.0,0.0,TREVENZUOLO,67C880B1F8360FFD61CA876277C27AE42CC1E3F3,A59EA7A4097DDCAA37A3FBE515C065188191456C,P,31,3C9F70DE0664CBFE7B11F547A2A6B63BA05888D2,...,,FE750C0FA1EDDAA3540E42F2CA79A721BD95B160,,False,,1.09,,0.2,,1.83
67,67,22.0,0.0,0.0,SALERANO SUL LAMBRO,EAE353697612C5290B53F13E23885B1CA90AA95F,A59EA7A4097DDCAA37A3FBE515C065188191456C,P,31,3C9F70DE0664CBFE7B11F547A2A6B63BA05888D2,...,,B1292FFCC462370CA20E6C57B2AFF3BBBE131B89,,False,,1.09,,0.2,,1.83
69,69,125.0,0.0,0.0,GAMBARANA,D3537B73DE3099FD799AF3B46EC5ECFAA5FCADF8,EF9A4AED5533DEA4DBECB5A295070C8D932A3F02,F,56,17CB3BA9FE2A2B515718DA60309B9B37A37D0442,...,,451689EC5222A8221B65AF94012B4D570329FD5D,,False,,10.56,,4.9,,17.42
70,70,52.0,0.0,0.0,CASALINO,0A70AF43915814C8E4359DA20AEE5A0611BA7044,50145DC2CA7A3E998DDA2BB037D94AB0BC570414,F,67,3D8C8FAE543F4F8DA352F5D629287B14A19793FF,...,,7E41A6D73F522BAB1DF00CDAA3C9DA7E22173E11,,False,,7.19,,4.06,,9.24
73,73,10.0,0.0,0.0,RONCHIS,61B43563ACF3ED5B154614589518E0DE79B30ECD,9A28B1DC562118B55400E209CB4FB234233AF0C7,M,49,7033149E8D91EA1C227D5C8A2512C4835F4D4BF1,...,,37888172CD8121AEB67D3C75BB02A37DABC5E022,,False,,0.42,,0.08,,1.08
76,76,66.0,0.0,0.0,SERNIO,E2493130ECA069A904C4E556ACDFA9F3E14A1822,C2B51D1771B9850A7DE6F2CDE5CE92E593D9E3E6,F,65,EF53E5E1DE668A7ABB88DE623D25E01FA5EA7A02,...,,F49C2513A1E4D1117FC5363314195C5AC81FF528,,False,,5.58,,2.59,,9.21
85,85,22.0,0.0,0.0,CASALEGGIO BOIRO,1972EF0648D14D2D0E0B639D3C1A36E6C7C5212E,A59EA7A4097DDCAA37A3FBE515C065188191456C,P,31,3C9F70DE0664CBFE7B11F547A2A6B63BA05888D2,...,,9A6631C84F3A1BB412D63B31E7A77A0C20E96313,,False,,1.09,,0.2,,1.83
90,90,27.0,20.0,26.0,PRATA D'ANSIDONIA,E8CDC8E4C1F69D76121E1A5E05A29B264FE7AE2F,BDD7B13FF222D07687318CDFEBBC1F5D0CD21692,F,65,05ED97BCA5EB36FB909C6F37E3D62FB4AC897A1B,...,,1FBF090923209A2ECB3206F6B83F5D4D4257B970,,False,,7.48,,5.12,,6.27
101,101,23.0,13.0,12.0,LONGARONE,01AD4008A90BB2BA3DDF4D4F7B99D194584E2E11,D4DFE7FCAFF7024AF255963BD0568FDDB64C8FAF,M,83,FBEB4C6D7D829632C3A01129B4B23F335C5C144F,...,,4A6BB50C97D02A3687FA185BDED51791C3448899,,False,,8.6,,5.16,,8.4


In [None]:
# Le date potrebbero fornirci informazioni sulla billing_frequency, o viceversa, come lo stabiliamo?
# Cosa indica la colonna dell'emission_date??

df.loc[0, 'billing_frequency']

## Locate Outliers

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
from dask.utils import parse_bytes
import cudf
import dask_cudf

In [None]:
from dcudf import DCUDF

#ddf = DCUDF("/data/invoices.csv", 0.1, True)
ddf = DCUDF()

In [None]:
ddf.cluster

In [None]:
ddf.client

In [None]:
ddf.df

In [None]:
ddf.get_columns_types()

In [None]:
tmp = ddf.locate_outliers('F1_kWh')
tmp.compute()

In [None]:
type = ddf.df.dtypes
num = type[type!='object'][type!='bool']
num = num.index.to_list()
num

In [None]:
tmp = ddf.locate_outliers("F1_kWh")
tmp
# a lot of task per partition

In [None]:
tmp = tmp.persist()
tmp
# no more than 1 task per partition

In [None]:
tmp.shape

In [None]:
tmp = ddf.locate_outliers("F1_kWh")
tmp = tmp.compute()
tmp.shape

In [None]:
from tqdm import tqdm

for col in tqdm(num):
    tmp = ddf.locate_outliers(col)
    tmp = tmp.compute()
    
    print(f"{col} has {tmp.shape[0]} outliers")

## Sort Data

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
from dask.utils import parse_bytes
import cudf
import dask_cudf

from dcudf import DCUDF

#ddf = DCUDF("/data/invoices.csv", 0.1, True)
ddf = DCUDF()

In [None]:
ddf.sort("F1_kWh").compute()
# not working

In [None]:
tmp = ddf.sort_index("F1_kWh").compute()
# not working

In [None]:
idx = tmp.index.to_arrow().to_pylist()
# too mutch long list

In [None]:
tmp.merge(ddf.df, on=["F1_kWh"], how="left")

# 2. VALIDATION

In [None]:
''' 
+++++ DATA VALIDATION +++++
    • Check data range
    • Check column uniqueness
    • Find data-mismatched data types
'''

## Check Data Range

### Age nulla per PIVA

In [None]:
# Controllo se chi ha P ha età nulla
df.loc[(df['sex']=='P') & (df['age']>=0), 'age'] = np.nan
df.loc[(df['sex']=='P') & (np.isnan(df['age']))]

### Age >= 18

In [None]:
# Age >= 18
# Age non ha valori nulli (già controllato in precedenza IN TEORIA)
#
df['age'].isna().sum()

In [None]:
# Che fare con i minorenni? Li mettiamo tutti a 18 anni?
df[df['age'] < 18]

### Translation in English

In [91]:
# Dobbiamo metterli in inglese? Si può fare con un map
ing = {
    'luce': 'light',
    'gas e luce': 'gas and light',
    'gas' : 'gas',
}
df['supply_type'] = df['supply_type'].map(ing)

In [92]:
df['supply_type'].value_counts(dropna=False)

light            1116
gas               744
gas and light     140
Name: supply_type, dtype: int32

### billing_freq & bill_type Analysis

In [93]:
# Che fare qua? Billing frequency (‘monthly’, ‘quarterly’, etc.)
# Si potrebbero calcolare
df['billing_frequency'].value_counts(dropna=False)

<NA>        1569
monthly      422
bimester       9
Name: billing_frequency, dtype: int32

In [94]:
# Kind of invoice (False means a “standard bill”)
df['bill_type'].value_counts(dropna=False)

False    2000
Name: bill_type, dtype: int32

### light_offer_type Analysis

In [97]:
# Kind of plan for the electricity (‘single zone’, ‘bizone’, etc.)
df['light_offer_type'].value_counts(dropna=False)

light single zone    1079
light                 744
light multi zones     103
ligth bizone           43
light                  31
Name: light_offer_type, dtype: int32

In [111]:
# I 2 valori di light mi puzzano, famo uno strip?
df['light_offer_type'] = df['light_offer_type'].str.strip()
df['light_offer_type'].value_counts(dropna=False)
# GHAVEVO RAGIONE Vè

light single zone    1079
light                 775
light multi zones     103
ligth bizone           43
Name: light_offer_type, dtype: int32

### Applico la funzione di strip a tutte le colonne con stringhe

In [110]:
# STRIPPATINA GENERALE (alle colonne con stringhe)
df = base.strip(string_cols, ' ')

## Check Column Uniqueness

In [None]:
# The story changes here, we got these pairs as candidates as duplicate columns 
# (those which have same values for each row.)

# ATTENZIONE!! Probabile che le date del gas siano invertite, trovare un modo per confermarlo

In [None]:
#provo a concatenare lo stesso dataset più volte solo per vedere come regge lo scaling di dati
big_df = cudf.concat([df for _ in range(100)], sort=False) #200k con la paglia

In [113]:
base.get_duplicate_columnss()
# Ne vengono individuate solo 5 di coppie 100% fedeli, 
# ma cambia la questione se abbassiamo la soglia? Ci sono degli errori sopra perchè 
# quelle 100% uguali non vengono riconosciute, mentre qua sì

[('date', 'light_start_date'),
 ('date', 'gas_end_date'),
 ('light_start_date', 'gas_end_date'),
 ('light_end_date', 'gas_start_date'),
 ('gas_average_cost', 'average_unit_gas_cost')]

In [125]:
# Versione con soglia. (Lentina!)
duplicate_columns = get_duplicate_col_V2(df, soglia=1, nan=True)
duplicate_columns

[('date', 'light_start_date'),
 ('date', 'gas_end_date'),
 ('light_start_date', 'gas_end_date'),
 ('light_end_date', 'gas_start_date'),
 ('gas_average_cost', 'average_unit_gas_cost')]

[]

## Find Data Mismatch

In [126]:
date_col_list = search_by_pattern('date')
for col in date_col_list:
    df[col] = convert_to_datetime(df[col])
df[date_col_list]

Unnamed: 0,date,light_start_date,light_end_date,emission_date,gas_start_date,gas_end_date
0,2018-04-16,2018-04-16,2019-11-25,2020-12-31,2019-11-25,2018-04-16
1,2020-12-05,2020-12-05,2020-12-31,2020-12-31,2020-12-31,2020-12-05
2,2020-12-05,2020-12-05,2020-12-31,2020-12-31,2020-12-31,2020-12-05
3,2020-10-03,2020-10-03,2020-12-31,2020-12-31,2020-12-31,2020-10-03
4,2020-12-16,2020-12-16,2020-12-31,2020-12-31,2020-12-31,2020-12-16
...,...,...,...,...,...,...
1995,2020-12-05,2020-12-05,2021-01-12,2021-01-12,2021-01-12,2020-12-05
1996,2020-12-05,2020-12-05,2021-01-12,2021-01-12,2021-01-12,2020-12-05
1997,2020-12-05,2020-12-05,2021-01-12,2021-01-12,2021-01-12,2020-12-05
1998,2020-12-05,2020-12-05,2021-01-12,2021-01-12,2021-01-12,2020-12-05


In [None]:
df['date'] == df['light_start_date']

In [128]:
# Seguendo il GroundTruth mystico
mappone = {'bill_id': 'int64',
 'F1_kWh': 'float64',
 'F2_kWh': 'float64',
 'F3_kWh': 'float64',
 'city': 'str',
 'address': 'str',
 'nominative': 'str',
 'sex': 'str',
 'age': 'int64',
 'user_code': 'str',
 'date': 'datetime64[s]',
 'light_start_date': 'datetime64[s]',
 'light_end_date': 'datetime64[s]',
 'tv': 'float64',
 'gas_amount': 'float64',
 'gas_average_cost': 'float64',
 'light_average_cost': 'float64',
 'emission_date': 'datetime64[s]',
 'supply_type': 'str',
 'gas_start_date': 'datetime64[s]',
 'gas_end_date': 'datetime64[s]',
 'extra_fees': 'float64',
 'gas_consumption': 'float64',
 'light_consumption': 'float64',
 'gas_offer': 'float64',
 'light_offer_type': 'str',
 'light_offer': 'str',
 'howmuch_pay': 'float64',
 'total_amount': 'float64',
 'light_amount': 'float64',
 'average_unit_light_cost': 'float64',
 'average_light_bill_cost': 'float64',
 'average_unit_gas_cost': 'float64',
 'average_gas_bill_cost': 'float64',
 'customer_code': 'str',
 'billing_frequency': 'str',
 'bill_type': 'str',
 'gas_system_charges': 'float64',
 'light_system_charges': 'float64',
 'gas_material_cost': 'float64',
 'light_transport_cost': 'float64',
 'gas_transport_cost': 'float64',
 'light_material_cost': 'float64'
}

In [213]:
#Check sui tipi di dato attuali
type_dict = base.get_columns_types()
type_dict

{'bill_id': 'int64',
 'F1_kWh': 'float64',
 'F2_kWh': 'float64',
 'F3_kWh': 'float64',
 'city': 'object',
 'address': 'object',
 'nominative': 'object',
 'sex': 'object',
 'age': 'int64',
 'user_code': 'object',
 'date': 'datetime64[s]',
 'light_start_date': 'datetime64[s]',
 'light_end_date': 'datetime64[s]',
 'tv': 'float64',
 'gas_amount': 'float64',
 'gas_average_cost': 'float64',
 'light_average_cost': 'float64',
 'emission_date': 'datetime64[s]',
 'supply_type': 'object',
 'gas_start_date': 'datetime64[s]',
 'gas_end_date': 'datetime64[s]',
 'extra_fees': 'float64',
 'gas_consumption': 'float64',
 'light_consumption': 'float64',
 'gas_offer': 'float64',
 'light_offer_type': 'object',
 'light_offer': 'object',
 'howmuch_pay': 'float64',
 'total_amount': 'float64',
 'light_amount': 'float64',
 'average_unit_light_cost': 'float64',
 'average_light_bill_cost': 'float64',
 'average_unit_gas_cost': 'float64',
 'average_gas_bill_cost': 'float64',
 'customer_code': 'object',
 'billing_fr

In [130]:
#poichè probabilmente con i <NA> ho un sacco di problemi "lasciatemi castaareeee.. 
#con la chitarra in maaanoooo, lasciatemi castare.. sono un italiano"

df = base.fill_nan(np.nan)
df

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
0,0,0.0,0.0,0.0,CIPRESSA,C23B8FC235DF5479FB28D81E827A59B819BB0748,0298E5E66B5653CDAACADD156261A2A916A56C7E,F,81,63D286C61D20D76E9C9317BEB8644D5EE45134E4,...,,5AE3987CBA311BCAADC6274D28A7FB14B13B53C3,,False,-0.06,,0.06,,-0.20,
1,1,81.0,62.0,76.0,ROCCAVIVARA,CBE6B021F41D589269FBC724C131CBFC9905D950,FDAE2B1E0934BB76255A727CC9F3F00AA39982D9,M,38,8053C7982DE8A18746A8F9F0D1DC4B0C33CAC0FA,...,,5F88D510670063B20E789E6453BC2F3FB0BB67F7,,False,,9.15,,5.68,,23.13
2,2,0.0,0.0,0.0,VIDRACCO,BFAEB566B3BB78B9ABD4F86DB3A78BFD8489013B,FEBE7F29FB854C477AFA7073C70F0E6EA81C6B00,M,53,4F053D54B9F1B6FE565498E7FA726501FD8FDD18,...,,D4B8928760E729127EB4EB532C86127325FBA468,,False,1.24,,-10.13,,-7.13,
3,3,0.0,0.0,0.0,MEZZOLOMBARDO,CBD30C382C72ED4A3D9DC9D11C13C155930E7C66,9220A3E83BDCE622F70A1456C7898687FD3EDAC8,M,50,115C7D34A79C66AFCDB4EB65262595B197B0F861,...,"1,08 €/smc",BD4DEF66D7EF2D3D683D9C381D2359AD24B41D9C,,False,-0.88,,23.68,,17.66,
4,4,0.0,0.0,0.0,PINETO,4D17C711CADED6C1E9BD459088B4B80F24575FE7,642C21354EF6017D13220517E2A25D4A0442D4C5,M,65,C7734B97A212004CEA25956D8FDFEF068CD2B8F6,...,"0,86 €/smc",C0622AFD22384A4B54CD1ACAF7D988B89853A5AC,,False,14.71,,141.57,,63.59,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,26.0,25.0,33.0,CINTANO,2D244E5D90065985546AEA4D52F54C9705B8038B,102CADC9FC8C745FE3B8C27F7267E07C9691728A,M,55,6D3FDD0A2EEEE1A2C1AC23E6C316048DD2CCE859,...,,53D29ACDEB11F45D4F305A4D00E5F03C1F2CB674,monthly,False,,13.08,,7.62,,15.65
1996,1996,71.0,61.0,88.0,TREZZONE,A01243282C24F561D6B54F1C1FCB45AE80F75B0A,9C3B5B4093FC99569E5BBA918A977681A46F52DF,M,48,52315237533E1A6504E3795EAC1D5586744C62FD,...,,2B8D02A2AD117EC401BCEF3C0651FEEBB6609D1C,monthly,False,,9.20,,8.76,,25.80
1997,1997,37.0,71.0,100.0,RIPATRANSONE,469AB252FF0A2C60B32D0337C933E3565E88C0C2,1B451B6AA312EFF93372D44EC6C24C0110A628A1,F,32,2777855F19667FBFEDEAEF181E7ABC8584262127,...,,A1BFA0E4833E27AB398F4420414E037D250F0A7E,monthly,False,,8.70,,8.66,,24.14
1998,1998,74.0,78.0,92.0,MONSELICE,9E24BB26322BD95D11E26DFB80AA1168C9661FDC,F6BADD2456E283A8DCA6F3FF640724072108BAEF,F,54,8DC8820FB50005ED9CC4E7C187B9703460941856,...,,DED078288677E4E8A15874405E79C26AC78747B0,monthly,False,,10.21,,8.96,,30.43


In [212]:
# setto le colonne con i tipi di dato in mappone
df = base.cast_columns_types(mappone)

In [141]:
df['emission_date']

0      2020-12-31
1      2020-12-31
2      2020-12-31
3      2020-12-31
4      2020-12-31
          ...    
1995   2021-01-12
1996   2021-01-12
1997   2021-01-12
1998   2021-01-12
1999   2021-01-12
Name: emission_date, Length: 2000, dtype: datetime64[s]

In [196]:
# Abbiamo dei valori non castabili a float nelle colonne interessate

# Trovare le tipologie di colonne a seconda della tipologia
int_cols = col_type(df, find=['int'])
string_cols = col_type(df, find=['string']) #object
float_cols = col_type(df, find=['float'])
numeric_cols = col_type(df, find=['numeric'])
date_cols = col_type(df, find=['date'])

# Vediamo cos'ho nelle colonne str
#print(string_cols)
for c in string_cols:
    print("Col: ", c)
    cosa = df[c].str.replace(',', '.').str.split(expand=True).iloc[:,-1].value_counts().index[0]
    print(cosa)

Col:  nominative
A59EA7A4097DDCAA37A3FBE515C065188191456C
Col:  address
nan
Col:  user_code
3C9F70DE0664CBFE7B11F547A2A6B63BA05888D2
Col:  sex
M
Col:  billing_frequency
nan
Col:  city
CITERIORE
Col:  supply_type
light
Col:  extra_fees
0.00
Col:  total_amount
0.00
Col:  light_amount
nan
Col:  howmuch_pay
0.00
Col:  gas_amount
nan
Col:  tv
0.00
Col:  light_offer_type
zone
Col:  customer_code
00065CCAA76C3E17B28AB4D32D371C635D84B87A


In [193]:
# C'è della merda, ottimizzo la cosa riducendo il campo d'analisi a solo le colonne float64 di mappone
mappone_float64_cols = []

for (k, v) in mappone.items():
    if v == 'float64':
        mappone_float64_cols.append(k)

final_float_cols = [x for x in mappone_float64_cols if x not in float_cols]
mappone_float64_cols
float_cols
final_float_cols = [x for x in final_float_cols if x not in ['gas_offer']]

In [186]:
# Altro caso di colonna dal dubbio gusto
df['gas_offer']

0        4255330384700204909
1       18446744073709551615
2        1487916816871061346
3       10570449509150620332
4        1487916816871061346
                ...         
1995    18446744073709551615
1996    18446744073709551615
1997    18446744073709551615
1998    18446744073709551615
1999    18446744073709551615
Name: gas_offer, Length: 2000, dtype: uint64

In [194]:
# Vediamo cos'ho nelle colonne str
#print(string_cols)
lista_rename = []
for c in final_float_cols:
    cosa = df[c].str.replace(',', '.').str.split(expand=True).iloc[:,-1].value_counts().index[0]
    if cosa != '0.00' and cosa != 'nan':
        print("Col: ", c + ' ' + cosa)
        print(cosa)
        p = {
            c : c + ' ' + cosa
        }
        lista_rename.append(p)
        df[c] = df[c].str.replace(',', '.').str.extract('-?([0-9]*\.[0-9]+)').astype(float)
df

tv
gas_amount
extra_fees
howmuch_pay
total_amount
light_amount


Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,city,address,nominative,sex,age,user_code,...,average_gas_bill_cost,customer_code,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
0,0,0.0,0.0,0.0,CIPRESSA,C23B8FC235DF5479FB28D81E827A59B819BB0748,0298E5E66B5653CDAACADD156261A2A916A56C7E,F,81,63D286C61D20D76E9C9317BEB8644D5EE45134E4,...,,5AE3987CBA311BCAADC6274D28A7FB14B13B53C3,,False,-0.06,,0.06,,-0.20,
1,1,81.0,62.0,76.0,ROCCAVIVARA,CBE6B021F41D589269FBC724C131CBFC9905D950,FDAE2B1E0934BB76255A727CC9F3F00AA39982D9,M,38,8053C7982DE8A18746A8F9F0D1DC4B0C33CAC0FA,...,,5F88D510670063B20E789E6453BC2F3FB0BB67F7,,False,,9.15,,5.68,,23.13
2,2,0.0,0.0,0.0,VIDRACCO,BFAEB566B3BB78B9ABD4F86DB3A78BFD8489013B,FEBE7F29FB854C477AFA7073C70F0E6EA81C6B00,M,53,4F053D54B9F1B6FE565498E7FA726501FD8FDD18,...,,D4B8928760E729127EB4EB532C86127325FBA468,,False,1.24,,-10.13,,-7.13,
3,3,0.0,0.0,0.0,MEZZOLOMBARDO,CBD30C382C72ED4A3D9DC9D11C13C155930E7C66,9220A3E83BDCE622F70A1456C7898687FD3EDAC8,M,50,115C7D34A79C66AFCDB4EB65262595B197B0F861,...,1.08,BD4DEF66D7EF2D3D683D9C381D2359AD24B41D9C,,False,-0.88,,23.68,,17.66,
4,4,0.0,0.0,0.0,PINETO,4D17C711CADED6C1E9BD459088B4B80F24575FE7,642C21354EF6017D13220517E2A25D4A0442D4C5,M,65,C7734B97A212004CEA25956D8FDFEF068CD2B8F6,...,0.86,C0622AFD22384A4B54CD1ACAF7D988B89853A5AC,,False,14.71,,141.57,,63.59,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,26.0,25.0,33.0,CINTANO,2D244E5D90065985546AEA4D52F54C9705B8038B,102CADC9FC8C745FE3B8C27F7267E07C9691728A,M,55,6D3FDD0A2EEEE1A2C1AC23E6C316048DD2CCE859,...,,53D29ACDEB11F45D4F305A4D00E5F03C1F2CB674,monthly,False,,13.08,,7.62,,15.65
1996,1996,71.0,61.0,88.0,TREZZONE,A01243282C24F561D6B54F1C1FCB45AE80F75B0A,9C3B5B4093FC99569E5BBA918A977681A46F52DF,M,48,52315237533E1A6504E3795EAC1D5586744C62FD,...,,2B8D02A2AD117EC401BCEF3C0651FEEBB6609D1C,monthly,False,,9.20,,8.76,,25.80
1997,1997,37.0,71.0,100.0,RIPATRANSONE,469AB252FF0A2C60B32D0337C933E3565E88C0C2,1B451B6AA312EFF93372D44EC6C24C0110A628A1,F,32,2777855F19667FBFEDEAEF181E7ABC8584262127,...,,A1BFA0E4833E27AB398F4420414E037D250F0A7E,monthly,False,,8.70,,8.66,,24.14
1998,1998,74.0,78.0,92.0,MONSELICE,9E24BB26322BD95D11E26DFB80AA1168C9661FDC,F6BADD2456E283A8DCA6F3FF640724072108BAEF,F,54,8DC8820FB50005ED9CC4E7C187B9703460941856,...,,DED078288677E4E8A15874405E79C26AC78747B0,monthly,False,,10.21,,8.96,,30.43


In [211]:
# voglio trovare le colonne str che diventeranno float che generano problemi
# VIRGOLE DI MERDA
problematiche = [x for x in string_cols if x in final_float_cols]
for p in problematiche:
    print(p)
    df[p] = df[p].str.replace(',', '.').str.split(expand=True).iloc[:,-1].value_counts().index[0]
    print(float(df[p][0]))

extra_fees
0.0
total_amount
0.0
light_amount
nan
howmuch_pay
0.0
gas_amount
nan
tv
0.0


In [208]:
float(0.48)

0.48

In [191]:
# Ho diverse colonne da cui eliminare le unità di misura...
df['gas_average_cost'].dtype

dtype('float64')

In [None]:
# in 'bill_type' False == 'standard bill'
# Funziona solo se la funzione sopra non va in errore.

df = base.replace('bill_type', 'False', 'standard bill', False)
df

In [None]:
#Verifica della bontà della sostituzione
df['bill_type'].value_counts()

In [None]:
# Non c'è nessuna riga tutta nulla, Giulio è contento
df.dropna(how='all')

# 3. STRUCTURING

In [None]:
''' 
+++++ DATA STRUCTURING +++++
    • Change column data types
    • Delete, split or merge columns
    • Pivot and unpivot
'''

## Change Column Data Types 

In [None]:
# aggregabile con lo step precedente
# L'HO FATTO SOPRA, è DA ORDINARE E PERFEZIONARE

## Delete, Split or Merge Columns

In [None]:
# rimozione colonne duplicate 

## Pivot and Unpivot

In [None]:
# non saprei cosa farci

# 4. ENRICHMENT

In [None]:
''' 
+++++ DATA ENRICHMENT +++++
    • Calculate columns using expressions
    • Set primary key column
    • Join or append DataFrames
    • Group by and aggregate records
    • Scale column values into a certain range
    • Encode categorical data (one-hot encoding and label encoding)
'''

## Calculate Columns Using Expressions

## Set PK column

# 5. FILTERING

In [None]:
''' 
+++++ DATA FILTERING +++++
    • Sample rows
    • Select a subset of rows through a user-defined query
'''

## Sample Rows

In [None]:
df.sample(20)

In [None]:
# SELECT A SUBSET OF ROWS?!?

# 6. CLEANING

In [None]:
''' 
+++++ DATA CLEANING +++++
    • Change data format and case
    • Deduplicate data
    • Missing value imputation
    • Find and replace values
'''

In [None]:
# CHANGE DATA FORMAT AND CASE
# data format dovrebbe già essere fatto a questo punto
# case pure se vogliamo aver fatto tutto per bene

In [None]:
# DEDUPLICATE DATA
# controllo se ho più bollette uguali dello stesso cliente

In [None]:
ddf

In [None]:
# MISSING VALUE IMPUTATION
# qua? Spero di averne pochissimi nel caso

In [None]:
# FIND AND REPLACE VALUES
# ossia?

In [None]:
x = np.nan
float(x)