In [2]:
import pandas as pd

## 1) Data collection and exploration

In [84]:
# Load the raw invoices export into a DataFrame; the parsing options
# (comma separator, dot decimal mark, UTF-8) are spelled out explicitly.
df = pd.read_csv(
    "invoices.csv",
    sep=",",
    decimal=".",
    encoding="utf-8",
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,first_name,last_name,email,product_id,qty,amount,invoice_date,address,city,stock_code,job
0,Carmen Nixon,Todd Anderson,marvinjackson@example.com,133,9,14.57,10/09/1982,283 Wendy Common,West Alexander,36239634,Logistics and distribution manager
1,Mrs. Heather Miller,Julia Moore,jeffrey84@example.net,155,5,65.48,03/10/2012,13567 Patricia Circles Apt. 751,Andreamouth,2820163,Osteopath
2,Crystal May,Philip Moody,ugoodman@example.com,151,9,24.66,23/03/1976,6389 Debbie Island Suite 470,Coxbury,27006726,Economist
3,Bobby Weber,Mark Scott,ssanchez@example.com,143,4,21.34,17/08/1986,6362 Ashley Plaza Apt. 994,Ninaland,83036521,Sports administrator
4,Kristen Welch,David David,cynthia66@example.net,168,2,83.9,11/06/1996,463 Steven Cliffs Suite 757,Isaiahview,80142652,Chief Marketing Officer


In [50]:
# Report the frame's dimensions, then its per-column summary.
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; interpolating it into an f-string printed a stray "Info: None".
df.info()

Dataset contains 10000 rows and 11 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   first_name    10000 non-null  object 
 1   last_name     10000 non-null  object 
 2   email         10000 non-null  object 
 3   product_id    10000 non-null  int64  
 4   qty           10000 non-null  int64  
 5   amount        10000 non-null  float64
 6   invoice_date  10000 non-null  object 
 7   address       10000 non-null  object 
 8   city          10000 non-null  object 
 9   stock_code    10000 non-null  int64  
 10  job           10000 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 859.5+ KB
Info: None


a) Handling the first name / last name error (both columns contain a full name)

In [None]:
def name_treatment(dataset: pd.DataFrame, options: str = "separate") -> pd.DataFrame:
    """Collapse the ``first_name`` and ``last_name`` columns into a single ``name`` column.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to process. It is only transformed when it contains BOTH a
        ``first_name`` and a ``last_name`` column; otherwise it is returned
        unchanged.
    options : str, default "separate"
        How the two columns are merged:

        * ``"separate"``: stack the two columns, so every input row yields two
          output rows (one per name). All ``first_name`` values come first,
          followed by all ``last_name`` values. ``name`` is the first column,
          followed by the remaining columns in sorted order.
        * ``"first"``: drop ``last_name`` and rename ``first_name`` to ``name``.
        * ``"last"``: drop ``first_name`` and rename ``last_name`` to ``name``.

    Returns
    -------
    pd.DataFrame
        A new frame with a ``name`` column, or ``dataset`` itself when the
        required columns are missing or ``options`` is not recognised (in the
        latter case a message is also printed).
    """
    name_columns = ["first_name", "last_name"]

    # Both columns must be present. (`"first_name" and "last_name" in dataset`
    # only tested "last_name", because a non-empty string is always truthy.)
    if not set(name_columns).issubset(dataset.columns):
        return dataset

    if options == "separate":
        # Index.difference returns the remaining columns sorted.
        other_columns = dataset.columns.difference(name_columns).tolist()
        stacked = pd.melt(
            dataset,
            id_vars=other_columns,
            value_vars=name_columns,
            value_name="name",
        )
        # Drop melt's helper "variable" column and put "name" first.
        return stacked[["name"] + other_columns]

    if options == "first":
        # rename() returns a new frame; its result must be kept.
        return dataset.drop(columns=["last_name"]).rename(columns={"first_name": "name"})

    if options == "last":
        return dataset.drop(columns=["first_name"]).rename(columns={"last_name": "name"})

    print(f"{options} is not a correct parameters of options, please write 'separate' or 'first' or 'last")
    return dataset

In [85]:
# Merge the two name columns using the default "separate" mode, which stacks
# first_name and last_name values into a single `name` column.
df = name_treatment(df, options="separate")
# Preview the first rows of the reshaped frame.
df.head()

Unnamed: 0,name,address,amount,city,email,invoice_date,job,product_id,qty,stock_code
0,Carmen Nixon,283 Wendy Common,14.57,West Alexander,marvinjackson@example.com,10/09/1982,Logistics and distribution manager,133,9,36239634
1,Mrs. Heather Miller,13567 Patricia Circles Apt. 751,65.48,Andreamouth,jeffrey84@example.net,03/10/2012,Osteopath,155,5,2820163
2,Crystal May,6389 Debbie Island Suite 470,24.66,Coxbury,ugoodman@example.com,23/03/1976,Economist,151,9,27006726
3,Bobby Weber,6362 Ashley Plaza Apt. 994,21.34,Ninaland,ssanchez@example.com,17/08/1986,Sports administrator,143,4,83036521
4,Kristen Welch,463 Steven Cliffs Suite 757,83.9,Isaiahview,cynthia66@example.net,11/06/1996,Chief Marketing Officer,168,2,80142652
