# DataFrame

Struttura dati simile a una tabella in un database o in un foglio Excel. Ogni tabella contiene:
- Colonne identificate per nome
- Righe di dati
- Colonna indice

In [1]:
import pandas as pd

## Creazione di un DataFrame

In [None]:
# Build a DataFrame from a list of rows. No column names are supplied,
# so pandas labels the columns with the integers 0, 1, 2.
airports = pd.DataFrame(
    data=[
        ["Seattle-Tacoma", "Seattle", "USA"],
        ["Dulles", "Washington", "USA"],
        ["London Heathrow", "London", "UK"],
        ["Schiphol", "Amsterdam", "Netherlands"],
    ]
)

In [6]:
# Preview the first rows of the DataFrame (all 4 rows fit in the preview here).
airports.head()

Unnamed: 0,0,1,2
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,London Heathrow,London,UK
3,Schiphol,Amsterdam,Netherlands


## Creazione di un DataFrame e attribuzione del nome delle colonne

In [10]:
# Same rows as before, but this time every column gets an explicit name.
airports = pd.DataFrame(
    data=[
        ["Seattle-Tacoma", "Seattle", "USA"],
        ["Dulles", "Washington", "USA"],
        ["London Heathrow", "London", "UK"],
        ["Schiphol", "Amsterdam", "Netherlands"],
    ],
    columns=["Name", "City", "Country"],
)

In [11]:
# Preview the first rows; the columns are now shown with their names.
airports.head()

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,London Heathrow,London,UK
3,Schiphol,Amsterdam,Netherlands


## Accesso agli elementi di un DataFrame

In [12]:
# Show only the first 2 rows.
airports.head(2)

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA


In [13]:
# Show only the last 2 rows.
airports.tail(2)

Unnamed: 0,Name,City,Country
2,London Heathrow,London,UK
3,Schiphol,Amsterdam,Netherlands


In [15]:
# Dimensions of the table as (number of rows, number of columns).
airports.shape

(4, 3)

In [17]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
airports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     4 non-null      object
 1   City     4 non-null      object
 2   Country  4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


## Estrazione di dati da un DataFrame

In [18]:
# Select the single column "City" by its name.
airports["City"]

0       Seattle
1    Washington
2        London
3     Amsterdam
Name: City, dtype: object

In [19]:
# Select several columns at once by passing a list of column names.
airports[["Name","Country"]]

Unnamed: 0,Name,Country
0,Seattle-Tacoma,USA
1,Dulles,USA
2,London Heathrow,UK
3,Schiphol,Netherlands


In [20]:
# iloc selects by integer position as [row, column]: row 0, column 0.
airports.iloc[0,0]

'Seattle-Tacoma'

In [21]:
# Value at row position 2, column position 2.
airports.iloc[2,2]

'UK'

In [22]:
# A bare ":" selects everything: all rows and all columns.
airports.iloc[:,:]

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,London Heathrow,London,UK
3,Schiphol,Amsterdam,Netherlands


In [23]:
# Rows at positions 0 and 1 (the end of the slice is excluded), all columns.
airports.iloc[0:2,:]

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA


In [24]:
# All rows, columns at positions 0 and 1 (the end of the slice is excluded).
airports.iloc[:,0:2]  # equivalent here to airports.iloc[:,0:-1], only because the frame has exactly 3 columns

Unnamed: 0,Name,City
0,Seattle-Tacoma,Seattle
1,Dulles,Washington
2,London Heathrow,London
3,Schiphol,Amsterdam


In [25]:
# All rows, columns at positions 0 and 2 (positions given as a list).
airports.iloc[:,[0,2]]

Unnamed: 0,Name,Country
0,Seattle-Tacoma,USA
1,Dulles,USA
2,London Heathrow,UK
3,Schiphol,Netherlands


In [26]:
# loc selects by label instead of position: all rows, columns "Name" and "Country".
airports.loc[:,["Name","Country"]]

Unnamed: 0,Name,Country
0,Seattle-Tacoma,USA
1,Dulles,USA
2,London Heathrow,UK
3,Schiphol,Netherlands


## Manipolazione dei file CSV con Pandas

In [None]:
file_path = "Datasets/airports.csv" # Path of the CSV file to load
# Load the CSV into a DataFrame and preview its first rows.
airports_df = pd.read_csv(file_path)
airports_df.head()

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,London Heathrow,London,United Kingdom
3,Schiphol,Amsterdam,Netherlands


In [None]:
file_path = "Datasets/airportsInvalidRows.csv" # Path of the CSV file to load (malformed rows, judging by its name)
# NOTE(review): no output is recorded for this cell. If the file has rows with extra fields,
# read_csv raises a ParserError by default; on_bad_lines="skip" (pandas >= 1.3) would skip them - confirm.
airports_df = pd.read_csv(file_path)
airports_df.head()

In [None]:
file_path = "Datasets/airportsNoHeaderRows.csv" # Path of the CSV file to load
# header=None tells read_csv not to take column names from the file,
# so the columns are labelled 0, 1, 2.
# NOTE(review): the recorded output shows "Name,City,Country" as data row 0, which suggests
# this file does contain a header line - verify the dataset.
airports_df = pd.read_csv(file_path, header=None)
airports_df.head()

Unnamed: 0,0,1,2
0,Name,City,Country
1,Seattle-Tacoma,Seattle,USA
2,Dulles,Washington,USA
3,London Heathrow,London,United Kingdom
4,Schiphol,Amsterdam,Netherlands


In [None]:
file_path = "Datasets/airportsNoHeaderRows.csv" # Path of the CSV file to load
# header=None: do not take column names from the file; names=[...] supplies them explicitly.
# NOTE(review): the recorded output shows "Name,City,Country" as data row 0, which suggests
# this file does contain a header line - verify the dataset (header=0 would replace it instead).
airports_df = pd.read_csv(file_path, header=None, names=["Name", "City", "Country"])
airports_df.head()

Unnamed: 0,Name,City,Country
0,Name,City,Country
1,Seattle-Tacoma,Seattle,USA
2,Dulles,Washington,USA
3,London Heathrow,London,United Kingdom
4,Schiphol,Amsterdam,Netherlands


In [None]:
file_path = "Datasets/airportsBlankValues.csv" # Path of the CSV file to load
# Load a file with blank fields; the recorded output shows an empty City for London Heathrow.
airports_df = pd.read_csv(file_path)
airports_df.head()

Unnamed: 0,Name,City,Country
0,Dulles,Washington,USA
1,London Heathrow,,United Kingdom


In [None]:
from pathlib import Path

# Table that will be written to disk in three different layouts.
airports = pd.DataFrame(
    [
        ["Seattle-Tacoma", "Seattle", "USA"],
        ["Dulles", "Washington", "USA"],
        ["London Heathrow", "London", "UK"],
        ["Schiphol", "Amsterdam", "Netherlands"],
    ],
    columns=["Name", "City", "Country"],
)

# to_csv does not create missing folders, so make sure the target directory exists.
Path("Datasets/MyDatasets").mkdir(parents=True, exist_ok=True)

# Default: the index is written as the first column, together with the header row.
airports.to_csv("Datasets/MyDatasets/myAirportsWithIndex.csv")
# index=False: only the data columns are written (the header row is kept).
airports.to_csv("Datasets/MyDatasets/myAirportsWithoutIndex.csv", index=False)
# header=False: the row of column names is omitted (the index is still written).
# NOTE(review): the file name says "WithHeaderRow" although no header is written - confirm the intended name.
airports.to_csv("Datasets/MyDatasets/myAirportsWithHeaderRow.csv", header=False)

## Rimozione delle Colonne di un DataFrame

In [37]:
# Start again from the full three-column table.
airports = pd.DataFrame(
    data=[
        ["Seattle-Tacoma", "Seattle", "USA"],
        ["Dulles", "Washington", "USA"],
        ["London Heathrow", "London", "UK"],
        ["Schiphol", "Amsterdam", "Netherlands"],
    ],
    columns=["Name", "City", "Country"],
)

# drop() returns a new DataFrame without the listed columns;
# `airports` itself is left untouched.
cities = airports.drop(columns=["Name"])
cities

Unnamed: 0,City,Country
0,Seattle,USA
1,Washington,USA
2,London,UK
3,Amsterdam,Netherlands


In [38]:
# Display the original table: the previous drop() did not modify it, "Name" is still present.
airports

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,London Heathrow,London,UK
3,Schiphol,Amsterdam,Netherlands


In [39]:
# With inplace=True the column is removed from `airports` itself
# instead of being returned in a new DataFrame.
airports.drop(columns=["Name"], inplace=True)
airports

Unnamed: 0,City,Country
0,Seattle,USA
1,Washington,USA
2,London,UK
3,Amsterdam,Netherlands


## Gestione dei valori mancanti

In [46]:
# NaN float kept from the original cell; the table below does not reference it.
nan_value = float("NaN")

# Two-column table with one missing value in each column, marked with pd.NA.
# NOTE(review): the "City" column actually holds airport names - confirm the intended label.
airportsWithNaN = pd.DataFrame(
    data=[
        ["Seattle-Tacoma", "USA"],
        [pd.NA, "USA"],
        ["London Heathrow", "UK"],
        ["Schiphol", pd.NA],
    ],
    columns=["City", "Country"],
)
airportsWithNaN

Unnamed: 0,City,Country
0,Seattle-Tacoma,USA
1,,USA
2,London Heathrow,UK
3,Schiphol,


In [47]:
# dropna() returns a new DataFrame without the rows that contain a missing value
# (rows 1 and 3 here); the surviving rows keep their original index labels.
airports_cleaned = airportsWithNaN.dropna()
airports_cleaned

Unnamed: 0,City,Country
0,Seattle-Tacoma,USA
2,London Heathrow,UK


In [48]:
# With inplace=True the incomplete rows are removed from `airportsWithNaN` itself.
airportsWithNaN.dropna(inplace=True)
airportsWithNaN

Unnamed: 0,City,Country
0,Seattle-Tacoma,USA
2,London Heathrow,UK


## Rimozione righe duplicate

In [49]:
# Table whose first two rows are identical, used to demonstrate duplicate handling.
# NOTE(review): the "City" column actually holds airport names - confirm the intended label.
airportsDuplicated = pd.DataFrame(
    data=[
        ["Seattle-Tacoma", "USA"],
        ["Seattle-Tacoma", "USA"],
        ["London Heathrow", "UK"],
        ["Schiphol", "Netherlands"],
    ],
    columns=["City", "Country"],
)

In [50]:
# Flag every row that repeats an earlier row: True marks the duplicate (row 1 here).
airportsDuplicated.duplicated()

0    False
1     True
2    False
3    False
dtype: bool

In [52]:
# drop_duplicates() returns a new DataFrame in which repeated rows are removed,
# keeping the first occurrence; the surviving rows keep their original index labels.
airports_cleaned = airportsDuplicated.drop_duplicates()
airports_cleaned

Unnamed: 0,City,Country
0,Seattle-Tacoma,USA
2,London Heathrow,UK
3,Schiphol,Netherlands
