In [36]:
import pandas as pd
import numpy as np

In [37]:
dictionary = data = {
    'Name': ['Alice', 'Bob', 'Carla', 'Bob', 'Carla', 'Alice', None],
    'Age': [25, 30, 22, 30, np.nan, 25, 29],
    'City': ['Rome', 'Milan', 'Naples', 'Milan', 'Naples', 'Rome', 'Rome']
}

df = pd.DataFrame(dictionary)

print(df)

    Name   Age    City
0  Alice  25.0    Rome
1    Bob  30.0   Milan
2  Carla  22.0  Naples
3    Bob  30.0   Milan
4  Carla   NaN  Naples
5  Alice  25.0    Rome
6   None  29.0    Rome


In [38]:
df.drop_duplicates()  # removes duplicate rows (none in this example)
print(df)

    Name   Age    City
0  Alice  25.0    Rome
1    Bob  30.0   Milan
2  Carla  22.0  Naples
3    Bob  30.0   Milan
4  Carla   NaN  Naples
5  Alice  25.0    Rome
6   None  29.0    Rome


In [39]:
df_cleaned = df.dropna()  # removes rows with at least one missing value (NaN or None)
print(df_cleaned)

    Name   Age    City
0  Alice  25.0    Rome
1    Bob  30.0   Milan
2  Carla  22.0  Naples
3    Bob  30.0   Milan
5  Alice  25.0    Rome


In [40]:
df['Age'].fillna(df['Age'].mean(), inplace=True)  # replaces NA values (not available) with another, inplace=True substitutes the original dataframe
print(df)

    Name        Age    City
0  Alice  25.000000    Rome
1    Bob  30.000000   Milan
2  Carla  22.000000  Naples
3    Bob  30.000000   Milan
4  Carla  26.833333  Naples
5  Alice  25.000000    Rome
6   None  29.000000    Rome


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)  # replaces NA values (not available) with another, inplace=True substitutes the original dataframe


In [41]:
data = {
    'date': ['2021-01-01', '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
    'city': ['Rome', 'Milan', 'Naples', 'Rome', 'Milan'],
    'product': ['mouse', 'keyboard', 'mouse', 'keyboard', 'mouse'],
    'sales': [100, 200, 150, 300, 250]
}
df = pd.DataFrame(data)

In [42]:
pivot_df = df.pivot_table(values='sales',index='product',columns='city',aggfunc='mean')

In [43]:
print(df,'\n')
print(pivot_df)

         date    city   product  sales
0  2021-01-01    Rome     mouse    100
1  2021-01-01   Milan  keyboard    200
2  2021-01-01  Naples     mouse    150
3  2021-01-02    Rome  keyboard    300
4  2021-01-02   Milan     mouse    250 

city      Milan  Naples   Rome
product                       
keyboard  200.0     NaN  300.0
mouse     250.0   150.0  100.0


In [44]:
grouped_df = df.groupby('product').sum()

In [45]:
print(grouped_df)

                                    date             city  sales
product                                                         
keyboard            2021-01-012021-01-02        MilanRome    500
mouse     2021-01-012021-01-012021-01-02  RomeNaplesMilan    500
