In [1]:
import numpy as np
import pandas as pd

In [9]:
df = pd.DataFrame({
    "Sex" : ["M","F","F","D","?"],
    "Age" : [21,30,24,290,26]
})

In [10]:
df

Unnamed: 0,Sex,Age
0,M,21
1,F,30
2,F,24
3,D,290
4,?,26


In [11]:
df["Sex"].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [12]:
df["Sex"].replace("D","F")

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [13]:
df

Unnamed: 0,Sex,Age
0,M,21
1,F,30
2,F,24
3,D,290
4,?,26


In [14]:
df["Sex"].replace({"D" : "F","N" : "M"})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [16]:
df.replace({
    "Sex" : {
        "D" : "F",
        "N" : "M"
    },
    "Age" : {
        290 : 29
    }
})

Unnamed: 0,Sex,Age
0,M,21
1,F,30
2,F,24
3,F,29
4,?,26


In [19]:
df

Unnamed: 0,Sex,Age
0,M,21
1,F,30
2,F,24
3,D,290
4,?,26


In [20]:
df[df["Age"] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [22]:
df.loc[df["Age"] > 100, "Age"] = df.loc[df["Age"] > 100, "Age"] / 10

In [23]:
df

Unnamed: 0,Sex,Age
0,M,21
1,F,30
2,F,24
3,D,29
4,?,26


# Duplicates

In [26]:
ambassadors = pd.Series([
    "France",
    "United Kingdom",
    "United Kingdom",
    "Italy",
    "Germany",
    "Germany",
    "Germany"
], index= [
    "Gerald Araud",
    "Kim Darroch",
    "Peter Westmacott",
    "Armando Varricchio",
    "Peter Wittig",
    "Peter Ammon",
    "Klaus Scharioth"
])

In [27]:
ambassadors

Gerald Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [28]:
ambassadors.duplicated()

Gerald Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [29]:
ambassadors.duplicated(keep="last")

Gerald Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [30]:
ambassadors.duplicated(keep=False)

Gerald Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [31]:
ambassadors.drop_duplicates()

Gerald Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [32]:
ambassadors.drop_duplicates(keep="last")

Gerald Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [33]:
ambassadors.drop_duplicates(keep=False)

Gerald Araud          France
Armando Varricchio     Italy
dtype: object

# Duplicates in DataFrame

In [34]:
players = pd.DataFrame({
    "Name" : [
        "Kobe Bryant",
        "LeBron James",
        "Kobe Bryant",
        "Carmelo Anthony",
        "Kobe Bryant"
    ],
    "Pos" : [
        "SG",
        "SF",
        "SG",
        "SF",
        "SF"
    ]
})

In [35]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [36]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [37]:
players.duplicated(subset=["Name"])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [38]:
players.duplicated(subset=["Name"],keep="last")

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [39]:
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [40]:
players.drop_duplicates(subset=["Name"])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [41]:
players.drop_duplicates(subset=["Name"],keep="last")

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


# Splitting Columns

In [52]:
df = pd.DataFrame({
    "Data" : [
        "1987_M_US _1",
        "1990?_M_UK_1",
        "1992_F_US_2",
        "1970?_M_   IT_1",
        "1985_F_I T_2"
    ]
})

In [53]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [54]:
df["Data"].str.split("_")

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4       [1985, F, I T, 2]
Name: Data, dtype: object

In [55]:
df["Data"].str.split("_", expand=True)


Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [56]:
df = df["Data"].str.split("_", expand=True)


In [57]:
df.columns = ["Year","Sex","Country","No Children"]

In [58]:
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [59]:
df["Year"].str.contains("\?")

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [60]:
df["Country"].str.contains("U")

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [61]:
df["Country"].str.strip()

0     US
1     UK
2     US
3     IT
4    I T
Name: Country, dtype: object

In [62]:
df["Country"].str.replace(" ","")

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [66]:
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [65]:
df["Year"].str.replace(r"(?P<year>\d{4})\?", lambda m: m.group("Year"))

ValueError: Cannot use a callable replacement when regex=False