### Pandas Recap

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
students = pd.Series([1,2,3, 4, 5], index=['I', 'Love', 'You', 'My', 'Mom'])
students

Unnamed: 0,0
I,1
Love,2
You,3
My,4
Mom,5


In [None]:
print(type(students))

<class 'pandas.core.series.Series'>


In [None]:
print(pd.Series())

Series([], dtype: object)


### Creating Seriesing Using Numpy

In [None]:
lis = np.arange(11)
pd.Series(lis, index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

Unnamed: 0,0
1,0
2,1
3,2
4,3
5,4
6,5
7,6
8,7
9,8
10,9


### List Comprehension

In [None]:
ser = pd.Series(range(6), index = [x for x in 'asdfgk'])
ser

Unnamed: 0,0
a,0
s,1
d,2
f,3
g,4
k,5


### Dictionary

In [None]:
dic = {'Banana':42000, 'Apfel':54000, 'Pickle':12000, 'Onion':88000}
products = pd.Series(dic)
products.index.name = 'Products'
products.name = 'Price'
products

Unnamed: 0_level_0,Price
Products,Unnamed: 1_level_1
Banana,42000
Apfel,54000
Pickle,12000
Onion,88000


### Getting Exact Elements

In [None]:
ser[-3:]

Unnamed: 0,0
f,3
g,4
k,5


In [None]:
ser[2:4]

Unnamed: 0,0
d,2
f,3


In [None]:
ser[::-1]

Unnamed: 0,0
k,5
g,4
f,3
d,2
s,1
a,0


Labeling

In [None]:
ser.index.name = 'Index'
ser.name = 'Nums'
ser

Unnamed: 0_level_0,Nums
Index,Unnamed: 1_level_1
a,0
s,1
d,2
f,3
g,4
k,5


In [None]:
score = [90, 99, 88, 94, 78]
student = ['Kaizen', 'Jinhoo', 'Jinwoo', 'Zenizu', 'Bea']
result = pd.Series(score, index = student)
result.index.name = 'Students'
result.name = 'Scores'
result

Unnamed: 0_level_0,Scores
Students,Unnamed: 1_level_1
Kaizen,90
Jinhoo,99
Jinwoo,88
Zenizu,94
Bea,78


In [None]:
Best_Students = result[result > 90]
Best_Students

Unnamed: 0_level_0,Scores
Students,Unnamed: 1_level_1
Jinhoo,99
Zenizu,94


In [None]:
highest = result.loc[result == result.max()]
highest

Unnamed: 0_level_0,Scores
Students,Unnamed: 1_level_1
Jinhoo,99


In [None]:
min = result.loc[result == result.min()]
min

Unnamed: 0_level_0,Scores
Students,Unnamed: 1_level_1
Bea,78


`Index` `Value`

In [None]:
result.index

Index(['Kaizen', 'Jinhoo', 'Jinwoo', 'Zenizu', 'Bea'], dtype='object', name='Students')

In [None]:
result.values

array([90, 99, 88, 94, 78])

--------------------------------------------------------------------------------

In [None]:
avg = sum(result.values)/len(result.index)
print(f'Average Score of the students is {avg}!\n{result}')

Average Score of the students is 89.8!
Students
Kaizen    90
Jinhoo    99
Jinwoo    88
Zenizu    94
Bea       78
Name: Scores, dtype: int64


--------------------------------------------------------------------------------

### Practice Series

- Find the most frequent value in a Series.

- Count how many unique values are in a Series.

- Replace all occurrences of a specific value (like -1) with NaN.

- Apply a custom function (e.g., square or double the values) to every item in the Series.

- Filter the Series to keep only values greater than a given threshold.

- Calculate the mean, median, and standard deviation of a Series.

- Sort the Series in descending order.

- Check which elements are missing (NaN) in a Series.

- Create a Series from a dictionary and print its index and values.

- Use .where() to mask all values below a certain number with NaN.

In [None]:
s = pd.Series([1, 2, 2, 3, 3, 3])
most_common = s.value_counts().idxmax()
most_common

Unnamed: 0,count
3,3
2,2
1,1


In [None]:
new = pd.Series([1, 22, 3, 4, 6, 8, 8, 9, 9])
res = new.unique()
res

array([ 1, 22,  3,  4,  6,  8,  9])

In [None]:
con = new.value_counts()
con[con == 1]

Unnamed: 0,count
1,1
3,1
22,1
6,1
4,1


Replace with nan

In [None]:
four = pd.Series([-1, 2, 3, 4, 5, -1, 3, -1, 5, 6, -1, 6 , 3, 5, -1])
four1 = four.replace(-1, np.nan)
four1

Unnamed: 0,0
0,
1,2.0
2,3.0
3,4.0
4,5.0
5,
6,3.0
7,
8,5.0
9,6.0


Double the Values

In [None]:
dub = pd.Series([1, 2, 3, 4, 5, 6, 7])
sq = dub ** 2
sq

Unnamed: 0,0
0,1
1,4
2,9
3,16
4,25
5,36
6,49


`.mean()`, `.median()`, `.std()`, `.min()`, `.max()`

In [None]:
dub.loc[dub == dub.mean()]

Unnamed: 0,0
3,4


In [None]:
dub.loc[dub == dub.median()]

Unnamed: 0,0
3,4


In [None]:
dub.loc[dub == dub.std()]

Unnamed: 0,0


In [None]:
dub.loc[dub == dub.max()]

Unnamed: 0,0
6,7


In [None]:
dub.loc[dub == dub.min()]

Unnamed: 0,0
0,1


Sort

In [None]:
dub.sort_values(ascending = False)

Unnamed: 0,0
6,7
5,6
4,5
3,4
2,3
1,2
0,1


In [None]:
dub.sort_values(ascending = True)

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7


`Where`

In [None]:
products.where(products == products.max())

Unnamed: 0_level_0,Price
Products,Unnamed: 1_level_1
Banana,
Apfel,
Pickle,
Onion,88000.0


In [None]:
dub.where(dub == dub.max())

Unnamed: 0,0
0,
1,
2,
3,
4,
5,
6,7.0


In [None]:
import pandas as pd

calories = {"day1": 420, "day2": 380, "day3": 390}

myvar = pd.Series(calories, index = ["day1", "day2"])

print(myvar)

day1    420
day2    380
dtype: int64




---



---



### Intro to Data Frame

`DataFrame`  -  is a complete table.
- Has the same indexes
- Consisting of several Serieses

--------------------------------------------------------------------------------

Data Frame using Dictionary

In [None]:
data = {
    'Name':['Kaizen', 'Jinwoo', 'Jinho', 'Yuka', 'Mitsuya'],
    'Age':[21, 23, 24, 19, 22],
    'Grade':[1, 3, 2, 4, 1],
    'DateBirth':[2001, 2002, 2004, 2008, 1999]
}
students = pd.DataFrame(data)
students

Unnamed: 0,Name,Age,Grade,DateBirth
0,Kaizen,21,1,2001
1,Jinwoo,23,3,2002
2,Jinho,24,2,2004
3,Yuka,19,4,2008
4,Mitsuya,22,1,1999


`keys` are names of the columns and the values are the rows

--------------------------------------------------------------------------------

In [None]:
data2 = [
    {'name':'Samina', 'age':15},
    {'name':'Kaizen', 'age':17}
]
couple = pd.DataFrame(data2, index = [1, 2])
couple

Unnamed: 0,name,age
1,Samina,15
2,Kaizen,17




---



Functions that helps to understand a DataFrame Better

`.head()`, `.info()`, `.describe()`

In [None]:
students.head(2)

Unnamed: 0,Name,Age,Grade,DateBirth
0,Kaizen,21,1,2001
1,Jinwoo,23,3,2002


In [None]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       5 non-null      object
 1   Age        5 non-null      int64 
 2   Grade      5 non-null      int64 
 3   DateBirth  5 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 292.0+ bytes


In [None]:
m = students.describe()
m

Unnamed: 0,Age,Grade,DateBirth
count,5.0,5.0,5.0
mean,21.8,2.2,2002.8
std,1.923538,1.30384,3.420526
min,19.0,1.0,1999.0
25%,21.0,1.0,2001.0
50%,22.0,2.0,2002.0
75%,23.0,3.0,2004.0
max,24.0,4.0,2008.0


In [None]:
max_age = m.iloc[7, 0]
age = max_age.astype(np.int8)
print(f'Max age in the list of students is {age}')

Max age in the list of students is 24


### DF Practice

In [None]:
temp = [12, 25, 35, 41, 15]
ind = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag']

days = pd.Series(temp, index = ind)
days.index.name = 'Wochentaglied'
days.name = 'Temperatur'
des = days.describe()

In [None]:
print(days.loc[days.max() == days])

Wochentaglied
Donnerstag    41
Name: Temperatur, dtype: int64


In [None]:
print(f'Temperatur fur die nachste Woche! \n{days}\n-----------------')
print(f'\nEine kurze Informazionen von dieses Woche{days.info()}\n-----------------')
print(f'\nDann, kurze Analyse\n{des}')

Temperatur fur die nachste Woche! 
Wochentaglied
Montag        12
Dienstag      25
Mittwoch      35
Donnerstag    41
Freitag       15
Name: Temperatur, dtype: int64
-----------------
<class 'pandas.core.series.Series'>
Index: 5 entries, Montag to Freitag
Series name: Temperatur
Non-Null Count  Dtype
--------------  -----
5 non-null      int64
dtypes: int64(1)
memory usage: 252.0+ bytes

Eine kurze Informazionen von dieses WocheNone
-----------------

Dann, kurze Analyse
count     5.000000
mean     25.600000
std      12.481987
min      12.000000
25%      15.000000
50%      25.000000
75%      35.000000
max      41.000000
Name: Temperatur, dtype: float64


`Temp` - is a list containing temperature over the last five days

`ind(index)` - is also a list with names of the the days except weekend days

`Days` - is a variable keepping Series dtype



---



In [None]:
stud = {
    'name':['Yuka', 'Levi', 'Gojo', 'Eren'],
    'score':[90, 87, 89, 97],
    'sex':['F', 'T', 'T', 'T']
}
dfs = pd.DataFrame(stud, index = [1, 4, 2, 3])
dfs.index.name = 'Grade'
dec = dfs.describe()

In [None]:
dfs['passed'] = dfs['score'].apply(lambda x: 'Yeas' if x > 90 else 'No')
dfs

Unnamed: 0_level_0,name,score,sex,passed
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Yuka,90,F,No
4,Levi,87,T,No
2,Gojo,89,T,No
3,Eren,97,T,Yeas


In [None]:
dfs['passed'] = dfs['score'] > 90
dfs

Unnamed: 0_level_0,name,score,sex,passed
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Yuka,90,F,False
4,Levi,87,T,False
2,Gojo,89,T,False
3,Eren,97,T,True


In [None]:
dfs[dfs['score'] > 80]

Unnamed: 0_level_0,name,score,sex
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Yuka,90,F
4,Levi,87,T
2,Gojo,89,T
3,Eren,97,T


`Stud` - is a dict with keys name, score and sex which will act in df as a names of the columns

`dfs(DataFrameStudents)` - variable getting to inself stud and indexes that work as students grades

In [None]:
print(f'The best students from differend grads\n{dfs}\n----------------------------------')
print(f'Short info about studets table{dfs.info()}\n----------------------------------')
print(f'Short analysis related to their scores\n{dec}\n----------------------------------')

The best students from differend grads
       name  score sex
Grade                 
1      Yuka     90   F
4      Levi     87   T
2      Gojo     89   T
3      Eren     97   T
----------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 1 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   score   4 non-null      int64 
 2   sex     4 non-null      object
dtypes: int64(1), object(2)
memory usage: 128.0+ bytes
Short info about studets tableNone
----------------------------------
Short analysis related to their scores
           score
count   4.000000
mean   90.750000
std     4.349329
min    87.000000
25%    88.500000
50%    89.500000
75%    91.750000
max    97.000000
----------------------------------




---



In [None]:
sch = {
    'Tasks':['Wake up', 'Streching', 'Breakfast', 'Mathematics', 'Najot'],
    'Time':['7:30', '8:00', '8:30', '15:30', '21:00'],
    'Importance':['Medium', 'High', 'Medium', 'High', 'High']
}

my_sch = pd.DataFrame(sch)
sch_des = my_sch.describe()

In [None]:
print(f'My weekday schedule\n{my_sch}\n----------------------------------')
print(f'Short info about my sch{my_sch.info()}\n----------------------------------')
print(f'Arifmetics\n{sch_des}\n----------------------------------')

My weekday schedule
         Tasks   Time Importance
0      Wake up   7:30     Medium
1    Streching   8:00       High
2    Breakfast   8:30     Medium
3  Mathematics  15:30       High
4        Najot  21:00       High
----------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Tasks       5 non-null      object
 1   Time        5 non-null      object
 2   Importance  5 non-null      object
dtypes: object(3)
memory usage: 252.0+ bytes
Short info about my schNone
----------------------------------
Arifmetics
          Tasks  Time Importance
count         5     5          5
unique        5     5          2
top     Wake up  7:30       High
freq          1     1          3
----------------------------------




---



### Bracket   and   Dot

In [None]:
dfs['name']   # bracket

Unnamed: 0_level_0,name
Grade,Unnamed: 1_level_1
1,Yuka
4,Levi
2,Gojo
3,Eren


In [None]:
dfs.name   #Dot notation

Unnamed: 0_level_0,name
Grade,Unnamed: 1_level_1
1,Yuka
4,Levi
2,Gojo
3,Eren


Dot notaion is fatster, but bracket is more reliable(even there are extra spaces)

In [None]:
print(dfs[['name']])        # dtype is a Data Frame
print(type(dfs[['name']]))

       name
Grade      
1      Yuka
4      Levi
2      Gojo
3      Eren
<class 'pandas.core.frame.DataFrame'>


In [None]:
print(dfs['name'])          # dtype is a Series
print(type(dfs['name']))

Grade
1    Yuka
4    Levi
2    Gojo
3    Eren
Name: name, dtype: object
<class 'pandas.core.series.Series'>




---



In [None]:
data = {
    'Customer ID': ['C123', 'C234', 'C345', 'C456', 'C567'],
    'Name': ['John Doe', 'Petra Müller', 'Ali Khan', 'Maria Gonzalez', 'David Lee'],
    'Country': ['United States', 'Germany', 'Pakistan', 'Mexico', 'China'],
    'Region': ['North America', 'Europe', 'Asia', 'North America', 'Asia'],
    'Age': [67, 51, 19, 26, 40]
}

df = pd.DataFrame(data)
df.set_index('Customer ID', inplace=True)
df

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C123,John Doe,United States,North America,67
C234,Petra Müller,Germany,Europe,51
C345,Ali Khan,Pakistan,Asia,19
C456,Maria Gonzalez,Mexico,North America,26
C567,David Lee,China,Asia,40


### Selecting a Row

`.loc`['`row name`', '`column name`']

One row

In [None]:
df.loc['C345']

Unnamed: 0,C345
Name,Ali Khan
Country,Pakistan
Region,Asia
Age,19


Several Rows

In [None]:
df.loc['C345':'C456']

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C345,Ali Khan,Pakistan,Asia,19
C456,Maria Gonzalez,Mexico,North America,26


Slice

In [None]:
df.loc['C123':'C456']

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C123,John Doe,United States,North America,67
C234,Petra Müller,Germany,Europe,51
C345,Ali Khan,Pakistan,Asia,19
C456,Maria Gonzalez,Mexico,North America,26


Filltered rows

In [None]:
df.loc[df['Region'] != 'Asia']

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C123,John Doe,United States,North America,67
C234,Petra Müller,Germany,Europe,51
C456,Maria Gonzalez,Mexico,North America,26


### Sellection a Column

In [None]:
df.loc[:, 'Name']

Unnamed: 0_level_0,Name
Customer ID,Unnamed: 1_level_1
C123,John Doe
C234,Petra Müller
C345,Ali Khan
C456,Maria Gonzalez
C567,David Lee


Several rows

In [None]:
df.loc[:, ['Name', 'Age']]

Unnamed: 0_level_0,Name,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C123,John Doe,67
C234,Petra Müller,51
C345,Ali Khan,19
C456,Maria Gonzalez,26
C567,David Lee,40


Row + Column

In [None]:
df.loc[df['Age'] > 30, 'Name':'Region']

Unnamed: 0_level_0,Name,Country,Region
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C123,John Doe,United States,North America
C234,Petra Müller,Germany,Europe
C567,David Lee,China,Asia




---



`iloc` - sellection according to the position(num)

.`iloc`['`row index`', '`column index`']

In [None]:
df.iloc[0]

Unnamed: 0,C123
Name,John Doe
Country,United States
Region,North America
Age,67


Several rows

In [None]:
df.iloc[[0, 3, 4]]

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C123,John Doe,United States,North America,67
C456,Maria Gonzalez,Mexico,North America,26
C567,David Lee,China,Asia,40


Slice

In [None]:
df.iloc[1:4]

Unnamed: 0_level_0,Name,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C234,Petra Müller,Germany,Europe,51
C345,Ali Khan,Pakistan,Asia,19
C456,Maria Gonzalez,Mexico,North America,26


Exact colimn

In [None]:
df.iloc[:, 2]

Unnamed: 0_level_0,Region
Customer ID,Unnamed: 1_level_1
C123,North America
C234,Europe
C345,Asia
C456,North America
C567,Asia


Several Rows

In [None]:
df.iloc[:, [0, 3]]

Unnamed: 0_level_0,Name,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C123,John Doe,67
C234,Petra Müller,51
C345,Ali Khan,19
C456,Maria Gonzalez,26
C567,David Lee,40


Slice between columns

In [None]:
df.iloc[:, 1:3]

Unnamed: 0_level_0,Country,Region
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C123,United States,North America
C234,Germany,Europe
C345,Pakistan,Asia
C456,Mexico,North America
C567,China,Asia


Sellection of the Rows and Columns

In [None]:
df.iloc[[0, 1, 4], 1:4]

Unnamed: 0_level_0,Country,Region,Age
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C123,United States,North America,67
C234,Germany,Europe,51
C567,China,Asia,40




---



### Changin the Name of the columns

In [None]:
renamed = df.rename(columns = {'A':'Alpha', 'B':'Beta'})




---



### Practice `loc`, `iloc`, `MathOp`

In [None]:
import pandas as pd

In [None]:
lis = {
    'Day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'Steps':[5234, 6890, 7321, 8102, 6450, 10200, 3000],
    'Calories':[2100, 2300, 2500, 2600, 2400, 2800, 2000],
    'Hours_Slept':[6.5, 7.0, 5.5, 8.0, 7.5, 9.0, 4.0],
    'Heart_Rate':[78, 75, 82, 70, 74, 72, 85]
}

ft = pd.DataFrame(lis)
ft

Unnamed: 0,Day,Steps,Calories,Hours_Slept,Heart_Rate
0,Monday,5234,2100,6.5,78
1,Tuesday,6890,2300,7.0,75
2,Wednesday,7321,2500,5.5,82
3,Thursday,8102,2600,8.0,70
4,Friday,6450,2400,7.5,74
5,Saturday,10200,2800,9.0,72
6,Sunday,3000,2000,4.0,85


### .iloc Conditions

- Select the last 2 rows and first 3 columns.

In [None]:
ft.iloc[[-1, -2], 1:4]

Unnamed: 0,Steps,Calories,Hours_Slept
6,3000,2000,4.0
5,10200,2800,9.0


- Get the values of rows 1 to 4 (inclusive) and columns 2 and 3.

In [None]:
ft.iloc[[1, 2, 3, 4], 3:5]

Unnamed: 0,Hours_Slept,Heart_Rate
1,7.0,75
2,5.5,82
3,8.0,70
4,7.5,74


- Retrieve the Heart_Rate values for the middle 3 days using index positions.

In [None]:
ft.iloc[2:5, -1]

Unnamed: 0,Heart_Rate
2,82
3,70
4,74


- Select every other row starting from the first row.

In [None]:
ft.iloc[1:]

Unnamed: 0,Day,Steps,Calories,Hours_Slept,Heart_Rate
1,Tuesday,6890,2300,7.0,75
2,Wednesday,7321,2500,5.5,82
3,Thursday,8102,2600,8.0,70
4,Friday,6450,2400,7.5,74
5,Saturday,10200,2800,9.0,72
6,Sunday,3000,2000,4.0,85


### Challenging .iloc Tasks

- Select every second row starting from the second row and get columns 1 to 3.

In [None]:
ft.iloc[2::2, 1:4]

Unnamed: 0,Steps,Calories,Hours_Slept
2,7321,2500,5.5
4,6450,2400,7.5
6,3000,2000,4.0


- Reverse the entire DataFrame using .iloc (both rows and columns).

In [None]:
ft.iloc[::-1, ::-1]

Unnamed: 0,Heart_Rate,Hours_Slept,Calories,Steps,Day
6,85,4.0,2000,3000,Sunday
5,72,9.0,2800,10200,Saturday
4,74,7.5,2400,6450,Friday
3,70,8.0,2600,8102,Thursday
2,82,5.5,2500,7321,Wednesday
1,75,7.0,2300,6890,Tuesday
0,78,6.5,2100,5234,Monday


- Get the last 3 rows, but exclude the first and last columns using only .iloc.

In [None]:
ft.iloc[-3:, 1:7]

Unnamed: 0,Steps,Calories,Hours_Slept,Heart_Rate
4,6450,2400,7.5,74
5,10200,2800,9.0,72
6,3000,2000,4.0,85


- Using .iloc, replace the middle row's second column value with the average of all values in that column.

In [None]:
import numpy as np

In [None]:
mstep = np.mean(ft.iloc[:, 1])   # we get hear an average of all elements in column steps
ft.iloc[3, 1] = mstep            # replace middle element with mean
ft

  ft.iloc[3, 1] = mstep            # replace middle element with mean


Unnamed: 0,Day,Steps,Calories,Hours_Slept,Heart_Rate
0,Monday,5234.0,2100,6.5,78
1,Tuesday,6890.0,2300,7.0,75
2,Wednesday,7321.0,2500,5.5,82
3,Thursday,6742.428571,2600,8.0,70
4,Friday,6450.0,2400,7.5,74
5,Saturday,10200.0,2800,9.0,72
6,Sunday,3000.0,2000,4.0,85


In [None]:
ft.iloc[:, 1] = round(ft.iloc[:, 1])

In [None]:
ft

Unnamed: 0,Day,Steps,Calories,Hours_Slept,Heart_Rate
0,Monday,5234.0,2100,6.5,78
1,Tuesday,6890.0,2300,7.0,75
2,Wednesday,7321.0,2500,5.5,82
3,Thursday,6742.0,2600,8.0,70
4,Friday,6450.0,2400,7.5,74
5,Saturday,10200.0,2800,9.0,72
6,Sunday,3000.0,2000,4.0,85


### .loc Conditions

- Select all data for 'Wednesday' and 'Friday'.

In [None]:
lis = {
    'Day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'Steps':[5234, 6890, 7321, 8102, 6450, 10200, 3000],
    'Calories':[2100, 2300, 2500, 2600, 2400, 2800, 2000],
    'Hours_Slept':[6.5, 7.0, 5.5, 8.0, 7.5, 9.0, 4.0],
    'Heart_Rate':[78, 75, 82, 70, 74, 72, 85]
}

fd = pd.DataFrame(lis)
fd.set_index('Day', inplace = True)
fd

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,5234,2100,6.5,78
Tuesday,6890,2300,7.0,75
Wednesday,7321,2500,5.5,82
Thursday,8102,2600,8.0,70
Friday,6450,2400,7.5,74
Saturday,10200,2800,9.0,72
Sunday,3000,2000,4.0,85


In [None]:
fd.loc[['Wednesday', 'Friday'], :]

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Wednesday,7321,2500,5.5,82
Friday,6450,2400,7.5,74


- Get rows where 'Hours_Slept' is more than 7.

In [None]:
fd.loc[fd['Hours_Slept'] > 7]

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thursday,8102,2600,8.0,70
Friday,6450,2400,7.5,74
Saturday,10200,2800,9.0,72


- Select rows where 'Calories' is less than 2500 and return only 'Steps' and 'Heart_Rate'.

In [None]:
cal = fd.loc[fd['Calories'] < 2500, ['Steps', 'Heart_Rate']]
cal

Unnamed: 0_level_0,Steps,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Monday,5234,78
Tuesday,6890,75
Friday,6450,74
Sunday,3000,85


- Get the 'Day' and 'Calories' where 'Steps' are above 8000.

In [None]:
dcs = fd.loc[fd['Steps'] > 8000, ['Calories']]
dcs

Unnamed: 0_level_0,Calories
Day,Unnamed: 1_level_1
Thursday,2600
Saturday,2800


### Challenging .loc Tasks

- Select the rows where Heart_Rate is greater than the mean Heart_Rate, and show only 'Day' and 'Heart_Rate'.

In [None]:
hr = fd.loc[fd['Heart_Rate'] > fd['Heart_Rate'].mean(), 'Heart_Rate']
hr

Unnamed: 0_level_0,Heart_Rate
Day,Unnamed: 1_level_1
Monday,78
Wednesday,82
Sunday,85


In [None]:
round(fd['Heart_Rate'].mean())

77



---



- From the DataFrame, select all rows where Steps > 6000 and Calories < 2500, show only 'Day', 'Steps', and 'Calories'.

| -- OR  and & -- AND

In [None]:
step_cal = fd.loc[(fd['Steps'] > 6000) & (fd['Calories'] < 2500), ['Steps', 'Calories']]
step_cal

Unnamed: 0_level_0,Steps,Calories
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Tuesday,6890,2300
Friday,6450,2400




---



- Update all rows where Hours_Slept < 6 — set 'Heart_Rate' to 'NaN'.

In [None]:
fd.loc[fd['Hours_Slept'] < 6, 'Hours_Slept'] = 'Nan'

  fd.loc[fd['Hours_Slept'] < 6, 'Hours_Slept'] = 'Nan'


In [None]:
fd

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,5234,2100,6.5,78
Tuesday,6890,2300,7.0,75
Wednesday,7321,2500,Nan,82
Thursday,8102,2600,8.0,70
Friday,6450,2400,7.5,74
Saturday,10200,2800,9.0,72
Sunday,3000,2000,Nan,85




---



-
Using .loc, set all Steps values below the median to 0.

In [None]:
fd['Steps'].median()

6890.0

In [None]:
fd.loc[fd['Steps'] < fd['Steps'].median()] = 0

In [None]:
fd

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,0,0,0,0
Tuesday,6890,2300,7.0,75
Wednesday,7321,2500,Nan,82
Thursday,8102,2600,8.0,70
Friday,0,0,0,0
Saturday,10200,2800,9.0,72
Sunday,0,0,0,0


###  Combined (loc/iloc + Math) Conditions

- Replace 'Heart_Rate' values over 80 with the average heart rate.

In [None]:
avg = fd['Heart_Rate'].mean()
avg = round(avg)
avg

43

In [None]:
fd.loc[fd['Heart_Rate'] > 80] = avg
fd

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,0,0,0.0,0
Tuesday,6890,2300,7.0,75
Wednesday,43,43,43.0,43
Thursday,8102,2600,8.0,70
Friday,0,0,0.0,0
Saturday,10200,2800,9.0,72
Sunday,0,0,0.0,0




---



- Add a new column 'Sleep_Efficiency' = Steps / Hours_Slept.

In [None]:
lis = {
    'Day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'Steps':[5234, 6890, 7321, 8102, 6450, 10200, 3000],
    'Calories':[2100, 2300, 2500, 2600, 2400, 2800, 2000],
    'Hours_Slept':[6.5, 7.0, 5.5, 8.0, 7.5, 9.0, 4.0],
    'Heart_Rate':[78, 75, 82, 70, 74, 72, 85]
}

fs = pd.DataFrame(lis)
fs.set_index('Day', inplace = True)
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,5234,2100,6.5,78
Tuesday,6890,2300,7.0,75
Wednesday,7321,2500,5.5,82
Thursday,8102,2600,8.0,70
Friday,6450,2400,7.5,74
Saturday,10200,2800,9.0,72
Sunday,3000,2000,4.0,85


In [None]:
div = fs['Steps'] / fs['Hours_Slept']
div = round(div)
div

Unnamed: 0_level_0,0
Day,Unnamed: 1_level_1
Monday,805.0
Tuesday,984.0
Wednesday,1331.0
Thursday,1013.0
Friday,860.0
Saturday,1133.0
Sunday,750.0


In [None]:
fs['Sleep_Efficiency'] = div
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Monday,5234,2100,6.5,78,805.0
Tuesday,6890,2300,7.0,75,984.0
Wednesday,7321,2500,5.5,82,1331.0
Thursday,8102,2600,8.0,70,1013.0
Friday,6450,2400,7.5,74,860.0
Saturday,10200,2800,9.0,72,1133.0
Sunday,3000,2000,4.0,85,750.0




---



- Normalize 'Calories' by dividing all values by the max calorie value.

In [None]:
max_cal = fs['Calories'].max()
max_cal

2800

In [None]:
fs['Cal_Normalaize'] = round(fs['Calories'] / max_cal, 2)
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency,Cal_Normalaize
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Monday,5234,2100,6.5,78,805.0,0.75
Tuesday,6890,2300,7.0,75,984.0,0.82
Wednesday,7321,2500,5.5,82,1331.0,0.89
Thursday,8102,2600,8.0,70,1013.0,0.93
Friday,6450,2400,7.5,74,860.0,0.86
Saturday,10200,2800,9.0,72,1133.0,1.0
Sunday,3000,2000,4.0,85,750.0,0.71




---



- Subtract the mean 'Heart_Rate' from all 'Heart_Rate' values.

In [None]:
mean_h = round(fs['Heart_Rate'].mean())
mean_h

77

In [None]:
subs = np.subtract(fs['Heart_Rate'], mean_h)

In [None]:
fs['Heart_Rate'] = subs
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency,Cal_Normalaize
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Monday,5234,2100,6.5,1,805.0,0.75
Tuesday,6890,2300,7.0,-2,984.0,0.82
Wednesday,7321,2500,5.5,5,1331.0,0.89
Thursday,8102,2600,8.0,-7,1013.0,0.93
Friday,6450,2400,7.5,-3,860.0,0.86
Saturday,10200,2800,9.0,-5,1133.0,1.0
Sunday,3000,2000,4.0,8,750.0,0.71




---



- Add a new column 'Fatigue_Index' defined as:

In [None]:
cal_step = fs['Calories'] / fs['Steps']
h_slep = 8 - fs['Hours_Slept']
com = cal_step * h_slep
com

Unnamed: 0_level_0,0
Day,Unnamed: 1_level_1
Monday,0.601834
Tuesday,0.333817
Wednesday,0.853709
Thursday,0.0
Friday,0.186047
Saturday,-0.27451
Sunday,2.666667


In [None]:
fs['Fatigue_Index'] = round(com)

In [None]:
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency,Cal_Normalaize,Fatigue_Index,Step_Normalize
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,5234,2100,6.5,1,805.0,0.75,1.0,-1.0
Tuesday,6890,2300,7.0,-2,984.0,0.82,0.0,0.0
Wednesday,7321,2500,5.5,5,1331.0,0.89,1.0,0.0
Thursday,8102,2600,8.0,-7,1013.0,0.93,0.0,1.0
Friday,6450,2400,7.5,-3,860.0,0.86,0.0,-0.0
Saturday,10200,2800,9.0,-5,1133.0,1.0,-0.0,2.0
Sunday,3000,2000,4.0,8,750.0,0.71,3.0,-2.0




---



- Using .iloc, get all 'Heart_Rate' values in the top 3 days by 'Steps', and - return their mean.+

In [None]:
hmean = round(fs.iloc[-3:, 3].mean())

In [None]:
hmean

0



---



- Normalize the 'Steps' column using Z-score normalization:

In [None]:
StepMean = round(fs['Steps'].mean())
StepStd = round(fs['Steps'].std())

In [None]:
StepMean

6742

In [None]:
StepStd

2256

In [None]:
fs['Step_Normalize'] = round((fs['Steps'] - StepMean) / StepStd)
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency,Cal_Normalaize,Fatigue_Index,Step_Normalize
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,5234,2100,6.5,1,805.0,0.75,1.0,-1.0
Tuesday,6890,2300,7.0,-2,984.0,0.82,0.0,0.0
Wednesday,7321,2500,5.5,5,1331.0,0.89,1.0,0.0
Thursday,8102,2600,8.0,-7,1013.0,0.93,0.0,1.0
Friday,6450,2400,7.5,-3,860.0,0.86,0.0,-0.0
Saturday,10200,2800,9.0,-5,1133.0,1.0,-0.0,2.0
Sunday,3000,2000,4.0,8,750.0,0.71,3.0,-2.0




---



- Using .loc, find all days where Heart_Rate is max or min, and increase their 'Hours_Slept' by 1

In [None]:
fs.loc[fs['Heart_Rate'] == fs['Heart_Rate'].max(), 'Hours_Slept'] += 1
fs.loc[fs['Heart_Rate'] == fs['Heart_Rate'].min(), 'Hours_Slept'] += 1

In [None]:
fs

Unnamed: 0_level_0,Steps,Calories,Hours_Slept,Heart_Rate,Sleep_Efficiency,Cal_Normalaize,Fatigue_Index,Step_Normalize
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,5234,2100,6.5,1,805.0,0.75,1.0,-1.0
Tuesday,6890,2300,7.0,-2,984.0,0.82,0.0,0.0
Wednesday,7321,2500,5.5,5,1331.0,0.89,1.0,0.0
Thursday,8102,2600,9.0,-7,1013.0,0.93,0.0,1.0
Friday,6450,2400,7.5,-3,860.0,0.86,0.0,-0.0
Saturday,10200,2800,9.0,-5,1133.0,1.0,-0.0,2.0
Sunday,3000,2000,5.0,8,750.0,0.71,3.0,-2.0




---



---



### Data Cleaning | NaN

Changing column name |    `.rename()`  `.columns()`

In [None]:
import pandas as pd

certificates_earned = pd.Series(
    [8, 2, 5, 6],
    index=['Tom', 'Kris', 'Ahmad', 'Beau']
)

print(certificates_earned)

Tom      8
Kris     2
Ahmad    5
Beau     6
dtype: int64


In [None]:
certificates_earned.columns = ['Certificates', 'Time']
certificates_earned

Unnamed: 0,Certificates,Time
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


`inplace = True`  makes changes to the real DataFrame

In [None]:
certificates_earned.rename(columns = {'Certificates':'CerTif'}, inplace = True)
certificates_earned

Unnamed: 0,CerTif,Time
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


In [None]:
certificates_earned.columns = certificates_earned.columns.str.upper()
certificates_earned

Unnamed: 0,CERTIF,TIME
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


In [None]:
certificates_earned.columns = certificates_earned.columns.str.lower()
certificates_earned

Unnamed: 0,certif,time
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


In [None]:
certificates_earned.columns = certificates_earned.columns.str.title()
certificates_earned

Unnamed: 0,Certif,Time
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12




---



In [None]:
import pandas as pd
import numpy as np

cer = pd.DataFrame({
    'Certificates': [8, 2, np.nan, 6, 5],
    'Time': [np.nan, 5, np.nan, 12, 32],
    'Topic':['Database', np.nan, 'Data Analysis', 'Hacking', np.nan]
})

cer.index = ['Tom', 'Kris', 'Ahmad', 'Beau', 'Kaizen']


cer

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,,Database
Kris,2.0,5.0,
Ahmad,,,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


`.isnull` returns True if the value is `NaN`

In [None]:
cer.isnull()

Unnamed: 0,Certificates,Time,Topic
Tom,False,True,False
Kris,False,False,True
Ahmad,True,True,False
Beau,False,False,False
Kaizen,False,False,True


Amount of NaN values by each column

In [None]:
cer.isnull().sum()

Unnamed: 0,0
Certificates,1
Time,2
Topic,2


`.dropna()` drops all the rows containing `NaN`

In [None]:
LuckNan = cer.dropna(axis=0)  # axis = 1 works by column, axis = 0 works by row
LuckNan

Unnamed: 0,Certificates,Time,Topic
Beau,6.0,12.0,Hacking


`.dropna(how = 'all')`

In [None]:
new = cer.dropna(how = 'all')
new1 = cer.dropna(how = 'any')  # works by default, deltes all nans
new

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,,Database
Kris,2.0,5.0,
Ahmad,,,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


In [None]:
new1

Unnamed: 0,Certificates,Time,Topic
Beau,6.0,12.0,Hacking


The thresh parameter in `dropna()` is used to **specify the minimum number of non-NA `(non-null)` values a row or column must have to be kept. If the row or column has fewer non-null values than the threshold, it gets dropped.

In [None]:
cer.dropna(thresh = 2)

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,,Database
Kris,2.0,5.0,
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


In [None]:
cer.dropna(thresh=3, axis = 'index')  # axis = 0

Unnamed: 0,Certificates,Time,Topic
Beau,6.0,12.0,Hacking


In [None]:
cer.dropna(thresh=1, axis = 'columns') # axis = 1

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,,Database
Kris,2.0,5.0,
Ahmad,,,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


`.fillna()` fills cells where NaN with the inputed element

In [None]:
filled = cer.fillna(0)
filled

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,0.0,Database
Kris,2.0,5.0,0
Ahmad,0.0,0.0,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,0


In [None]:
filled1 = cer.fillna(cer['Certificates'].mean())
filled1

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,5.25,Database
Kris,2.0,5.0,5.25
Ahmad,5.25,5.25,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,5.25


In [None]:
cer.fillna('I love you')

Unnamed: 0,Certificates,Time,Topic
Tom,8.0,I love you,Database
Kris,2.0,5.0,I love you
Ahmad,I love you,I love you,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,I love you


`.bfill` --   filling  back

`.ffull` --   filling forward

In [None]:
cer.fillna(method = 'ffill')   # null cells are filled with the higher standing cell value

  cer.fillna(method = 'ffill')   # null cells are filled with the higher standing cell value


Unnamed: 0,Certificates,Time,Topic
Tom,8.0,,Database
Kris,2.0,5.0,Database
Ahmad,2.0,5.0,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,Hacking


In [None]:
cer.fillna(method = 'bfill', axis = 0) # null cells are filled with the lower standing cell value

  cer.fillna(method = 'bfill', axis = 0) # null cells are filled with the lower standing cell value


Unnamed: 0,Certificates,Time,Topic
Tom,8.0,5.0,Database
Kris,2.0,5.0,Data Analysis
Ahmad,6.0,12.0,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


In [None]:
cer.fillna(method = 'bfill', axis = 1) #filled from the side

  cer.fillna(method = 'bfill', axis = 1) #
  cer.fillna(method = 'bfill', axis = 1) #


Unnamed: 0,Certificates,Time,Topic
Tom,8.0,Database,Database
Kris,2.0,5.0,
Ahmad,Data Analysis,Data Analysis,Data Analysis
Beau,6.0,12.0,Hacking
Kaizen,5.0,32.0,


In [None]:
(cer['Time'] > 1).any()  # any o them is nan

np.True_

In [None]:
(cer['Time'] > 1).all()  # all of them are nan

np.False_

`.unique values`

In [None]:
cer['Topic'].unique()

array(['Database', nan, 'Data Analysis', 'Hacking'], dtype=object)

In [None]:
new = pd.DataFrame({'Class':['A', 'A', 'F', 'D', 'L', 'L']}, index = ['Kaizen', 'Jinwoo', 'Sojunte', 'Hunhim', 'Yuka', 'Shin'])
new

Unnamed: 0,Class
Kaizen,A
Jinwoo,A
Sojunte,F
Hunhim,D
Yuka,L
Shin,L


In [None]:
new['Class'].unique()

array(['A', 'F', 'D', 'L'], dtype=object)

### Dealing with Duplicates

`.duplicated()`

In [None]:
import pandas as pd
res = {
    'name':['Kaizen', 'Jinwoo', 'Jinhoo', 'Yuka', 'Itsuya', 'Shin'],
    'score':[4, 3, 4, 5, 1, 3]
}
df = pd.DataFrame(res)
df.set_index('name', inplace =True)
df

Unnamed: 0_level_0,score
name,Unnamed: 1_level_1
Kaizen,4
Jinwoo,3
Jinhoo,4
Yuka,5
Itsuya,1
Shin,3


`.duplicated()` returns True for an elements that already been seen before

In [None]:
df.duplicated()

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kaizen,False
Jinwoo,False
Jinhoo,True
Yuka,False
Itsuya,False
Shin,True


`df.duplicated(keep = 'last')` --  keep last means, the last duplicate elements will not be matched as `True` but first matchings will become `True`

In [None]:
df.duplicated(keep = 'last')

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kaizen,True
Jinwoo,True
Jinhoo,False
Yuka,False
Itsuya,False
Shin,False


`df.duplicated(keep = False)`  -- matches any duplicate element as True

In [None]:
df.duplicated(keep = False)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kaizen,True
Jinwoo,True
Jinhoo,True
Yuka,False
Itsuya,False
Shin,True


`.drop_duplicates()`  -- drops the row with duplicate, matching a duplicate works as .duplicated

In [None]:
df.drop_duplicates()

Unnamed: 0_level_0,score
name,Unnamed: 1_level_1
Kaizen,4
Jinwoo,3
Yuka,5
Itsuya,1


In [None]:
df.drop_duplicates(keep = 'last')

Unnamed: 0_level_0,score
name,Unnamed: 1_level_1
Jinhoo,4
Yuka,5
Itsuya,1
Shin,3


Unique elements, we got scores that has no duplicates

In [None]:
df.drop_duplicates(keep = False)

Unnamed: 0_level_0,score
name,Unnamed: 1_level_1
Yuka,5
Itsuya,1


In [None]:
df.duplicated(subset=['score'], keep = False)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kaizen,True
Jinwoo,True
Jinhoo,True
Yuka,False
Itsuya,False
Shin,True


### Splition | Data Preparation

In [None]:
split = pd.DataFrame({
    'Date':['1990_MK_12_pk', '1994_KP_13_df', '1997_OK_43_po', '2009_RF_76_uy', '2007_TG_90_ds']
}
)
split

Unnamed: 0,Date
0,1990_MK_12_pk
1,1994_KP_13_df
2,1997_OK_43_po
3,2009_RF_76_uy
4,2007_TG_90_ds


In [None]:
split['Date'].str.split('_')

Unnamed: 0,Date
0,"[1990, MK, 12, pk]"
1,"[1994, KP, 13, df]"
2,"[1997, OK, 43, po]"
3,"[2009, RF, 76, uy]"
4,"[2007, TG, 90, ds]"


`expand = True` -- creates a new DataFrame out of the splited cell elements

In [None]:
split['Date'].str.split('_', expand = True)

Unnamed: 0,0,1,2,3
0,1990,MK,12,pk
1,1994,KP,13,df
2,1997,OK,43,po
3,2009,RF,76,uy
4,2007,TG,90,ds


`.str.contains()`  --  checks whether an element is in the column

In [None]:
split['Date'].str.contains('MK')

Unnamed: 0,Date
0,True
1,False
2,False
3,False
4,False


In [None]:
split[split['Date'].str.contains('1990')]

Unnamed: 0,Date
0,1990_MK_12_pk


`.strip()`  -- deletes a free spaces from the sides

In [None]:
strip = pd.DataFrame({
    'English':['I   ', '  Love', '  You   '],
    'German':['    Ich    ', '   Liebe', 'Dir  ']
})
strip

Unnamed: 0,English,German
0,I,Ich
1,Love,Liebe
2,You,Dir


In [None]:
strip['English'].str.strip()

Unnamed: 0,English
0,I
1,Love
2,You


In [None]:
strip1 = strip.applymap(lambda x: ' '.join(x.strip().split()))
strip

  strip1 = strip.applymap(lambda x: ' '.join(x.strip().split()))


Unnamed: 0,English,German
0,I,Ich
1,Love,Liebe
2,You,Dir


In [None]:
ger = strip[['German']].replace(' ', '')
ger

Unnamed: 0,German
0,Ich
1,Liebe
2,Dir
