---
---
# **Pandas**

#### **1. input/output**

1.1 read_csv  
1.2 read_excel  
1.3 DataFrame.to_csv  
1.4 DataFrame.to_excel  

#### **2. General Function**

2.1 pivot    
2.2 crosstab  
2.3 merge  
2.4 concat  
2.5 get_dummies  
2.6 unique  
2.7 isna  
2.8 isnull  
2.9 notna  
2.10 to_datetime  

#### **3. Series**

3.1 index  
3.2 array  
3.3 values  
3.4 dtype  
3.5 shape  
3.6 ndim  
3.7 size  
3.8 T  
3.9 empty  
3.10 dtypes  
3.11 name  

#### **4. DataFrame**

4.1 index  
4.2 dtypes  
4.3 columns  
4.4 info  
4.5 select_dtypes  
4.6 values  
4.7 axes  
4.8 ndim  
4.9 size  
4.10 shape  
4.11 empty  
4.12 astype  
4.13 bool  
4.14 head  
4.15 loc    
4.16 iloc  
4.17 items  
4.18 keys  
4.19 pop  
4.20 tail  
4.21 head  
4.22 where  
4.23 query  
4.24 apply  
4.25 agg  
4.26 transform  
4.27 groupby  
4.28 rolling  
4.29 corr  
4.30 quantile  
4.31 value_counts  
4.32 drop  
4.33 drop_duplicates  
4.34 duplicated  
4.35 filter  
4.36 rename  
4.37 dropna  

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

---

--- 

#### **1. input/output**

1.1 read_csv  
1.2 read_excel  
1.3 DataFrame.to_csv  
1.4 DataFrame.to_excel  

# 1.1 read_csv

In [2]:
# Load the cars CSV from its (machine-specific) Windows location and preview
# the first three rows. The raw string spells the same path without doubled
# backslashes.
cars_csv_path = r'S:\Python_and_sql\cars.csv'
data_csv = pd.read_csv(cars_csv_path)
data_csv.head(3)

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.


# 1.2 read_excel

In [3]:
# Load the people workbook from its (machine-specific) Windows location and
# preview the first three rows. The raw string spells the same path without
# doubled backslashes.
people_xlsx_path = r'S:\Python_and_sql\people.xlsx'
data_excel = pd.read_excel(people_xlsx_path)
data_excel.head(3)

Unnamed: 0,satisfactoryLevel,lastEvaluation,numberOfProjects,avgMonthlyHours,timeSpent.company,workAccident,left,promotionInLast5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium


# 1.3 DataFrame.to_csv

In [4]:
# Build a 2x2 frame of letters with labelled rows and columns, then write it
# (index included) to a CSV file in the current working directory.
row_labels = ['row 1', 'row 2']
col_labels = ['col 1', 'col 2']
cells = [['a', 'b'], ['c', 'd']]
df1 = pd.DataFrame(cells, index=row_labels, columns=col_labels)
df1.to_csv("df_csv.csv")

# 1.4 DataFrame.to_excel

In [5]:
# Same 2x2 letter frame, this time assembled row by row (keys are the row
# labels), then written to an Excel workbook in the current working directory.
rows_by_label = {'row 1': ['a', 'b'], 'row 2': ['c', 'd']}
df1 = pd.DataFrame.from_dict(rows_by_label, orient='index',
                             columns=['col 1', 'col 2'])
df1.to_excel("df_excel.xlsx")

--- 
---

#### **2. General Function**

2.1 pivot  
2.2 crosstab  
2.3 merge  
2.4 concat  
2.5 get_dummies  
2.6 unique  
2.7 isna  
2.8 isnull  
2.9 notna  
2.10 to_datetime  

In [6]:
# main dataframe for practice

# Build the frame from a dict of columns so every column keeps its own dtype.
# Stacking the integers and the letters in a single NumPy array coerces all
# values to strings, which would leave the 'numbers' column non-numeric.
data = {
    'numbers': list(range(1, 11)),
    'alphabets': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'a'],
}

df2 = pd.DataFrame(data)

print(df2)

  numbers alphabets
0       1         a
1       2         b
2       3         c
3       4         a
4       5         b
5       6         c
6       7         a
7       8         b
8       9         c
9      10         a


# 2.1 pivot

In [7]:
# Long-format sample: one temperature reading per (date, city) pair.
data = dict(
    Date=['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
    City=['New York', 'Los Angeles', 'New York', 'Los Angeles'],
    Temperature=[32, 75, 30, 72],
)
df = pd.DataFrame(data)

# Reshape long -> wide: dates become the row index, each city becomes a
# column, and the cells hold the temperature for that date/city.
pivot_df = pd.pivot(df, index='Date', columns='City', values='Temperature')

#########################################################################################################
# Print the input frame and the pivoted frame, each under its own banner.
for banner, frame in (('\n   dataset   \n', df),
                      ('\n   pivoted table   \n', pivot_df)):
    print(banner)
    print(frame)


   dataset   

         Date         City  Temperature
0  2023-01-01     New York           32
1  2023-01-01  Los Angeles           75
2  2023-01-02     New York           30
3  2023-01-02  Los Angeles           72

   pivoted table   

City        Los Angeles  New York
Date                             
2023-01-01           75        32
2023-01-02           72        30


# 2.2 crosstab 

In [8]:
# crosstab normal

crosstab = pd.crosstab(df['Date'], df['City'])
print(crosstab)

City        Los Angeles  New York
Date                             
2023-01-01            1         1
2023-01-02            1         1


In [9]:
# crosstab with aggregate function (sum)

# Sum of Temperature for every Date/City pair. The result is named *_sum so
# the variable name agrees with aggfunc='sum'.
crosstab_sum = pd.crosstab(df['Date'], df['City'], values=df['Temperature'], aggfunc='sum')
print(crosstab_sum)

City        Los Angeles  New York
Date                             
2023-01-01           75        32
2023-01-02           72        30


# 2.3 merge  

In [10]:
# creating two dataframes

# df1 maps ID -> Name; df2 maps ID -> Score. IDs 2 and 3 occur in both
# frames, ID 1 only in df1 and ID 4 only in df2.
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
df2 = pd.DataFrame({'ID': [2, 3, 4], 'Score': [85, 90, 95]})

print('df1 \n', df1, '\n\n')
print('df2 \n', df2, '\n\n')


df1 
    ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie 


df2 
    ID  Score
0   2     85
1   3     90
2   4     95 




In [11]:
# merge works like joins in SQL

# merging df1 and df2 on the ID column

# no `how` given: only IDs present in both df1 and df2 are kept
inner_merged_df = df1.merge(df2, on='ID')  # inner join

# left merge: df1 (the caller) is the left table, so every df1 row is kept

left_merged_df = df1.merge(df2, on='ID', how='left')  # left join

# right merge: df2 (the argument) is the right table, so every df2 row is kept

right_merged_df = df1.merge(df2, on='ID', how='right') # right join


# outer merge: rows from both df1 and df2 are kept; unmatched cells become NaN

outer_merged_df = df1.merge(df2, on='ID', how='outer') # outer join


# print the four results one after another, each under its own label

print('inner join')
print(inner_merged_df)
print('\n')
print('left join')
print(left_merged_df)
print('\n')
print('right join')
print(right_merged_df)
print('\n')
print('outer join')
print(outer_merged_df)
print('\n')


inner join
   ID     Name  Score
0   2      Bob     85
1   3  Charlie     90


left join
   ID     Name  Score
0   1    Alice    NaN
1   2      Bob   85.0
2   3  Charlie   90.0


right join
   ID     Name  Score
0   2      Bob     85
1   3  Charlie     90
2   4      NaN     95


outer join
   ID     Name  Score
0   1    Alice    NaN
1   2      Bob   85.0
2   3  Charlie   90.0
3   4      NaN   95.0




In [12]:
# Merge on Different Column Names

# The key is called 'EmployeeID' in emp and 'ID' in score, so each side names
# its own key column; both key columns appear in the result.
emp = pd.DataFrame({'EmployeeID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
score = pd.DataFrame({'ID': [2, 3, 4], 'Score': [85, 90, 95]})

diffcol_merge_df = pd.merge(
    emp,
    score,
    how='inner',
    left_on='EmployeeID',
    right_on='ID',
)

#########################################################################################################
# merging on index

# Both frames are keyed by their row index (1, 2, 3 versus 1, 2, 4); the outer
# merge keeps every index label and fills the gaps with NaN.
emp1 = pd.DataFrame(
    {'Name': ['Alice', 'Bob', 'Charlie'], 'Score': [85, 90, 95]},
    index=[1, 2, 3],
)
dep = pd.DataFrame({'Department': ['HR', 'IT', 'Finance']}, index=[1, 2, 4])

index_merge_df = pd.merge(
    emp1,
    dep,
    how='outer',
    left_index=True,
    right_index=True,
)

#########################################################################################################

print('diffcol_merge_df \n')
print(diffcol_merge_df)
print('\n')
print('index_merge_df \n')
print(index_merge_df)


diffcol_merge_df 

   EmployeeID     Name  ID  Score
0           2      Bob   2     85
1           3  Charlie   3     90


index_merge_df 

      Name  Score Department
1    Alice   85.0         HR
2      Bob   90.0         IT
3  Charlie   95.0        NaN
4      NaN    NaN    Finance


# 2.4 concat 

In [13]:
# Two frames with identical columns (A, B) and identical row labels (0, 1).
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# Concatenating Along Rows: df2 is stacked under df1 and the original row
# labels are kept, so 0 and 1 each appear twice.
concat_row = pd.concat([df1, df2], axis=0)

#########################################################################################################
# Concatenating Along Columns: frames are placed side by side, aligned on the
# row labels, so the column names A and B each appear twice.
concat_col = pd.concat([df1, df2], axis='columns')

#########################################################################################################

# Print every frame under its name, followed by a blank gap.
for label, frame in (('df1', df1), ('df2', df2),
                     ('concat_row', concat_row), ('concat_col', concat_col)):
    print(label)
    print(frame)
    print('\n')

df1
   A  B
0  1  3
1  2  4


df2
   A  B
0  5  7
1  6  8


concat_row
   A  B
0  1  3
1  2  4
0  5  7
1  6  8


concat_col
   A  B  A  B
0  1  3  5  7
1  2  4  6  8




# 2.5 get_dummies

In [14]:
# Two categorical columns to encode.
df = pd.DataFrame({
    'Animal': ['cat', 'dog', 'bird', 'dog'],
    'Color': ['black', 'white', 'black', 'white'],
})

# Basic Usage with a Series: one indicator column per distinct animal.
animal_column = df['Animal']
seriesdum = pd.get_dummies(animal_column)

#####################################################################################################################
# One-Hot Encoding with a DataFrame: every column is encoded and the new
# columns are named <column>_<value>.
one_hotdum = pd.get_dummies(df)

#####################################################################################################################
# Adding a Prefix: replace the column name in the generated names
# (Animal -> Type, Color -> Shade).
prefixdum = pd.get_dummies(df, prefix={'Animal': 'Type', 'Color': 'Shade'})

#####################################################################################################################
# Encoding Specific Columns: only 'Animal' is encoded, 'Color' is left as is.
specificdum = pd.get_dummies(df, columns=['Animal'])

print(df)
for label, encoded in (('Series', seriesdum), ('One-Hot', one_hotdum),
                       ('Prefix', prefixdum), ('Specific', specificdum)):
    print(label)
    print(encoded)

  Animal  Color
0    cat  black
1    dog  white
2   bird  black
3    dog  white
Series
    bird    cat    dog
0  False   True  False
1  False  False   True
2   True  False  False
3  False  False   True
One-Hot
   Animal_bird  Animal_cat  Animal_dog  Color_black  Color_white
0        False        True       False         True        False
1        False       False        True        False         True
2         True       False       False         True        False
3        False       False        True        False         True
Prefix
   Type_bird  Type_cat  Type_dog  Shade_black  Shade_white
0      False      True     False         True        False
1      False     False      True        False         True
2       True     False     False         True        False
3      False     False      True        False         True
Specific
   Color  Animal_bird  Animal_cat  Animal_dog
0  black        False        True       False
1  white        False       False        True
2  black         True       False       False
3  white        False       False        True

# 2.6 unique

In [15]:
# Column 'B' repeats 'x'; unique() returns each distinct value once, in order
# of first appearance.
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': ['x', 'y', 'x', 'z']})

letters = df['B']
unique_values = letters.unique()
print(unique_values)

['x' 'y' 'z']


# 2.7 isna

In [16]:
# Frame with gaps: one missing value in A, one in B, and C entirely missing.
df = pd.DataFrame({'A': [1, None, 3], 'B': [4, 5, None], 'C': [None, None, None]})

# isna

# True marks a missing cell, False a present one (same shape as df).
missing = pd.isna(df)
print(missing)


       A      B     C
0  False  False  True
1   True  False  True
2  False   True  True


# 2.8 isnull

In [17]:
# isnull
missing = df.isnull()
print(missing)

       A      B     C
0  False  False  True
1   True  False  True
2  False   True  True


# 2.9 notna

In [18]:
# notna
non_missing = df.notna()
print(non_missing)


       A      B      C
0   True   True  False
1  False   True  False
2   True  False  False


# 2.10 to_datetime

In [19]:
# Calendar parts held in separate year / month / day columns.
date_parts = {'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}
df = pd.DataFrame(date_parts)
print(df)

# to_datetime assembles one timestamp per row from those three columns.
pd.to_datetime(df)

   year  month  day
0  2015      2    4
1  2016      3    5


0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

---
---

#### **3. Series**

3.1 index  
3.2 array  
3.3 values  
3.4 dtype  
3.5 shape  
3.6 ndim  
3.7 size  
3.8 T  
3.9 empty  
3.10 dtypes  
3.11 name  

# 3.1 index  

In [20]:
# Series of three integers labelled 'x', 'y', 'z' (dict keys become the index).
series = pd.Series({'x': 10, 'y': 20, 'z': 30})
print(series.index)

Index(['x', 'y', 'z'], dtype='object')


# 3.2 array 

In [21]:
print(series.array)

###################################################################################
print( '\n\nseries\n' , series )

<NumpyExtensionArray>
[10, 20, 30]
Length: 3, dtype: int64


series
 x    10
y    20
z    30
dtype: int64


# 3.3 values  

In [22]:
print(series.values)

###################################################################################
print( '\n\nseries\n' , series )

[10 20 30]


series
 x    10
y    20
z    30
dtype: int64


# 3.4 dtype  

In [23]:
print(series.dtype)  # For Series

###################################################################################
print( '\n\nseries\n' , series )

int64


series
 x    10
y    20
z    30
dtype: int64


# 3.5 shape  

In [24]:
print(series.shape) 

###################################################################################
print( '\n\nseries\n' , series )

(3,)


series
 x    10
y    20
z    30
dtype: int64


# 3.6 ndim  

In [25]:
print(series.ndim)  

###################################################################################
print( '\n\nseries\n' , series )

1


series
 x    10
y    20
z    30
dtype: int64


# 3.7 size  

In [26]:
print(series.size) 

###################################################################################
print( '\n\nseries\n' , series )

3


series
 x    10
y    20
z    30
dtype: int64


# 3.8 T  

In [27]:
print(series.T) #transpose

###################################################################################
print( '\n\nseries\n' , series )

x    10
y    20
z    30
dtype: int64


series
 x    10
y    20
z    30
dtype: int64


# 3.9 empty  

In [28]:
# A Series with no elements. The dtype is passed explicitly so the result does
# not depend on the pandas version's default dtype for an empty Series.
empty_ser = pd.Series(dtype="object")
print(empty_ser.empty)  # True , empty_ser is empty
print(series.empty)     # False ,series is not empty

True
False


# 3.10 dtypes  

In [29]:
print(series.dtypes)

###################################################################################
print( '\n\nseries\n' , series )

int64


series
 x    10
y    20
z    30
dtype: int64


# 3.11 name  

In [30]:
series.name = 'Numbers' # series can also be named
print(series.name)

###################################################################################
print( '\n\nseries\n' , series )

Numbers


series
 x    10
y    20
z    30
Name: Numbers, dtype: int64


---
---
#### **4. DataFrame**

4.1 index  
4.2 dtypes  
4.3 columns  
4.4 info  
4.5 select_dtypes  
4.6 values  
4.7 axes  
4.8 ndim  
4.9 size  
4.10 shape  
4.11 empty  
4.12 astype  
4.13 bool  
4.14 head  
4.15 loc    
4.16 iloc  
4.17 items  
4.18 keys  
4.19 pop  
4.20 tail  
4.21 head  
4.22 where  
4.23 query  
4.24 apply  
4.25 agg  
4.26 transform  
4.27 groupby  
4.28 rolling  
4.29 corr  
4.30 quantile  
4.31 value_counts  
4.32 drop  
4.33 drop_duplicates  
4.34 duplicated  
4.35 filter  
4.36 rename  
4.37 dropna  

# 4.1 index  

In [31]:
# 4x3 integer frame (A = 1..4, B = 5..8, C = 9..12) with row labels 'a'..'d'.
columns_data = {
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': [9, 10, 11, 12],
}
df = pd.DataFrame(columns_data, index=list('abcd'))
print(df.index)

Index(['a', 'b', 'c', 'd'], dtype='object')


# 4.2 dtypes   

In [32]:
print(df.dtypes)

##########################################################################
print('\ndf\n', df ,'\n')

A    int64
B    int64
C    int64
dtype: object

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.3 columns  


In [33]:
print(df.columns)

##########################################################################
print('\ndf\n', df ,'\n')

Index(['A', 'B', 'C'], dtype='object')

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.4 info  

In [34]:
print(df.info())

##########################################################################
print('\ndf\n', df ,'\n')

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       4 non-null      int64
 1   B       4 non-null      int64
 2   C       4 non-null      int64
dtypes: int64(3)
memory usage: 128.0+ bytes
None

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.5 select_dtypes  


In [35]:
print(df.select_dtypes(include='int64'))

##########################################################################
print('\ndf\n', df ,'\n')

   A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.6 values  


In [36]:
print(df.values)

##########################################################################
print('\ndf\n', df ,'\n')

[[ 1  5  9]
 [ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]]

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.7 axes  

In [37]:
print(df.axes)

##########################################################################
print('\ndf\n', df ,'\n')

[Index(['a', 'b', 'c', 'd'], dtype='object'), Index(['A', 'B', 'C'], dtype='object')]

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.8 ndim  

In [38]:
print(df.ndim)

##########################################################################
print('\ndf\n', df ,'\n')

2

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.9 size  

In [39]:
print(df.size)

##########################################################################
print('\ndf\n', df ,'\n')

12

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.10 shape  

In [40]:
print(df.shape)

##########################################################################
print('\ndf\n', df ,'\n')

(4, 3)

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.11 empty  

In [41]:
print(df.empty)

##########################################################################
print('\ndf\n', df ,'\n')

False

df
    A  B   C
a  1  5   9
b  2  6  10
c  3  7  11
d  4  8  12 



# 4.12 astype  

In [42]:
df['A'] = df['A'].astype(float)
print(df.dtypes)

##########################################################################
print('\ndf\n', df ,'\n')

A    float64
B      int64
C      int64
dtype: object

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.13 bool  


In [43]:
# Truthiness of a single cell: take the second value of column 'A' by position.
# .iloc is used because df is indexed by the labels 'a'..'d'; a bare integer
# key (df["A"][1]) relies on the deprecated positional fallback of
# Series.__getitem__ on a label index.
print(bool(df["A"].iloc[1]))

##########################################################################
print('\ndf\n', df ,'\n')

True

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.14 head  


In [44]:
print(df.head())

##########################################################################
print('\ndf\n', df ,'\n')

     A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.15 loc    


In [45]:
print(df.loc['a'])

##########################################################################
print('\ndf\n', df ,'\n')

A    1.0
B    5.0
C    9.0
Name: a, dtype: float64

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.16 iloc  


In [46]:
print('\ndf.iloc[2]')

print(df.iloc[2]) # returns value at that index row (0,1,2 so third row)

print('\ndf.iloc[2,2]')

print(df.iloc[2,2]) # returns value at [row and col] index

##########################################################################
print('\ndf\n', df ,'\n')


df.iloc[2]
A     3.0
B     7.0
C    11.0
Name: c, dtype: float64

df.iloc[2,2]
11

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.17 items  


In [47]:
for col, data in df.items():
    print(col, data.values)

##########################################################################
print('\ndf\n', df ,'\n')

A [1. 2. 3. 4.]
B [5 6 7 8]
C [ 9 10 11 12]

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.18 keys  


In [48]:
print(df.keys())

##########################################################################
print('\ndf\n', df ,'\n')

Index(['A', 'B', 'C'], dtype='object')

df
      A  B   C
a  1.0  5   9
b  2.0  6  10
c  3.0  7  11
d  4.0  8  12 



# 4.19 pop  


In [49]:
popped_col = df.pop('A')
print(popped_col)

##########################################################################
print('\ndf\n', df ,'\n')

a    1.0
b    2.0
c    3.0
d    4.0
Name: A, dtype: float64

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.20 tail  


In [50]:
print(df.tail(1))    # tail() returns the last 5 rows by default; pass n to choose how many rows (here 1)

##########################################################################
print('\ndf\n', df ,'\n')

   B   C
d  8  12

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.21 head  


In [51]:
print(df.head(2)) # head() returns the first 5 rows by default; pass n to choose how many rows (here 2)

##########################################################################
print('\ndf\n', df ,'\n')

   B   C
a  5   9
b  6  10

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.22 where  


In [52]:
print(df.where(df > 9))

##########################################################################
print('\ndf\n', df ,'\n')

    B     C
a NaN   NaN
b NaN  10.0
c NaN  11.0
d NaN  12.0

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.23 query  


In [53]:
filtered = df.query('B > 6')
print(filtered)

##########################################################################
print('\ndf\n', df ,'\n')

   B   C
c  7  11
d  8  12

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.24 apply  


In [54]:
print(df.apply(sum, axis=0))  # Sum of columns

##########################################################################
print('\ndf\n', df ,'\n')

B    26
C    42
dtype: int64

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.25 agg  


In [55]:
print(df.agg(['sum', 'mean']))

##########################################################################
print('\ndf\n', df ,'\n')

         B     C
sum   26.0  42.0
mean   6.5  10.5

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.26 transform  


In [56]:
print(df.transform(lambda x: x * 2))

##########################################################################
print('\ndf\n', df ,'\n')

    B   C
a  10  18
b  12  20
c  14  22
d  16  24

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.27 groupby  


In [57]:
grouped = df.groupby('B').sum()
grouped

Unnamed: 0_level_0,C
B,Unnamed: 1_level_1
5,9
6,10
7,11
8,12


# 4.28 rolling  


In [58]:
print(df['B'].rolling(2).mean())

##########################################################################
print('\ndf\n', df ,'\n')

a    NaN
b    5.5
c    6.5
d    7.5
Name: B, dtype: float64

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.29 corr  


In [59]:
print(df.corr())

##########################################################################
print('\ndf\n', df ,'\n')

     B    C
B  1.0  1.0
C  1.0  1.0

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.30 quantile  


In [60]:
print(df.quantile(0.5))

##########################################################################
print('\ndf\n', df ,'\n')

B     6.5
C    10.5
Name: 0.5, dtype: float64

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.31 value_counts  


In [61]:
print(df['B'].value_counts())

##########################################################################
print('\ndf\n', df ,'\n')

B
5    1
6    1
7    1
8    1
Name: count, dtype: int64

df
    B   C
a  5   9
b  6  10
c  7  11
d  8  12 



# 4.32 drop  


In [62]:
# Fresh 3-row frame with integer row labels 0, 1, 2.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 2, 6]}, index=[0, 1, 2])

# Dropping the row with index 1 (drop returns a new frame; df keeps all rows)
df_dropped = df.drop(labels=[1], axis='index')
print(df_dropped)

##########################################################################
print('\ndf\n', df ,'\n')

   A  B
0  1  4
2  3  6

df
    A  B
0  1  4
1  2  2
2  3  6 



# 4.33 drop_duplicates  


In [63]:
print(df.drop_duplicates())

##########################################################################
print('\ndf\n', df ,'\n')

   A  B
0  1  4
1  2  2
2  3  6

df
    A  B
0  1  4
1  2  2
2  3  6 



# 4.34 duplicated  


In [64]:
print(df.duplicated())

##########################################################################
print('\ndf\n', df ,'\n')

0    False
1    False
2    False
dtype: bool

df
    A  B
0  1  4
1  2  2
2  3  6 



# 4.35 filter  


In [65]:
print(df.filter(items=['B']))

##########################################################################
print('\ndf\n', df ,'\n')

   B
0  4
1  2
2  6

df
    A  B
0  1  4
1  2  2
2  3  6 



# 4.36 rename  
  

In [66]:
print(df.rename(columns={'A': 'Alpha'}))

##########################################################################
print('\ndf\n', df ,'\n')

   Alpha  B
0      1  4
1      2  2
2      3  6

df
    A  B
0  1  4
1  2  2
2  3  6 



# 4.37 dropna

In [67]:
# Frame whose column 'A' has one missing value (row 1).
df = pd.DataFrame({
    'A': [1, None, 3],
    'B': [4, 2, 6]
}, index=[0, 1, 2])

print(df.dropna(inplace=False)) # inplace=False returns a new frame without the NaN rows and leaves df unchanged; inplace=True would drop them from df permanently

##########################################################################
print('\ndf\n', df ,'\n')

     A  B
0  1.0  4
2  3.0  6

df
      A  B
0  1.0  4
1  NaN  2
2  3.0  6 



# Group by examples

**df.groupby( [ categorical_name ] ) [ numerical_name ].agg( aggregate_functions)**

![at1.jpg](attachment:at1.jpg)

**df.groupby( [ categoricalcolumns ] ) [ [ numerical_columns ] ].agg ( [ aggregate_functions ] )**

![at2.jpg](attachment:at2.jpg)

**df.groupby( [ categorical_columns ]  ).agg( { numericalcol: ( aggregatefunctions ), numericalcol2 : aggregatefunction } )**


![WhatsApp Image 2024-12-20 at 15.06.23_8a96b71e.jpg](<attachment:WhatsApp Image 2024-12-20 at 15.06.23_8a96b71e.jpg>)