<h1>Operations with DataFrames</h1>

In [1]:
import pandas as pd
# Small demo frame: two integer columns and one string column.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [2]:
# info() prints a concise summary of the DataFrame: the index range, each
# column's non-null count and dtype, and the memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      int64 
 1   col2    4 non-null      int64 
 2   col3    4 non-null      object
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


In [3]:
# memory_usage() returns the number of bytes used by the index and by each
# column of the DataFrame.
# NOTE(review): for object columns such as col3 the default figure presumably
# covers only the references; pass deep=True to include the strings
# themselves -- confirm against the pandas documentation.

df.memory_usage()

Index    128
col1      32
col2      32
col3      32
dtype: int64

<h3>Information about unique values</h3>

In [4]:
# Series.unique() returns the distinct values of a column as an array.

df['col2'].unique()

array([444, 555, 666], dtype=int64)

In [5]:
# nunique() counts how many distinct values the column contains.

df['col2'].nunique()

3

In [6]:
# value_counts() returns how many times each distinct value occurs in the column.

df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

<h3>Applying functions</h3>

In [8]:
#Defining a function

def comp(x):
    """Return the square of ``x`` plus three (``x ** 2 + 3``)."""
    squared = x ** 2
    return squared + 3

In [9]:
# Display the DataFrame before applying the function.
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [10]:
# Series.apply() calls the given function on each value of the column and
# returns a new Series with the results.

df['col1'].apply(comp)

0     4
1     7
2    12
3    19
Name: col1, dtype: int64

In [11]:
# The result of apply() can be assigned to a new column of the DataFrame.

df['col1_calc'] = df['col1'].apply(comp)

In [12]:
# Display the DataFrame to check the new col1_calc column.
df

Unnamed: 0,col1,col2,col3,col1_calc
0,1,444,abc,4
1,2,555,def,7
2,3,666,ghi,12
3,4,444,xyz,19


In [13]:
# apply() also accepts a lambda, so a one-off function does not need a def.
# This lambda computes the same expression as comp().

df['col1'].apply(lambda x: x ** 2 + 3)

0     4
1     7
2    12
3    19
Name: col1, dtype: int64

In [14]:
# sum() adds up all the values in the column (1 + 2 + 3 + 4).

df['col1'].sum()

10

In [15]:
# mean() returns the arithmetic mean (average) of the values in the column.

df['col1'].mean()

2.5

In [16]:
# product() multiplies all the values in the column together (1 * 2 * 3 * 4).

df['col1'].product()

24

In [17]:
# std() returns the standard deviation of the values in the column.
# The result below matches the sample standard deviation (dividing by N - 1).

df['col1'].std()

1.2909944487358056

In [18]:
# max() returns the largest value in the column.

df['col1'].max()

4

In [19]:
# min() returns the smallest value in the column.

df['col1'].min()

1

In [20]:
# idxmax() returns the index label of the row holding the maximum value,
# not the value itself.

df['col1'].idxmax()

3

In [21]:
# Functions can be combined with filters. For example, to sum the values in
# col1 only where col2 is 444, first select the rows where col2 equals 444.

df[df['col2'] == 444]

Unnamed: 0,col1,col2,col3,col1_calc
0,1,444,abc,4
3,4,444,xyz,19


In [25]:
# Sum col1 over the rows where col2 equals 444. Rows and column are selected
# in a single .loc call rather than with chained indexing (df[mask]['col1']),
# which is the recommended pandas idiom and avoids an intermediate frame.
df.loc[df['col2'] == 444, 'col1'].sum()

5

In [27]:
# sort_values() sorts a DataFrame or Series by its values.
# The `by` argument specifies which column to sort on.

df.sort_values(by='col2')

Unnamed: 0,col1,col2,col3,col1_calc
0,1,444,abc,4
3,4,444,xyz,19
1,2,555,def,7
2,3,666,ghi,12


In [28]:
# Source data for a new example frame; note that `df` is rebound here and no
# longer refers to the col1/col2/col3 frame used above.
data = {
    'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y', 'x', 'y', 'x', 'y'],
    'D': [1, 3, 2, 5, 4, 1],
}

df = pd.DataFrame(data)

In [29]:
# Display the new DataFrame.
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [31]:
# Series.map() replaces each value of a column using a dictionary (or a function).

# The dictionary below says that whenever a value in column 'B' equals a key,
# it is replaced by the corresponding dictionary value: 'one' becomes '1' and
# 'two' becomes '2' (both strings). The mapped values are stored in a new
# column named 'E'.


dict_map = {"one": "1", "two": "2"}

df['E'] = df['B'].map(dict_map)

df

Unnamed: 0,A,B,C,D,E
0,foo,one,x,1,1
1,foo,one,y,3,1
2,foo,two,x,2,2
3,bar,two,y,5,2
4,bar,one,x,4,1
5,bar,one,y,1,1


In [37]:
# pivot_table() summarizes and aggregates data by building a pivot table.
# Here the rows are the values of 'A', the columns are the values of 'B',
# and each cell holds the sum of 'D' for that combination.

df.pivot_table(index='A', columns='B', values='D', aggfunc='sum')

B,one,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,5,5
foo,4,2
