# Pandas
__________
**`Pandas:`** Pandas is a Python package that offers various data structures and operations for manipulating numerical data and time series.
              It is popular mainly because it makes importing and analyzing data much easier.

In [1]:
# !pip install pandas
import numpy as np
import pandas as pd

> ## Series
_A Pandas Series is a 1-D labeled array capable of holding data of any type._

In [6]:
# series through list
# With no index given, pandas labels the values 0..n-1.

lst = [1, 2, 3, 4]
pd.Series(lst)

0    1
1    2
2    3
3    4
dtype: int64

In [8]:
# series through NUMPY
# numpy is imported as `np` in the notebook's first (imports) cell, so every
# later cell that uses `np` works regardless of whether this cell was run.

arr = np.array([1, 2, 3, 4])
pd.Series(arr)

0    1
1    2
2    3
3    4
dtype: int64

__Giving Index from our own end__

In [13]:
# `index=` supplies the row labels explicitly instead of the default 0..n-1.
pd.Series(data = ['mnr', 'ram', 'rahul', 'sai'], index = [1, 2 ,3, 4])

1      mnr
2      ram
3    rahul
4      sai
dtype: object

__Series through Dictionary__

In [14]:
# Dictionary keys become the index labels; dictionary values become the data.
d = {1: 'MNR', 2: 'pravs', 3: 'hari'}
pd.Series(d)

1      MNR
2    pravs
3     hari
dtype: object

#### Using `repeat` function along with creating a Series

The Pandas `Series.repeat()` function repeats the elements of a Series. It returns a new Series where each element of the current Series is repeated consecutively a given number of times.

In [4]:
# The index label is repeated together with the value, so every row is labelled 0.
pd.Series('MNR').repeat(3)

0    MNR
0    MNR
0    MNR
dtype: object

We can use `reset_index(drop=True)` to replace the repeated labels with a fresh sequential index (0, 1, 2, ...).

In [7]:
# Without drop=True the old index is kept as a column and the result is a DataFrame:
# type(pd.Series(5).repeat(3).reset_index())   --> DataFrame
# drop=True discards the old index, so the result stays a Series.
pd.Series(5).repeat(3).reset_index(drop=True)

0    5
1    5
2    5
dtype: int64

The below code indicates:
- MNR should be repeated 3 times and 
- Narendra should be repeated 4 times
- Reddy should be repeated 3 times
- and so on

In [None]:
# A list passed to repeat() gives one repeat count per element (3 + 4 + 3 = 10 rows).
s = pd.Series(['MNR', "Narendra", 'Reddy']).repeat([3, 4, 3]).reset_index(drop=True)
s

#### Accessing elements

In [15]:
# After reset_index the labels are 0..9, so each integer key selects one element by label.
print(s[0], s[4], s[8])

MNR Narendra Reddy


In [None]:
# Slicing examples (uncomment one line at a time); the syntax is start:stop:step.
# s[:]
# s[0:4]
# s[0:-5]
# s[0:10:2]
# s[::-1]

### `Aggregate function` on pandas Series

The Pandas `Series.aggregate()` function aggregates the given Series using one or more operations over the specified axis.

In [10]:
ar = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
# a = ar.agg([min:= 'min', max := 'max', sum := 'sum', 'mean', 'median', product := 'prod'])
# a[product]
# A list of function names returns a Series with one labelled result per function.
# All results share one float64 Series, which is why the integer results print as floats.
ar.agg(['min', 'max', 'sum', 'mean', 'median', 'prod'])   # `agg` is an alias for `aggregate`

min           1.0
max           8.0
sum          36.0
mean          4.5
median        4.5
prod      40320.0
dtype: float64

### Series `absolute` function

> Pandas Series.abs() method is used to get the absolute numeric value of each element in Series/DataFrame.

In [None]:
# Seven sample values with mixed signs. The comma between 5 and -23 matters:
# without it Python evaluates `5 -23` as the single element -18.
sr = pd.Series([1, -3, 4, -67, 4, 5, -23])
# abs() returns a new Series with the element-wise absolute value.
sr.abs()

### `Appending Series`

> The pandas `pd.concat()` function is used to concatenate two or more Series objects.

In [6]:
# Concatenate the two Series end to end. ignore_index=True renumbers the result
# 0..n-1 in the same call, so no separate reset_index(drop=True) is needed.
sr1 = pd.Series(['a', 'b', 'c'])
sr2 = pd.Series(['f', 'r'])
sr3 = pd.concat([sr1, sr2], ignore_index=True)
sr3

0    a
1    b
2    c
3    f
4    r
dtype: object

### `astype` function

> Used to change the data type of a Series

In [15]:
# Integer input is stored with an integer dtype (int64 in the output shown).
sr = pd.Series([1, -4, 5, -2])
sr

0    1
1   -4
2    5
3   -2
dtype: int64

In [16]:
# astype returns a converted copy; `sr` itself is not reassigned here.
sr.astype('float')

0    1.0
1   -4.0
2    5.0
3   -2.0
dtype: float64

### `between` function

> Pandas Series.between(): Return boolean Series equivalent to left <= series <= right.

In [20]:
sr = pd.Series([10, 20, 5, 3, 40, 30, 12])
# inclusive='neither' excludes both bounds, so the values 10 and 30 themselves give False.
sr.between(10, 30, inclusive='neither')     # default inclusive = 'both'

0    False
1     True
2    False
3    False
4    False
5    False
6     True
dtype: bool

### All `string functions` can be used to extract or modify strings in a series

> `upper`, `lower`, `title`, `capitalize`, `casefold` and `swapcase` functions

In [7]:
# Sample strings for the case-conversion examples in the next cell.
siri = pd.Series(["Narendra Reddy" , "Data Science" , "Geeks for Geeks" , 'Hello World' , 'Machine Learning'])

In [29]:
# Uncomment one line at a time; each call returns a new Series of converted strings.
# siri.str.upper()
# siri.str.lower()
# siri.str.title()
# siri.str.capitalize()
# siri.str.casefold()
# siri.str.swapcase()

> `strip` function

In [None]:
# Some entries carry leading or trailing spaces on purpose.
siri = pd.Series(["  Narendra Reddy" , "Data Science  " , "Geeks for Geeks" , 'Hello World' , 'Machine Learning  '])
# str.strip() returns a stripped copy; `siri` is not reassigned, so it keeps the spaces.
siri.str.strip()
# print(siri)

> `split` function

In [10]:
siri = pd.Series(["Narendra Reddy" , "Data Science" , "Geeks for Geeks" , 'Hello World' , 'Machine Learning'])
# siri.str.split(n = 1)
# siri.str.split()[0][1]
# With no separator given, str.split() splits on whitespace and yields one list per element.
siri.str.split()

0      [Narendra, Reddy]
1        [Data, Science]
2    [Geeks, for, Geeks]
3         [Hello, World]
4    [Machine, Learning]
dtype: object

> `contains` function

In [None]:
siri = pd.Series(["Narendra Reddy" , "Data@Science" , "Geeks for Geeks" , 'Hello@World' , 'Machine Learning'])
# Boolean Series: True where the element contains '@'.
siri.str.contains('@')

> `replace` function

In [None]:
# Replace every literal '@' with a space. regex=False makes the literal match
# explicit, so the result does not depend on the installed pandas version's default.
# Note: this rebinds `siri`, so later cells see the cleaned strings.
siri = siri.str.replace('@', ' ', regex=False)
print(siri)

> `count` function

In [None]:
# Number of occurrences of lowercase 'a' in each element (the match is case-sensitive).
siri.str.count('a')

> `startswith` and `endswith` function

In [56]:
# siri.str.startswith('N')
# The check is case-sensitive; only "Geeks for Geeks" ends with a lowercase 's'.
siri.str.endswith('s')

0    False
1    False
2     True
3    False
4    False
dtype: bool

> `find` function

In [None]:
# s = 'Narendra'
# s.find('d')     #Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
# The Series version applies the same search to each element; -1 means "not found".
siri.str.find('Geeks')

In [None]:
# findall returns, per element, a list of every match (an empty list when there is none).
siri.str.findall('Geeks')

### Converting Series to list

> `to_list()` function is used to convert series to list.

In [7]:
# to_list() returns the Series values as a plain Python list (the index is dropped).
li = siri.to_list()
print(li)

['Narendra Reddy', 'Data Science', 'Geeks for Geeks', 'Hello World', 'Machine Learning']


# <font color='green'> Detailed coding Implementations on Pandas DataFrame </font>
___

In [16]:
# Each inner list becomes one row; with no names given the columns are labelled 0, 1.
li = [['gun', 30], ['bullet', 2], ['riffle', 40]]
pd.DataFrame(li)

Unnamed: 0,0,1
0,gun,30
1,bullet,2
2,riffle,40


In [19]:
# One row per dict: the dict keys become column labels and `index` supplies the row labels.
pd.DataFrame([{1:'a', 2:'b', 3:'c', 4:'d'}, {1:'f', 2:'g',3:'h', 4:'i'}], index=['one','two'])

Unnamed: 0,1,2,3,4
one,a,b,c,d
two,f,g,h,i


In [12]:
# Dict of equal-length lists: keys become column names, lists become the column values.
data = {'Name':['Tom', 'nick', 'krish', 'jack'], 'Age':[20, 21, 19, 18]}
pd.DataFrame(data)

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [24]:
# d = {'a': 'A', 'b': 'B'}
# pd.DataFrame(d, index = [1])
# Dict of dicts: inner keys become row labels. The two columns share only the
# label 1, so every other cell is filled with NaN (which also makes Age float).
d = {'Name':{1: ['Tom', 'mnr'], 2: 'nick', 3: 'krish', 4: 'jack'}, 'Age':{1: 20, 'b': 21, 'c': 19, 'd': 18}}
pd.DataFrame(d)

Unnamed: 0,Name,Age
1,"[Tom, mnr]",20.0
2,nick,
3,krish,
4,jack,
b,,21.0
c,,19.0
d,,18.0


In [None]:
# Dict of Series: each Series becomes a column, aligned on the shared default index 0..3.
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

In [None]:
# List of tuples: one row per tuple, with the column names supplied explicitly.
tuples = [('apple', 10), ('banana', 20), ('cherry', 30), ('date', 40), ('elderberry', 50)]
df = pd.DataFrame(tuples, columns=['Fruit', 'Quantity'])
# set_index returns a new frame; the result is only displayed, `df` is not reassigned.
df.set_index('Fruit')

### Slicing in DataFrame using `iloc` and `loc`

In [6]:
# 3x3 frame with string row labels, used by the `loc` examples that follow.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                  index=['cobra', 'viper', 'sidewinder'],
                  columns=['max_speed', 'shield', 'war'])
df

Unnamed: 0,max_speed,shield,war
cobra,1,2,3
viper,4,5,6
sidewinder,7,8,9


#### `loc` operations

In [16]:
# Selecting a single row label with `loc` returns that row as a Series.
# The f-string uses double quotes so the inner 'viper' literal also parses on
# Python versions before 3.12, which reject reusing the same quote character.
print(f"type: {type(df.loc['viper'])}")
df.loc['viper']

type: <class 'pandas.core.series.Series'>


max_speed    4
shield       5
war          6
Name: viper, dtype: int64

In [7]:
# A list of labels returns a DataFrame, with rows in the order requested.
df.loc[['viper', 'cobra']]

Unnamed: 0,max_speed,shield,war
viper,4,5,6
cobra,1,2,3


In [18]:
# Row label plus column label selects a single scalar value.
df.loc['viper', 'max_speed']

np.int64(4)

In [14]:
# Label slices include both endpoints ('cobra' and 'viper').
df.loc['cobra':'viper', 'max_speed']

cobra    1
viper    4
Name: max_speed, dtype: int64

In [20]:
# df.loc[['cobra', 'viper'] , ['max_speed', 'shield']]
# Label slices on both axes; the end labels ('viper', 'war') are included.
df.loc['cobra':'viper', 'shield':'war']

Unnamed: 0,shield,war
cobra,2,3
viper,5,6


In [10]:
df.loc[[False, True, False]]          # The length of the boolean list must equal the number of rows
# df.loc[[False, True, False], [False, True, False]]

Unnamed: 0,max_speed,shield,war
viper,4,5,6


In [28]:
# The boolean Series carries its own index labels, which match the frame's row labels.
df.loc[pd.Series([False, False, True], index=['cobra', 'viper', 'sidewinder'])]          #Alignable boolean Series

Unnamed: 0,max_speed,shield,war
sidewinder,7,8,9


In [30]:
# The comparison yields a boolean Series, which `loc` uses as a row mask.
df.loc[df['shield'] > 4]

Unnamed: 0,max_speed,shield,war
viper,4,5,6
sidewinder,7,8,9


In [36]:
# Row mask combined with a list of column labels.
df.loc[df['shield'] > 4, ['shield', 'war']]

Unnamed: 0,shield,war
viper,5,6
sidewinder,8,9


In [38]:
# A callable receives the DataFrame and returns the indexer (here a boolean mask).
df.loc[lambda df: df['shield'] == 8]

Unnamed: 0,max_speed,shield,war
sidewinder,7,8,9


Set value for all items matching the list of labels

In [40]:
# Assigning a scalar through `loc` writes it to every selected cell and modifies `df` in place.
df.loc[['cobra', 'viper'], ['war']] = 23
df

Unnamed: 0,max_speed,shield,war
cobra,1,2,23
viper,4,5,23
sidewinder,7,8,9


set value for an entire row

In [42]:
# The scalar is written to every column of the 'sidewinder' row.
df.loc['sidewinder'] = 100
df

Unnamed: 0,max_speed,shield,war
cobra,1,2,23
viper,4,5,23
sidewinder,100,100,100


set value for an entire column

In [43]:
# `:` selects all rows, so the scalar overwrites the whole 'max_speed' column.
df.loc[:, 'max_speed'] = 40
df

Unnamed: 0,max_speed,shield,war
cobra,40,2,23
viper,40,5,23
sidewinder,40,100,100


In [11]:
# Each (outer, inner) tuple becomes one two-level row label.
tuples = [
    ('cobra', 'mark i'), ('cobra', 'mark ii'),
    ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
    ('viper', 'mark ii'), ('viper', 'mark iii')
]
index = pd.MultiIndex.from_tuples(tuples)
# One [max_speed, shield] pair per index tuple, in the same order.
values = [[12, 2], [0, 4], [10, 20],
          [1, 4], [7, 1], [16, 36]]
df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)
df

Unnamed: 0,Unnamed: 1,max_speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark iii,16,36


#### `iloc` operations

> iloc is an index based selecting method which means we have to pass purely integer index to select a specific row/ column

In [3]:
# List of dicts: one row per dict, default integer index 0..2.
mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}]
df = pd.DataFrame(mydict)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [7]:
# A scalar position returns a Series; a list of positions keeps the DataFrame shape.
# df.iloc[0]         --> Series
df.iloc[[0]]      # -->DataFrame

Unnamed: 0,a,b,c,d
0,1,2,3,4


In [18]:
# df.iloc[[0,1]]
# Rows at positions 0 and 2, columns at positions 0, 1 and 3.
df.iloc[[0,2], [0, 1, 3]]

Unnamed: 0,a,b,d
0,1,2,4
2,1000,2000,4000


In [27]:
# Positional slices exclude the stop position: all rows but the last, all columns but the last two.
df.iloc[0:-1, 0:-2]

Unnamed: 0,a,b
0,1,2
1,100,200


In [19]:
# Boolean list with one flag per row; rows flagged True are kept.
df.iloc[[True, False, True]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


In [26]:
# df.iloc[lambda df: [0, 2]]             here the result is same if, iloc <-> loc
# The callable receives the DataFrame; here it returns a boolean mask of the even index values.
df.iloc[lambda x: x.index %2 == 0]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


### Slicing using conditions

In [32]:
# Rebuild the four-column numeric frame used for the condition-based selection below.
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [39]:
# Rows where 'two' > 20; columns come back in the order listed ('three' before 'two').
df.loc[df['two'] > 20, ['three', 'two']]
# df[df['two'] > 20]

Unnamed: 0,three,two
2,300,30
3,400,40


### Adding Column into DataFrame

We can add a column in many ways. Let us discuss three ways how we can add column here
- Using List
- Using Pandas Series
- Using an existing Column(we can modify that column in the way we want and that modified part can also be displayed)

In [None]:
# Assigning a list creates the column; the list has one value per row (4 here).
li = [11, 22, 33, 44]
df['five'] = li
df

In [None]:
# A Series assigned as a column is matched to df's rows by index label (0..3 on both sides here).
sr = pd.Series([111, 222,333, 444])
df['six'] = sr
df

In [None]:
# New column computed element-wise from an existing column.
df['seven'] = df['one']+10
df

### deleting Column in DataFrame

- del
- drop
- pop

In [None]:
# `del` removes the column from df in place.
del df['six']
df

In [50]:
# drop() returns a new frame by default; inplace=True modifies df itself instead.
# df.drop(['seven'], axis=1, inplace=True)
# or                                                 axis = 1 -> columns,   axis = 0 -> rows
# df.drop(columns=['seven'], inplace=True)

In [55]:
# pop() removes the column from df and returns it as a Series.
# df.pop('five')

### Adding row into DataFrame

In [63]:
data = {'Fruit': ['apple', 'banana', 'cherry'], 'Quantity': [10, 20, 30]}
df = pd.DataFrame(data)

# Wrap the new row in its own one-row DataFrame so it can be concatenated.
new_row = {'Fruit': ['date'], 'Quantity': [40]}
df_new = pd.DataFrame(new_row)

# ignore_index=True renumbers the combined rows 0..n-1.
df = pd.concat([df, df_new], ignore_index=True)
df


Unnamed: 0,Fruit,Quantity
0,apple,10
1,banana,20
2,cherry,30
3,date,40


#### Transposing a DataFrame

The pandas `DataFrame.T` attribute is used to transpose the DataFrame, i.e., to flip the rows and columns.

In [3]:
# 4x4 numeric frame reused by the transpose and attribute examples below.
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [4]:
# Column labels become row labels and vice versa; `df` itself is not reassigned.
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


### more DataFrame functionalities

- axes instance
> Return a list representing the axes of the DataFrame.

In [7]:
# axes is a two-item list: the row index first, then the column index.
print(df.axes)

[RangeIndex(start=0, stop=4, step=1), Index(['one', 'two', 'three', 'four'], dtype='object')]


- ndim
>Return an int representing the number of axes / array dimensions. Return 1 if Series. Otherwise return 2 if DataFrame.

In [8]:
# A DataFrame always has two axes (rows and columns).
df.ndim

2

- dtypes
> This returns a Series with the data type of each column.

In [10]:
# One entry per column, labelled by column name.
df.dtypes

one      int64
two      int64
three    int64
four     int64
dtype: object

- shape

In [11]:
# shape is a (number_of_rows, number_of_columns) tuple.
df.shape

(4, 4)

`head(), tail()`

In [None]:
# np.linspace returns `num` evenly spaced values with both endpoints included, giving 10 rows here.
d = {'length': np.linspace(1, 19, num = 10), 'breadth': np.linspace(100, 199, num = 10)}
df = pd.DataFrame(d)
df

In [20]:
# A negative n returns every row except the last |n|: 10 rows - 8 = the first 2 rows.
df.head(-8)

Unnamed: 0,length,breadth
0,1.0,100.0
1,3.0,111.0


In [None]:
# With no argument, tail() returns the last 5 rows.
df.tail()

- empty

In [24]:
# A DataFrame built with no data has no rows or columns, so `.empty` is True.
df_emp = pd.DataFrame()
df_emp.empty

True

### Statistical or Mathematical Functions

In [32]:
# Rebuild the 4x4 numeric frame so the statistics below start from known values.
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


- DataFrame.sum : Return the sum over the requested axis.
- DataFrame.min : Return the minimum over the requested axis.
- DataFrame.max : Return the maximum over the requested axis.
- DataFrame.idxmin : Return the index of the minimum over the requested axis.
- DataFrame.idxmax : Return the index of the maximum over the requested axis.

In [None]:
# Uncomment one line at a time. axis=0 (the default) works down each column; axis=1 works across each row.
# df.sum()
# df[['one', 'two']].sum()

# performing sum operation along rows
# df.loc[0].sum()
# df.loc[0, ['two', 'three']].sum()

# df.sum(axis=1)

# df.max(axis=0)

# df.idxmax(axis=1)

2. mean()

In [None]:
# Mean of each column (axis=0 is the default).
df.mean()

In [None]:
# mode() returns a DataFrame rather than a Series, because a column can have several modes.
df.mode()

In [None]:
# Median of each column.
df.median()

In [None]:
# Sample variance of each column (ddof=1 by default).
df.var()

In [None]:
# Sample standard deviation of each column (ddof=1 by default).
df.std()

### `describe` function

In [47]:
# Same numeric frame plus a string column 'five' to show how describe() treats non-numeric data.
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000]),
        'five'  : pd.Series(['A', 'B', 'C', 'D'])}
df = pd.DataFrame(data)
# By default describe() summarises only the numeric columns, so 'five' is omitted.
df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


> Note: quantile function is used to find percentiles of the data

In [52]:
# quantile(q) returns the value below which a fraction q of the data lies.
# The f-string uses double quotes so the inner 'one' literal also parses on
# Python versions before 3.12, which reject reusing the same quote character.
print(f"25%: {df['one'].quantile(.25)}")    # 25%
# Selecting with a list (df[['one']]) keeps a DataFrame, so the result is a Series labelled by column.
df[['one']].quantile(.75)

25%: 1.75


one    3.25
Name: 0.75, dtype: float64

In [54]:
# Excluding object columns leaves only the numeric summary.
df.describe(exclude=[object])

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


In [55]:
# For object columns describe() reports count, unique, top and freq instead of numeric statistics.
df.describe(include=[object])

Unnamed: 0,five
count,4
unique,4
top,A
freq,1


In [56]:
# describing a column by taking col as attribute from DataFrame
df.two.describe()

count     4.000000
mean     25.000000
std      12.909944
min      10.000000
25%      17.500000
50%      25.000000
75%      32.500000
max      40.000000
Name: two, dtype: float64

In [57]:
# Select a subset of columns first, then summarise only those.
df[['two', 'four']].describe()

Unnamed: 0,two,four
count,4.0,4.0
mean,25.0,2500.0
std,12.909944,1290.994449
min,10.0,1000.0
25%,17.5,1750.0
50%,25.0,2500.0
75%,32.5,3250.0
max,40.0,4000.0
