pandas

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Series: array like object and associated data labels
obj = pd.Series([9,-2,4,0])
obj

0    9
1   -2
2    4
3    0
dtype: int64

In [3]:
obj.array

<NumpyExtensionArray>
[9, -2, 4, 0]
Length: 4, dtype: int64

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# index as labels
obj2 = pd.Series([4,7,-1,3], index=["q","p","o","l"])
obj2

q    4
p    7
o   -1
l    3
dtype: int64

In [6]:
obj2["l"] # Acess

3

In [7]:
obj2[["l","q"]]

l    3
q    4
dtype: int64

In [8]:
#  NumPy-like operations, such as filtering with a Boolean array, scalar multiplication,
#  or applying math functions, will preserve the index-value
obj[obj>2]

0    9
2    4
dtype: int64

In [9]:
obj * obj

0    81
1     4
2    16
3     0
dtype: int64

In [10]:
np.exp(obj2)

q      54.598150
p    1096.633158
o       0.367879
l      20.085537
dtype: float64

In [11]:
# series as fixed length dict
"q" in obj2

True

In [12]:
dic = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj1 = pd.Series(dic)
obj

0    9
1   -2
2    4
3    0
dtype: int64

In [13]:
# Series can be converted back to dict
obj1.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [14]:
# To describe the certain order of indices
index_order = ["California","Texas","Ohio","Oregon"]
obj = pd.Series(dic,index=index_order)
obj

California        NaN
Texas         71000.0
Ohio          35000.0
Oregon        16000.0
dtype: float64

In [15]:
# To detect null values
obj.isna()

California     True
Texas         False
Ohio          False
Oregon        False
dtype: bool

In [16]:
pd.notna(obj)

California    False
Texas          True
Ohio           True
Oregon         True
dtype: bool

In [17]:
obj + obj1 # Series automatically alligns by index values

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [18]:
# We can assign name to the Series object and index as well
obj.name = "poplulation"
obj.index.name = "states"
obj

states
California        NaN
Texas         71000.0
Ohio          35000.0
Oregon        16000.0
Name: poplulation, dtype: float64

In [19]:
obj = pd.Series([1,2,-5,3,-9])
obj

0    1
1    2
2   -5
3    3
4   -9
dtype: int64

In [20]:
obj.index = ["non","q","d","k",3]
obj

non    1
q      2
d     -5
k      3
3     -9
dtype: int64

DataFrame: A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type (numeric, string, Boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dictionary of Series all sharing the same index.

In [21]:
# One way of creating database: from a dictionary of equal-length lists or Numpy array 
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
data = pd.DataFrame(data)
data

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [22]:
# To select top five rows
data.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [23]:
# To select bottom five rows
data.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [24]:
# Specifying the sequence of columns for DataFrame
pd.DataFrame(data, columns=["pop","year","state"])

Unnamed: 0,pop,year,state
0,1.5,2000,Ohio
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [25]:
# If you pass a column that isn't in the dictionary, it will appear with missing values "NaN"
frame2 = pd.DataFrame(data, columns=["pop","year","state","extra"])
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,
5,3.2,2003,Nevada,


In [26]:
frame2.columns

Index(['pop', 'year', 'state', 'extra'], dtype='object')

In [27]:
# Columns can we accessed in two ways: index based approach and dot approach
frame2["year"]

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [28]:
frame2.state # works only when column name doesn't conflict with any method name and doesn't contains whitspace or symbols other than undersore

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [29]:
# Column's values can be changed by assignment
frame2["extra"] = 7.7
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,2000,Ohio,7.7
1,1.7,2001,Ohio,7.7
2,3.6,2002,Ohio,7.7
3,2.4,2001,Nevada,7.7
4,2.9,2002,Nevada,7.7
5,3.2,2003,Nevada,7.7


In [30]:
frame2["extra"] = np.arange(6.)
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,2000,Ohio,0.0
1,1.7,2001,Ohio,1.0
2,3.6,2002,Ohio,2.0
3,2.4,2001,Nevada,3.0
4,2.9,2002,Nevada,4.0
5,3.2,2003,Nevada,5.0


In [31]:
# When assigning list or arrays to the columns, the values's length must match the length of DataFrame

# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, 
# inserting missing values in any index values not present
val = pd.Series([1,2,3], index = [2,4,5])
frame2["extra"] = val
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,1.0
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,2.0
5,3.2,2003,Nevada,3.0


In [32]:
val = np.array([1,2,3,4,5,6])
frame2["array"] = val  # New columns can be created using index approach but not dot approach
frame2

Unnamed: 0,pop,year,state,extra,array
0,1.5,2000,Ohio,,1
1,1.7,2001,Ohio,,2
2,3.6,2002,Ohio,1.0,3
3,2.4,2001,Nevada,,4
4,2.9,2002,Nevada,2.0,5
5,3.2,2003,Nevada,3.0,6


In [33]:
# to delete
del frame2["array"]
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,1.0
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,2.0
5,3.2,2003,Nevada,3.0


In [34]:
# DataFrame access using indexing is a view(with refernce), changes in it will be reflected to original
view_col = frame2["year"]
view_col[0] = 20000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view_col[0] = 20000


In [35]:
frame2

Unnamed: 0,pop,year,state,extra
0,1.5,20000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,1.0
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,2.0
5,3.2,2003,Nevada,3.0


In [38]:
# Another way of creating DataFrame: nested dictionary. It consider outer keys-> columns, inner keys-> row indices
nested_dic = {"Delhi":{2000: 1.5, 2001: 1.7, 2002: 3.6},
              "MP":{2001: 2.4, 2002: 2.9}}
frame3 = pd.DataFrame(nested_dic)
frame3

Unnamed: 0,Delhi,MP
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [39]:
frame3.T
# Note that transposing discards the column data types if the columns do not all have the same data type, so transposing and then transposing back may lose the previous type information. The columns become arrays of pure Python objects in this case.

Unnamed: 0,2000,2001,2002
Delhi,1.5,1.7,3.6
MP,,2.4,2.9


In [40]:
# if index explicitly mentioned then inner keys do not become indices
pd.DataFrame(nested_dic,index=[2001,2002,2000])

Unnamed: 0,Delhi,MP
2001,1.7,2.4
2002,3.6,2.9
2000,1.5,


In [42]:
# We can give names to index wala column and columns wale row ko
frame3.index.name = "years"
frame3.columns.name = "State"
frame3

State,Delhi,MP
years,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [43]:
frame2.to_numpy() #returns values of all the columns in "2d array" format

array([[1.5, 20000, 'Ohio', nan],
       [1.7, 2001, 'Ohio', nan],
       [3.6, 2002, 'Ohio', 1.0],
       [2.4, 2001, 'Nevada', nan],
       [2.9, 2002, 'Nevada', 2.0],
       [3.2, 2003, 'Nevada', 3.0]], dtype=object)

Index Objects

In [53]:
df = pd.DataFrame(np.arange(4),index=["a","b","c","d"])
store_index = df.index
store_index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [50]:
store_index[0] = "q" # index are immutable, this property makes it safer to share indices among other dataframes

TypeError: Index does not support mutable operations

In [60]:
# dataframe can contain duplicate labels
indices = pd.Index(["poo","kaa","poo","kaa"])
indices

Index(['poo', 'kaa', 'poo', 'kaa'], dtype='object')

In [61]:
indices.append(indices)

Index(['poo', 'kaa', 'poo', 'kaa', 'poo', 'kaa', 'poo', 'kaa'], dtype='object')