In [3]:
#PANDAS - Python for Data Analysis

In [4]:
import numpy as np
import pandas as pd

In [5]:
# A Series built from a plain Python list; np.nan marks a missing value.
s = pd.Series(data=[2, 4, 5, np.nan, 6, 8])

In [6]:
# Show the Series: a default integer index (0-5) was generated, and the NaN
# entry makes the dtype float64 even though the other values were given as ints.
s

0    2.0
1    4.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# Row labels for the DataFrame built in the next cell: a DatetimeIndex of six
# consecutive calendar days starting on 2021-02-01.
dates = pd.date_range(start="20210201", periods=6)
dates


DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
               '2021-02-05', '2021-02-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Build a 6x4 NumPy array of standard-normal samples and store it in a
# DataFrame, using `dates` as the row index and the strings '1'..'4' as the
# column labels.
# A seeded Generator replaces the unseeded np.random.randn call so that the
# same values are produced on every Restart & Run All.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 4)),
    index=dates,
    columns=list('1234'),
)

In [9]:
# Show the 6x4 frame: rows are labelled by `dates`, columns by '1'..'4'.
df

Unnamed: 0,1,2,3,4
2021-02-01,1.131753,-1.610166,-0.751632,-0.806864
2021-02-02,-0.58747,0.271732,0.243205,1.926345
2021-02-03,-1.0467,-1.036062,-1.55553,-1.056486
2021-02-04,-0.414037,1.14501,0.358714,1.137825
2021-02-05,0.66041,0.638407,-0.009714,0.878291
2021-02-06,-0.592546,-0.123236,-0.699158,0.547969


In [10]:
#Example 2
# Creating a DataFrame by passing a dict of objects, each of which can be converted to a Series-like column.

In [11]:
# Each dict entry becomes one column. The scalar entries ("A", "B", "F") are
# broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    }
)

In [12]:
# Show df2: the scalar entries (A, B, F) were repeated on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [13]:
# Check the dtype of every column: unlike a NumPy array, each DataFrame column
# carries its own dtype (float, datetime, category, object, ... below).
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
# Select a single column by its label; the result is a Series that keeps the
# frame's date index.
df["1"]

2021-02-01    1.131753
2021-02-02   -0.587470
2021-02-03   -1.046700
2021-02-04   -0.414037
2021-02-05    0.660410
2021-02-06   -0.592546
Freq: D, Name: 1, dtype: float64

In [15]:
# Slice rows by integer position. .iloc makes it explicit that 0:2 means the
# first two rows (end-exclusive) instead of relying on how bare df[...]
# interprets a slice.
df.iloc[0:2]

Unnamed: 0,1,2,3,4
2021-02-01,1.131753,-1.610166,-0.751632,-0.806864
2021-02-02,-0.58747,0.271732,0.243205,1.926345


In [16]:
# Slice rows by index label. With .loc both endpoints of a label slice are
# included, so this returns 2021-02-02 through 2021-02-04.
df.loc["20210202":"20210204"]

Unnamed: 0,1,2,3,4
2021-02-02,-0.58747,0.271732,0.243205,1.926345
2021-02-03,-1.0467,-1.036062,-1.55553,-1.056486
2021-02-04,-0.414037,1.14501,0.358714,1.137825


In [17]:
# Data Structures in pandas
# Series
# Series is a one-dimensional labeled array capable of holding any data type
# (integers, strings, floating point numbers, Python objects, etc.). The axis
# labels are collectively referred to as the index. The basic method to create
# a Series is to call:

In [18]:
# s = pd.Series(data, index=index)
# Here, data can be: a scalar value (like 5), an ndarray, or a Python dict.

In [19]:
# From a scalar: with no index given, the value gets a single default label (0).
s1 = pd.Series(data=5)
s1

0    5
dtype: int64

In [20]:
# From ndarray
# If data is an ndarray, index must be the same length as data. If no index is
# passed, one will be created having values [0, ..., len(data) - 1].
# The samples come from a seeded Generator (instead of the unseeded
# np.random.randn) so the Series is identical on every run.
s2 = pd.Series(
    np.random.default_rng(seed=7).standard_normal(5),
    index=["a", "b", "c", "d", "e"],
)
s2

a   -1.131421
b   -0.829516
c    0.383034
d    1.282467
e   -0.546773
dtype: float64

In [21]:
# The index holds the labels that were passed in when s2 was built.
s2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [22]:
# A plain Python dict to feed into pd.Series; note that its keys are not in
# alphabetical order.
dict1 = {
    "b": 1,
    "a": 0,
    "c": 2,
}
dict1

{'b': 1, 'a': 0, 'c': 2}

In [26]:
# The dict keys become the index labels, in the dict's own order (b, a, c)
# rather than sorted.
pd.Series(data=dict1)

b    1
a    0
c    2
dtype: int64

In [27]:
# An explicit index selects and orders the entries: each label is looked up in
# the dict, and 'd' (not a key) yields NaN, which also turns the dtype into
# float64.
pd.Series(data=dict1, index=["b", "c", "d", "a"])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64