In [49]:
#PANDAS - Python for Data Analysis

In [50]:
import numpy as np
import pandas as pd

In [51]:
# A Series built from a plain list; np.nan marks the missing entry.
s = pd.Series(data=[2, 4, 5, np.nan, 6, 8])

In [52]:
# Display the Series: the missing value is shown as NaN and the dtype is float64.
s

0    2.0
1    4.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [53]:
# Six consecutive calendar days starting 2021-02-01, returned as a
# DatetimeIndex that can serve as the row labels of a DataFrame.
dates = pd.date_range(start="20210201", periods=6)
dates


DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
               '2021-02-05', '2021-02-06'],
              dtype='datetime64[ns]', freq='D')

In [54]:
# Fill a 2-D NumPy array with draws from the standard normal distribution
# (np.random.randn) and wrap it in a DataFrame: rows are labelled by `dates`,
# columns by the strings '1'..'4'.
# The row count is taken from len(dates) so the array shape always matches the
# index; a mismatch makes the DataFrame constructor raise ValueError.
# No seed is set here, so the values differ on every run.
df = pd.DataFrame(np.random.randn(len(dates), 4), index=dates, columns=list('1234'))

In [55]:
# Display the whole frame: one row per date, one column per label '1'..'4'.
df

Unnamed: 0,1,2,3,4
2021-02-01,0.190421,-0.546938,0.773508,-0.364814
2021-02-02,-1.652731,-0.281049,-0.356523,-0.942504
2021-02-03,0.603306,-1.683981,-1.965389,-0.066433
2021-02-04,-0.583965,-0.172978,-0.066462,0.018143
2021-02-05,0.311781,-0.348139,0.656974,0.271699
2021-02-06,0.828362,-1.088297,-1.318383,0.250793


In [56]:
#Example 2
#Creating a DataFrame by passing a dict of objects that can each be converted to a Series-like structure.

In [57]:
# Build a DataFrame column by column from heterogeneous objects.
# Scalars ("A", "B", "F") are repeated on every row; the array-like entries
# ("C", "D", "E") each supply four values and so fix the number of rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

In [58]:
# Display the frame: the scalar inputs (A, B, F) are repeated on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [59]:
# Inspect the dtype of every column; each column keeps its own type
# (float, datetime, int, category, object) rather than sharing one.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [60]:
# Select a single column by its label; the result is a Series that keeps
# the frame's date index.
df.loc[:, '1']

2021-02-01    0.190421
2021-02-02   -1.652731
2021-02-03    0.603306
2021-02-04   -0.583965
2021-02-05    0.311781
2021-02-06    0.828362
Freq: D, Name: 1, dtype: float64

In [61]:
# Positional row slice: the first two rows (the stop position is excluded).
df.iloc[0:2]

Unnamed: 0,1,2,3,4
2021-02-01,0.190421,-0.546938,0.773508,-0.364814
2021-02-02,-1.652731,-0.281049,-0.356523,-0.942504


In [62]:
# Label slice on the date index; unlike a positional slice, both endpoints
# are included in the result.
df.loc["20210202":"20210204"]

Unnamed: 0,1,2,3,4
2021-02-02,-1.652731,-0.281049,-0.356523,-0.942504
2021-02-03,0.603306,-1.683981,-1.965389,-0.066433
2021-02-04,-0.583965,-0.172978,-0.066462,0.018143


In [63]:
#Data Structures in PANDAS 
#Series
'''Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. The basic method to create a Series is to call:'''

'Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. The basic method to create a Series is to call:'

In [64]:
# s = pd.Series(data, index=index)
#Here data can be: a scalar value (like 5), an ndarray, or a Python dict

In [65]:
# From a scalar: with no index supplied the result is a one-element Series
# whose only label is 0.
s1 = pd.Series(data=5)
s1

0    5
dtype: int64

In [66]:
# From an ndarray: an explicit index must have exactly as many labels as there
# are values; when no index is given, pandas numbers them 0 .. len(data) - 1.
s2 = pd.Series(np.random.randn(5), index=list("abcde"))
s2

a   -0.618348
b   -0.189350
c   -0.949201
d   -1.909465
e    0.647933
dtype: float64

In [67]:
# The index holds the labels ('a'..'e') attached to the values of s2.
s2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [68]:
# A small dict (deliberately not in alphabetical key order) to build a Series from.
dict1 = dict(b=1, a=0, c=2)
dict1

{'b': 1, 'a': 0, 'c': 2}

In [69]:
# Building from a dict: the keys become the index, kept in the dict's own
# order (b, a, c) rather than being sorted.
pd.Series(data=dict1)

b    1
a    0
c    2
dtype: int64

In [70]:
# An explicit index pulls the dict values out in that order; a label with no
# matching key ('d') gets NaN, which also turns the dtype into float64.
pd.Series(dict1, index=list("bcda"))

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [71]:
#DataFrames 
#Creating and Viewing the Data 

In [72]:
# Random values in [0, 10) with one row per entry in `dates` and columns A-D.
# The number of rows must equal len(dates): passing a different row count
# (e.g. 10 rows for the 6-entry index) makes the constructor raise
# "ValueError: Shape of passed values is ..., indices imply ...".
df3 = pd.DataFrame(np.random.rand(len(dates), 4) * 10, index=dates, columns=list('ABCD'))

In [40]:
# Display df3: a table of random values, one row per date and columns A-D.
df3

Unnamed: 0,A,B,C,D
2021-02-01,9.065516,0.166762,1.559629,4.752614
2021-02-02,5.265075,0.502876,1.678119,6.734184
2021-02-03,5.348717,5.636934,3.921565,4.462398
2021-02-04,1.60719,4.812389,0.147881,0.216569
2021-02-05,3.292918,0.523858,8.121086,9.365255
2021-02-06,0.353386,5.316543,0.355391,1.627293


In [43]:
# head() and tail() return the first / last rows of a frame -- five unless
# another count is passed. Here: the last five rows.
df3.tail(n=5)

Unnamed: 0,A,B,C,D
2021-02-02,5.265075,0.502876,1.678119,6.734184
2021-02-03,5.348717,5.636934,3.921565,4.462398
2021-02-04,1.60719,4.812389,0.147881,0.216569
2021-02-05,3.292918,0.523858,8.121086,9.365255
2021-02-06,0.353386,5.316543,0.355391,1.627293


In [44]:
# Display the row labels (the index) of df3 -- here a DatetimeIndex.
df3.index

DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
               '2021-02-05', '2021-02-06'],
              dtype='datetime64[ns]', freq='D')

In [45]:
# Display the column labels of df3.
df3.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [73]:
# Get summary statistics (count, mean, std, min, quartiles, max) for the data.
# describe is a method and has to be *called*: without the parentheses the
# cell only echoes the bound-method repr instead of computing anything.
df2.describe()

In [74]:
# Transpose the frame: rows become columns and columns become rows.
df.transpose()

Unnamed: 0,2021-02-01 00:00:00,2021-02-02 00:00:00,2021-02-03 00:00:00,2021-02-04 00:00:00,2021-02-05 00:00:00,2021-02-06 00:00:00
1,0.190421,-1.652731,0.603306,-0.583965,0.311781,0.828362
2,-0.546938,-0.281049,-1.683981,-0.172978,-0.348139,-1.088297
3,0.773508,-0.356523,-1.965389,-0.066462,0.656974,-1.318383
4,-0.364814,-0.942504,-0.066433,0.018143,0.271699,0.250793
