#Pandas

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.DataFrame(np.arange(0,20).reshape(5,4), index=['Row1','Row2','Row3','Row4','Row5'], columns=['Col1','Col2','Col3','Col4'])

In [4]:
df.head()

Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [5]:
df.to_csv('FirstDataFrame.csv')

In [6]:
# To access the elements
# 1. .loc : only columns show krta h. input me sirf row leta h. FOR DATA SERIES
# 2. .iloc : both columns and row. FOR DATAFRAMES
# DATA SERIES is either 1 row and multiple columns OR vice versa

df.loc['Row1']

Col1    0
Col2    1
Col3    2
Col4    3
Name: Row1, dtype: int32

In [7]:
type(df.loc['Row1'])

pandas.core.series.Series

In [8]:
df.iloc[:,:]

Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [9]:
type(df.iloc[:,:])

pandas.core.frame.DataFrame

In [10]:
# To convert DataFrames into array
df.iloc[:,:].values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [12]:
# To find any null values
# it returns boolean values
df.isnull()

Unnamed: 0,Col1,Col2,Col3,Col4
Row1,False,False,False,False
Row2,False,False,False,False
Row3,False,False,False,False
Row4,False,False,False,False
Row5,False,False,False,False


In [13]:
# To count how many null values present we can simply use sum
df.isnull().sum()

Col1    0
Col2    0
Col3    0
Col4    0
dtype: int64

In [14]:
# Works only for columns
df['Col3'].value_counts()

Col3
2     1
6     1
10    1
14    1
18    1
Name: count, dtype: int64

In [15]:
# Gives unique values
df['Col1'].unique()

array([ 0,  4,  8, 12, 16])

In [16]:
# To retrieve
df['Col1'] 

Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: Col1, dtype: int32

In [17]:
# To retrieve multiple column
df[['Col1','Col2','Col3']]

Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,4,5,6
Row3,8,9,10
Row4,12,13,14
Row5,16,17,18


In [20]:
# io library allows us to use input output data
from io import StringIO, BytesIO

In [21]:
data = ('col1,col2,col3\n'
        'x,y,1\n'
        'a,b,2\n'
        'c,d,3')

In [22]:
type(data)

str

In [27]:
pd.read_csv(StringIO(data),sep = ',')
# OR
# pd.read_csv(StringIO(data),sep = ',')
# OR
# pd.read_csv(StringIO(data),sep = ':') This will give different result

Unnamed: 0,"col1,col2,col3"
0,"x,y,1"
1,"a,b,2"
2,"c,d,3"


In [28]:
pd.read_csv(StringIO(data),usecols=['col1','col3'])

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [29]:
data = ('a,b,c\n'
        '1,2,3\n'
        '4,5,6\n'
        '7,8,9\n')

In [30]:
#creating table with particular data type we use dtype
df = pd.read_csv(StringIO(data),dtype=object)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [31]:
type(df['a'][1])

str

In [32]:
df = pd.read_csv(StringIO(data),dtype={'a':int,'b':float})
df

Unnamed: 0,a,b,c
0,1,2.0,3
1,4,5.0,6
2,7,8.0,9


In [34]:
# To change the indexing acc to you from default 0,1,2
df = pd.read_csv(StringIO(data),index_col=0)
df

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,3
4,5,6
7,8,9


In [36]:
df = pd.read_csv(StringIO(data),index_col=False)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [38]:
df = pd.read_csv(StringIO(data),usecols=['a','b'],index_col=1)
df

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
2,1
5,4
8,7


In [39]:
# Quoting and Escape characters, Useful in NLP
pd.read_csv(StringIO(data),escapechar=',')

Unnamed: 0,abc
0,123
1,456
2,789


In [None]:
# \t means tab. Removing tab from a file with url = URL
pd.read_csv("URL",sep='\t')

JSON files
Excel files
Pickling

All pandas object are equipped with to_pickle methods which use Python's cPickle module to save data structures to disk using the pickle format. Used for large data set, and thus to avoid preprocesing

In [77]:
df = pd.read_csv(StringIO(data))
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [80]:
df1 = df.to_pickle()

TypeError: string argument expected, got 'bytes'