# **Module**: Python Installation and Basics
## **Lecture**: Scientific Computing Package - Pandas

> ## Pandas introduction

In [18]:
# create a series (1D structure)
import pandas as pd
import numpy as np

# a plain Python list works as the data source for a series ...
data = [10, 8, 6]
s = pd.Series(data)

# ... and so does a 1D NumPy array
data2 = np.array([1, 1, 1])
s2 = pd.Series(data2)

# show both series, separated by blank lines
print('series s:\n', s)
print('\n')
print('series s2:\n', s2)

series s:
 0    10
1     8
2     6
dtype: int64


series s2:
 0    1
1    1
2    1
dtype: int32


In [19]:
# inspect the dimensions of series s
item_count = s.size    # total number of elements
series_shape = s.shape # number of items along each dimension
print('Number of items in series s: ', item_count)
print('Shape of series s: ', series_shape)

Number of items in series s:  3
Shape of series s:  (3,)


In [20]:
# create a dataframe (2D structure) from a list of rows
data = [[1,10],[1,8],[1,6]]
df = pd.DataFrame(data, columns=['id', 'value']) # can pass a 2D Numpy array as well
print('dataframe df:\n', df)

# dataframe from multiple series: each dict key becomes a column name
df2 = pd.DataFrame({'id':s2, 'value':s}) # can pass 1D Numpy arrays as well in place of series
print('\n')
# display df2 (not df) so the series-based construction is what gets shown;
# it holds the same values as df above
print('dataframe df2:\n', df2)

dataframe df:
    id  value
0   1     10
1   1      8
2   1      6


dataframe df:
    id  value
0   1     10
1   1      8
2   1      6


In [21]:
# inspect the dimensions of dataframe df
cell_count = df.size   # total number of cells
frame_shape = df.shape # number of items along each dimension
print('Number of items in dataframe df: ', cell_count)
print('Shape of dataframe df: ', frame_shape)

Number of items in dataframe df:  6
Shape of dataframe df:  (3, 2)


> ## Pandas to NumPy

In [22]:
# Series to 1D NumPy array
# (.to_numpy() is the accessor the pandas documentation recommends over .values)
arr_1D = s.to_numpy()
print('arr_1D: \n', arr_1D)

# Dataframe to 2D NumPy array
arr_2D = df.to_numpy()
print('arr_2D: \n', arr_2D)

arr_1D: 
 [10  8  6]
arr_2D: 
 [[ 1 10]
 [ 1  8]
 [ 1  6]]


> ## Data Access

In [23]:
# dataframe with explicit row labels (101-103) instead of the default 0..n-1 index
df = pd.DataFrame(
    data=np.array([[1, 10], [1, 8], [1, 6]]),
    index=[101, 102, 103],
    columns=['id', 'value'],
)
print('dataframe df:\n', df)

dataframe df:
      id  value
101   1     10
102   1      8
103   1      6


In [24]:
# individual item selection
item_by_label = df.loc[102, 'value']  # label-based: row label 102, column 'value'
item_by_position = df.iloc[1, 1]      # position-based: second row, second column -> same item
print(item_by_label)
print(item_by_position)

8
8


In [25]:
# column(s) selection
id_by_key = df['id']      # a single column name returns that column as a series
id_by_attr = df.id        # attribute access returns the same series
id_as_frame = df[['id']]  # a list of column names returns a dataframe
print(id_by_key)
print(id_by_attr)
print(id_as_frame)

101    1
102    1
103    1
Name: id, dtype: int32
101    1
102    1
103    1
Name: id, dtype: int32
     id
101   1
102   1
103   1


In [26]:
# row(s) selection
# single row -> series
row_by_label = df.loc[101]       # row labelled 101 (the first row); a list of labels selects multiple rows
row_by_position = df.iloc[0]     # integer location-based selection; same row as above

# multiple rows -> dataframe
rows_by_label = df.loc[[101, 102]]
rows_by_position = df.iloc[0:2]  # first two rows; same result as above

print(row_by_label)
print('\n')
print(row_by_position)

print('\n')
print(rows_by_label)
print('\n')
print(rows_by_position)

id        1
value    10
Name: 101, dtype: int32


id        1
value    10
Name: 101, dtype: int32


     id  value
101   1     10
102   1      8


     id  value
101   1     10
102   1      8


> ## Data Slicing

In [27]:
# 4x4 example table used for the slicing demonstration
df = pd.DataFrame(
    data=np.array([
        [1, 10, 3, 4],
        [1, 8, 2, 6],
        [1, 6, 0, 0],
        [0, 0, 3, 1],
    ]),
    columns=['var1', 'var2', 'var3', 'var4'],
)
df

Unnamed: 0,var1,var2,var3,var4
0,1,10,3,4
1,1,8,2,6
2,1,6,0,0
3,0,0,3,1


In [28]:
# Select var2 and var3 data from rows 1 to 3 -- three equivalent spellings
by_position_list = df.iloc[1:, [1, 2]]       # integer positions, columns given as a list
by_position_slice = df.iloc[1:, 1:3]         # integer positions, columns given as a slice
by_label = df.loc[1:, ['var2', 'var3']]      # row labels and column names

for selection in (by_position_list, by_position_slice, by_label):
    print(selection)

   var2  var3
1     8     2
2     6     0
3     0     3
   var2  var3
1     8     2
2     6     0
3     0     3
   var2  var3
1     8     2
2     6     0
3     0     3


> ## Data Manipulation

In [29]:
# example measurements: the first column is the trial number, the rest are variables
df = pd.DataFrame(
    data=np.array([
        [1, 1, 10, 3, 4],
        [1, 1, 8, 2, 6],
        [1, 1, 6, 0, 0],
        [2, 0, 0, 3, 1],
        [2, 3, 3, 3, 0],
        [2, 2, 6, 1, 0],
    ]),
    columns=['trial #', 'var1', 'var2', 'var3', 'var4'],
)
df

Unnamed: 0,trial #,var1,var2,var3,var4
0,1,1,10,3,4
1,1,1,8,2,6
2,1,1,6,0,0
3,2,0,0,3,1
4,2,3,3,3,0
5,2,2,6,1,0


In [30]:
# filtering: Keep data only from trial # 2
is_trial_2 = df['trial #'] == 2  # boolean mask, True for rows of trial 2
df_filtered = df[is_trial_2]
print(df_filtered)

   trial #  var1  var2  var3  var4
3        2     0     0     3     1
4        2     3     3     3     0
5        2     2     6     1     0


In [36]:
# Aggregation: Find mean of each variable across all trials
meanValues = df.iloc[:,1:].mean() # mean of all columns except the first one ('trial #')
print('\n')
print(meanValues)

# same as above, selecting the variable columns by name;
# var4 is included so this matches the positional selection
meanValues = df[['var1', 'var2', 'var3', 'var4']].mean()
print(meanValues)

# the result is a series indexed by column name, so individual means can be looked up by name
print('\n')
print(meanValues['var1'])



var1    1.333333
var2    5.500000
var3    2.000000
var4    1.833333
dtype: float64


1.3333333333333333


In [32]:
# Grouping: Find trial-wise mean of each variable
rows_by_trial = df.groupby('trial #')      # one group per distinct trial number
meanValues_byTrial = rows_by_trial.mean()  # result is indexed by trial number
print(meanValues_byTrial)

print('\n')
trial1_var2_mean = meanValues_byTrial.loc[1, 'var2']  # mean of var2 within trial 1
print(trial1_var2_mean)

             var1  var2      var3      var4
trial #                                    
1        1.000000   8.0  1.666667  3.333333
2        1.666667   3.0  2.333333  0.333333


8.0


> ## File I/O and Data Summary

In [33]:
# load the workbook into a dataframe and preview its first rows
excel_path = 'excelDataFile.xlsx'
excelData = pd.read_excel(excel_path)
excelData.head()

Unnamed: 0,Sr. No.,Category,Value
0,1,low,0.1
1,2,medium,0.6
2,3,medium,0.6
3,4,low,0.15
4,5,high,0.9


In [34]:
# summary statistics for every column; include='all' covers non-numeric columns too
data_summary = excelData.describe(include='all')
data_summary

Unnamed: 0,Sr. No.,Category,Value
count,5.0,5,5.0
unique,,3,
top,,low,
freq,,2,
mean,3.0,,0.47
std,1.581139,,0.338378
min,1.0,,0.1
25%,2.0,,0.15
50%,3.0,,0.6
75%,4.0,,0.6
