In [1]:
import pandas as pd
import numpy as np
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Datasets
mtcars = pd.read_csv('datasets/mtcars.csv')
salaries = pd.read_csv('datasets/Salaries.csv')
beml = pd.read_csv('datasets/BEML.csv')
glaxo = pd.read_csv('datasets/GLAXO.csv')
titanic = pd.read_csv('datasets/train.csv')

In [10]:
#----------pandas Data Structures---------
#Series
#DataFrame

# The Series, as you will see, constitutes the data structure designed to accommodate a sequence of 
# one-dimensional data, while the DataFrame, a more complex data structure, is designed to contain cases 
# with several dimensions.

In [18]:
#-----------Series--------------
# The Series is the object of the pandas library designed to represent one-dimensional data structures
# s = pd.Series([4,-5.6,8,23])
s = mtcars.disp
s
# As you can see from the output of the Series, on the left there are the values in the Index, which is a 
# series of labels, and on the right the corresponding values.

0     160.0
1     160.0
2     108.0
3     258.0
4     360.0
5     225.0
6     360.0
7     146.7
8     140.8
9     167.6
10    167.6
11    275.8
12    275.8
13    275.8
14    472.0
15    460.0
16    440.0
17     78.7
18     75.7
19     71.1
20    120.1
21    318.0
22    304.0
23    350.0
24    400.0
25     79.0
26    120.3
27     95.1
28    351.0
29    145.0
30    301.0
31    121.0
Name: disp, dtype: float64

In [23]:
s = pd.Series([4,-5.6,8,23],index=['a','b','c','d'])
s['a']
# If you want to individually see the two arrays that make up this data structure you can call the two 
# attributes of the Series as follows: index and values.
s.values
s.index
s['a':'c']

a    4.0
b   -5.6
c    8.0
dtype: float64

In [24]:
#Defining Series from NumPy Arrays and Other Series
#When doing this, however, you should always keep in mind that the values contained within the 
# NumPy array or the original Series are not copied, but are passed by reference.
arr = np.array([1,2,7,9])
s2 = pd.Series(arr)
s2

0    1
1    2
2    7
3    9
dtype: int32

In [30]:
#--------Filtering Values------
s = pd.Series([random.randint(1,20) for i in range(5)])
s[s < 10]

2    7
dtype: int64

In [34]:
#------Operations and Mathematical Functions--------
#(+,-,*,/)
df = pd.DataFrame(s)
df["+2"] = df[0] + 2
df["-2"] = df[0] - 2
df["*2"] = df[0] * 2
df["/2"] = df[0] / 2
df

Unnamed: 0,0,+2,-2,*2,/2
0,13,15,11,26,6.5
1,18,20,16,36,9.0
2,7,9,5,14,3.5
3,19,21,17,38,9.5
4,15,17,13,30,7.5


In [37]:
#---------------------------Evaluating Values----------------
serd = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])
serd
# To know all the values contained within the Series excluding duplicates, you can use the unique() function. 
# The return value is an array containing the unique values in the Series, though not necessarily in order.
serd.unique()

# A function similar to unique() is the value_counts() function, which not only returns the unique values 
# but calculates occurrences within a Series.
serd.value_counts()

1    2
2    2
0    1
3    1
dtype: int64

In [62]:
# Finally, isin() is a function that evaluates the membership, that is, given a list of values, this function 
# lets you know if these values are contained within the data structure. Boolean values that are returned can be 
# very useful during the filtering of data within a series or in a column of a DataFrame.
# titanic[titanic.Age.isin([22])]
serd[serd.isin([0,3])]

white     0
yellow    3
dtype: int64

In [50]:
#--------------NaN Values
# This specific value NaN (Not a Number) is used within pandas data structures to indicate the 
# presence of an empty field or not definable numerically.

s = pd.Series([4,-5,6,-8])
np.log(s)

0    1.386294
1         NaN
2    1.791759
3         NaN
dtype: float64

In [53]:
# pandas allows to explicitly define and add this value in a data structure, such as Series.
#np.NaN
s = pd.Series([1,5,7,np.NaN,9,np.NaN])

# The isnull() and notnull() functions are very useful to identify the indexes without a value.
s.isnull()
s.notnull()


0     True
1     True
2     True
3    False
4     True
5    False
dtype: bool

In [57]:
#--------------Series as Dictionaries----------
#An alternative way to see a Series is to think of them as an object dict (dictionary).
dict = {"Avinash":31,"Vikas":28,"Manish":25}
s = pd.Series(dict)
s
#You can also define the array indexes separately.
names = ["Avinash","Sunita","Vikas","Manish","RSRai"]
s = pd.Series(dict,index=names)
s

Avinash    31.0
Sunita      NaN
Vikas      28.0
Manish     25.0
RSRai       NaN
dtype: float64

In [60]:
#----------------Operations between Series
#to perform arithmetic operations between Series and scalar values. The same thing is 
# possible by performing operations between two Series, but in this case even the labels come into play.
mydict1 = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
mydict2 = {'red':400,'yellow':1000,'black':700}
s1 = pd.Series(mydict1)
s2 = pd.Series(mydict2)
s1 + s2
# You get a new object Series in which only the items with the same label are added. All other label 
# present in one of the two series are still added to the result but have a NaN value.

black        NaN
blue         NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64

In [63]:
#-----------------DataFrame---------------------------
# The DataFrame is a tabular data structure very similar to the Spreadsheet (the most familiar are Excel spreadsheets).
# Unlike Series, which had an Index array containing labels associated with each element, in the case 
# of the data frame, there are two index arrays. The first, associated with the lines, has very similar functions 
# to the index array in Series. In fact, each label is associated with all the values in the row. The second array 
# instead contains a series of labels, each associated with a particular column.

In [66]:
#-------Defining a DataFrame
data = {'color' : ['blue','green','yellow','red','white'],
        'object' : ['ball','pen','pencil','paper','mug'],
        'price' : [1.2,1.0,0.6,0.9,1.7]}
df = pd.DataFrame(data,index=[5,6,7,8,9])
df

Unnamed: 0,color,object,price
5,blue,ball,1.2
6,green,pen,1.0
7,yellow,pencil,0.6
8,red,paper,0.9
9,white,mug,1.7


In [67]:
# If the object dict from which we want to create a DataFrame contains more data than we are interested, 
# you can make a selection. In the constructor of the data frame, you can specify a sequence of columns, using 
# the columns option. The columns will be created in the order of the sequence regardless of how they are 
# contained within the object dict.
df2 = pd.DataFrame(data,columns=["object","color"])
df2

Unnamed: 0,object,color
0,ball,blue
1,pen,green
2,pencil,yellow
3,paper,red
4,mug,white


In [124]:
data = np.array([[14,4],[0,10]])
df = pd.DataFrame(data,index=["Athlete","Non-Athlete"],columns=["Non-Smoker","Smoker"])
df

Unnamed: 0,Non-Smoker,Smoker
Athlete,14,4
Non-Athlete,0,10


In [70]:
from scipy.stats import chi2_contingency
test,p,dof,expected = chi2_contingency(df)
print(test)

12.600000000000001


In [132]:
df = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
df

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [135]:
#-----------Selecting Elements-----------------
df.columns
df.index
# As regards the values contained within the data structure, you can get the entire set of data using the values attribute.
df.values
#Select column
df["pen"]
df.pen
# Regarding the rows within a data frame, it is possible to use the iloc attribute with the index value of the 
# row that you want to extract.
df.iloc[2]
#The object returned is again a Series, in which the names of the columns have become the label of the 
# array index, whereas the values have become the data of Series.
df.iloc[[0,2]]
df[0:1]

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3


In [110]:
#---------------------Assigning Values-----------------------
data = {'color' : ['blue','green','yellow','red','white'],
        'object' : ['ball','pen','pencil','paper','mug'],
        'price' : [1.2,1.0,0.6,0.9,1.7]}
df = pd.DataFrame(data)
df.index.name = "id"
df.columns.name = "item"
df
#Add new column
df["new"] = [3.0,1.3,2.2,0.8,1.1]
df

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,3.0
1,green,pen,1.0,1.3
2,yellow,pencil,0.6,2.2
3,red,paper,0.9,0.8
4,white,mug,1.7,1.1


In [112]:
#------------Membership of a Value--------------
df[df.isin(['pen',1])]

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,pen,1.0,
2,,,,
3,,,,
4,,,,


In [113]:
#---------------Deleting a Column
del df["new"]
df

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [118]:
#------------------Filtering----------------
df = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
df[df < 12]
# You will get as returned object a DataFrame containing values less than 12, keeping their original 
# position. All others will be replaced with NaN.

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,2.0,3.0
blue,4.0,5.0,6.0,7.0
yellow,8.0,9.0,10.0,11.0
white,,,,


In [121]:
#----------------------DataFrame from Nested dict-------------
nestdict = { 'red': { 2012: 22, 2013: 33 },
            'white': { 2011: 13, 2012: 22, 2013: 16},
            'blue': {2011: 17, 2012: 27, 2013: 18}}

# This data structure, when it is passed directly as an argument to the DataFrame() constructor, will be 
# interpreted by pandas so as to consider external keys as column names and internal keys as labels for the 
# indexes.
# During the interpretation of the nested structure, it is possible that not all fields find a successful match. 
# pandas will compensate for this inconsistency by adding the value NaN values missing.
df = pd.DataFrame(nestdict)
df

Unnamed: 0,red,white,blue
2012,22.0,22,27
2013,33.0,16,18
2011,,13,17


In [122]:
#---------------------- Transposition of a DataFrame
# An operation that might be needed when dealing with tabular data structures is the transposition (that is, 
# the columns become rows and rows columns). pandas allows you to do this in a very simple way. You can 
# get the transpose of the data frame by adding the T attribute to its application.
df.T

Unnamed: 0,2012,2013,2011
red,22.0,33.0,
white,22.0,16.0,13.0
blue,27.0,18.0,17.0


In [127]:
#------The Index Objects
# Unlike all other elements within pandas data structures (Series and data frame), the Index objects are 
# immutable objects. Once declared, these cannot be changed. This ensures their secure sharing between the 
# various data structures.
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])

In [130]:
#--------Methods on Index
#idmin() and idmax() are two functions that return, respectively, the index with the lowest value and more.
ser.idxmin()
ser.idxmax()

'white'

In [3]:
#------------Index with Duplicate Labels------------------------------S
serd = pd.Series(range(6), index=['white','white','blue','green','green','yellow'])
serd["white"]
# Just in this respect, pandas provides you with the is_unique attribute belonging to the Index objects.
# This attribute will tell you if there are indexes with duplicate labels inside the structure data (both Series and DataFrame).
serd.index.is_unique

False

In [4]:
#-----------Other Functionalities on Indexes------------------------

In [5]:
#---------------Reindexing----------------
# It was previously stated that once declared within a data structure, the Index object cannot be changed. 
# This is true, but by executing a reindexing you can also overcome this problem.
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
ser

one      2
two      5
three    7
four     4
dtype: int64

In [7]:
# In order to make the reindexing of this series, pandas provides you with the reindex() function. 
# This function creates a new Series object with the values of the previous Series rearranged according to the 
# new sequence of labels.


# During this operation of reindexing, it is therefore possible to change the order of the sequence of indexes, 
# delete some of them, or add new ones. In the case of a new label, pandas add NaN as corresponding value.
ser.reindex(['three','four','five','one'])

three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64

In [31]:
#Check this method is not working ,check on vs code

In [32]:
# some method that allows you to fill or interpolate values automatically.
ser3 = pd.Series([1,5,6,3],index=[0,3,5,6])
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [33]:
ser3.reindex(np.arange(6),method="ffill")
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [21]:
ser3.reindex(np.arange(6),method="bfill")
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [38]:
#------------Dropping-------------
# pandas provides a specific function for this operation: drop(). This method will return a new object without
# the items that you want to delete.
ser = pd.Series(np.arange(4),index=['red','blue','yellow','white'])
ser.drop(["blue","white"])

red       0
yellow    2
dtype: int32

In [42]:
# Regarding the DataFrame, instead, the values can be deleted by referring to the labels of both axes.
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index=['red','blue','yellow','white'],
                  columns=['ball','pen','pencil','paper'])
df

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [43]:
# To delete rows, just pass the indexes of the rows.
df.drop(["red","blue"])

Unnamed: 0,ball,pen,pencil,paper
yellow,8,9,10,11
white,12,13,14,15


In [45]:
# To delete columns, you always need to specify the indexes of the columns, but you must specify the 
# axis from which to delete the elements, and this can be done using the axis option. So to refer to the column 
# names you should specify axis = 1.
df.drop(["ball","pen"],axis=1)

Unnamed: 0,pencil,paper
red,2,3
blue,6,7
yellow,10,11
white,14,15


In [48]:
#-----------------------Arithmetic and Data Alignment--------------
# the most powerful feature involving the indexes in a data structure, is that pandas is able to perform 
# the alignment of the indexes coming from two different data structures.
s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])
s1

white     3
yellow    2
green     5
blue      1
dtype: int64

In [49]:
s2

white     1
yellow    4
black     7
blue      2
brown     1
dtype: int64

In [50]:
# consider the simple sum.
# Well, when the labels are present in both operators, their values will be added, while in the opposite case, 
# they will also be shown in the result (new series), but with the value NaN.
s1 + s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

In [51]:
# In the case of the data frame, although it may appear more complex, the alignment follows the same 
# principle, but is carried out both for the rows and for the columns.
df1 = pd.DataFrame(np.arange(16).reshape((4,4)),
                      index=['red','blue','yellow','white'],
                      columns=['ball','pen','pencil','paper'])
df2 = pd.DataFrame(np.arange(12).reshape((4,3)),
                   index=['blue','green','white','yellow'],
                   columns=['mug','pen','ball'])

df1

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [53]:
df1 + df2

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [54]:
#-------------Operations between Data Structures----------------

In [58]:
#--------------Flexible Arithmetic Methods-------------------
#We seen how to use mathematical operators directly on the pandas data structures. The same 
# operations can also be performed using appropriate methods, called Flexible arithmetic methods.
#add()
#sub()
#div()
#mul()
df1.add(df2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [63]:
#Operations between DataFrame and Series
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index=['red','blue','yellow','white'],
                  columns=['ball','pen','pencil','paper'])
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])
df - ser
#If an index is not present in one of the two data structures, the result will be a new column with that 
# index only that all its elements will be NaN.

Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


In [64]:
#--------------Function Application and Mapping------------------------

In [65]:
#----Functions by Element

In [66]:
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                     index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])


In [67]:
#For example you could calculate the square root of each value within the data frame, using the NumPy np.sqrt().
np.sqrt(df)

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [70]:
#-------Functions by Row or Column-----
# The important thing is that they operate on a one-dimensional array, giving a single number for result.
df

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [72]:
f = lambda x : x.max() - x.min()
# Using the apply() function you can apply the function just defined on the DataFrame.
df.apply(f)

ball      12
pen       12
pencil    12
paper     12
dtype: int64

In [73]:
#The result, however, this time it is only one value for the column, but if you prefer to apply the function 
# by row instead of by column, you have to specify the axis option set to 1.
df.apply(f,axis=1)

red       3
blue      3
yellow    3
white     3
dtype: int64

In [74]:
def f(x):
    return pd.Series([x.count(),x.max(),x.min(),x.mean(),x.var(),x.std(),],index=["count","max","min","mean","var","std"])
df.apply(f)

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
max,12.0,13.0,14.0,15.0
min,0.0,1.0,2.0,3.0
mean,6.0,7.0,8.0,9.0
var,26.666667,26.666667,26.666667,26.666667
std,5.163978,5.163978,5.163978,5.163978


In [75]:
#------------------Statistics Functions-----------
df

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [76]:
df.sum()

ball      24
pen       28
pencil    32
paper     36
dtype: int64

In [77]:
df.mean()

ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64

In [78]:
# There is also a function called describe() that allows to obtain a summary statistics at once.
df.describe()

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


In [79]:
#-------------------Sorting and Ranking-------------------------

In [80]:
#Pandas provides the sort_index() function that returns a new object which is identical to the start, 
# but in which the elements are ordered.
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [82]:
ser.sort_index(ascending=False)

yellow    3
white     8
red       5
green     4
blue      0
dtype: int64

In [84]:
#As regards the DataFrame, the sorting can be performed independently on each of its two axes. So 
# if you want to order by row following the indexes, just continue to use the function sort_index() without 
# arguments as you’ve seen before, or if you prefer to order by columns, you will need to use the axis options 
# set to 1.
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index=['red','blue','yellow','white'],
                  columns=['ball','pen','pencil','paper'])
#for row
df.sort_index(axis=0) #by default
#for column
df.sort_index(axis=1)

Unnamed: 0,ball,paper,pen,pencil
red,0,3,1,2
blue,4,7,5,6
yellow,8,11,9,10
white,12,15,13,14


In [88]:
#For Series
#If you want to order the series, you will use the order() function.
ser.order()

In [93]:
# If you need to order the values in a DataFrame, you will use the sort_values() function seen previously 
# but with the by option. Then you have to specify the name of the column on which to sort.
df.sort_values(by=["pen","pencil"])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [100]:
df = pd.DataFrame(np.array([random.randint(10,50) for i in range(16)]).reshape(4,4),
                  index=['red','blue','yellow','white'],
                  columns=['ball','pen','pencil','paper'])
df.sort_values(by=["pen","pencil"])

Unnamed: 0,ball,pen,pencil,paper
blue,15,21,28,38
white,20,22,29,38
red,11,28,31,45
yellow,19,38,15,26


In [107]:
#The ranking is an operation closely related to sorting. It mainly consists of assigning a rank (that is, a 
# value that starts at 0 and then increase gradually) to each element of the series. The rank will be assigned 
# starting from the lowest value to the highest value.
ser = pd.Series([random.randint(10,50) for i in range(10)])
ser.rank(ascending = False)
#The rank can also be assigned in the order in which the data are already in the data structure (without a 
# sorting operation). In this case, just add the method option with the ‘first’ value assigned.

0     4.0
1     1.0
2     5.0
3     6.0
4     7.5
5    10.0
6     3.0
7     2.0
8     9.0
9     7.5
dtype: float64

In [108]:
#-------------Correlation and Covariance---------------
# Two important statistical calculations are correlation and covariance, expressed in pandas by the corr() and 
# cov() functions. These kind of calculations normally involve two Series.
mtcars.head(2)


Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [112]:
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq1 = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq1.corr(seq2)
seq1.cov(seq2)

0.8571428571428571

In [115]:
mtcars.mpg.cov(mtcars.disp)
mtcars.mpg.corr(mtcars.disp)

-633.0972076612901

In [118]:
#Another case could be that covariance and correlation are applied to a single DataFrame. In this case, 
# they return their corresponding matrices in form of two new DataFrame objects.
df = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
df.corr()
df.cov()

Unnamed: 0,ball,pen,pencil,paper
ball,2.0,-0.666667,2.0,-2.333333
pen,-0.666667,2.916667,-0.333333,-1.333333
pencil,2.0,-0.333333,6.0,-3.666667
paper,-2.333333,-1.333333,-3.666667,4.666667


In [None]:
#Using the method corrwith(), you can calculate the pairwise correlations between the columns or rows 
# of a data frame with a Series or another DataFrame().

In [119]:
#----------------------------“Not a Number” Data------------------------------------

In [120]:
#-----Assigning a NaN Value
# Just in case you would like to specifically assign a NaN value to an element in a data structure, you can use 
# the value np.NaN(or np.nan) of the NumPy library.
ser = pd.Series([0,1,2,np.NaN,9], index=['red','blue','yellow','white','green'])
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [122]:
#----------------Filtering Out NaN Values---------------
#The dropna() function 
ser.dropna()

#Another possibility is to directly perform the filtering function by placing the notnull() in the selection condition.

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

In [125]:
df = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],
                      index = ['blue','green','red'],
                      columns = ['ball','mug','pen'])
df

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [126]:
df.dropna()

Unnamed: 0,ball,mug,pen


In [132]:
#Therefore to avoid having entire rows and columns disappear completely, you should specify the how
# option, assigning a value of ‘all’ to it, in order to inform the dropna() function to delete only the rows or 
# columns in which all elements are NaN.
df.dropna(how="all")

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
red,2.0,,5.0


In [136]:
#-----------------Filling in NaN Occurrences------------------
#the fillna() function. This method takes one argument, the value with which to replace any NaN. 
# It can be the same for all, as in the following case.
df.fillna("NA")
#Or you can replace NaN with different values depending on the column,specifying one by one the indexes and the associated value.
df.fillna({'ball':1,'mug':0,'pen':99})

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,1.0,0.0,99.0
red,2.0,0.0,5.0


In [141]:
#---------Hierarchical Indexing and Leveling------------
mser = pd.Series(np.random.rand(8),
                 index=[['white','white','white','blue','blue','red','red','red'],
                        ['up','down','right','up','down','up','down','left']])
mser

white  up       0.899309
       down     0.084252
       right    0.386028
blue   up       0.825190
       down     0.918651
red    up       0.003473
       down     0.647399
       left     0.405539
dtype: float64

In [142]:
# The hierarchical indexing plays a critical role in reshaping the data and group-based operations such 
# as creating a pivot-table. For example, the data could be used just rearranged in a data frame using a special 
# function called unstack(). This function converts the Series with hierarchical index in a simple DataFrame, 
# where the second set of indexes is converted into a new set of columns.

mser.unstack()

Unnamed: 0,down,left,right,up
blue,0.918651,,,0.82519
red,0.647399,0.405539,,0.003473
white,0.084252,,0.386028,0.899309


In [161]:
#If what we want is to perform the reverse operation, which is to convert a DataFrame in a Series, you will 
# use the stack() function.
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index=['red','blue','yellow','white'],
                  columns=['ball','pen','pencil','paper'])
mf = pd.DataFrame(df.stack(),columns=["values"])
mf.index.names = ["color","object"]
mf

Unnamed: 0_level_0,Unnamed: 1_level_0,values
color,object,Unnamed: 2_level_1
red,ball,0
red,pen,1
red,pencil,2
red,paper,3
blue,ball,4
blue,pen,5
blue,pencil,6
blue,paper,7
yellow,ball,8
yellow,pen,9


In [162]:
# As regards the DataFrame, it is possible to define a hierarchical index both for the rows and for the 
# columns. At the time of the declaration of the DataFrame, you have to define an array of arrays for both the 
# index option and the columns option.
mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),
                      index=[['white','white','red','red'], ['up','down','up','down']],
                      columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe

Unnamed: 0_level_0,Unnamed: 1_level_0,pen,pen,paper,paper
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,1,2
white,up,2.010797,-0.799066,-1.215616,1.362278
white,down,-0.642272,-0.642475,-0.720428,-0.600966
red,up,0.720305,-0.911771,0.44079,0.756192
red,down,-1.090109,-0.307934,0.467733,0.846901


In [165]:
#-------Reordering and Sorting Levels--------------------
# The swaplevel() function accepts as argument the names assigned to the two levels that you want to 
# interchange, and returns a new object with the two levels interchanged between them, while leaving the data 
# unmodified.
mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
mframe.swaplevel('colors','status')

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
status,colors,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
up,white,2.010797,-0.799066,-1.215616,1.362278
down,white,-0.642272,-0.642475,-0.720428,-0.600966
up,red,0.720305,-0.911771,0.44079,0.756192
down,red,-1.090109,-0.307934,0.467733,0.846901


In [172]:
#Instead, the sortlevel() function orders the data considering only those of a certain level.
mframe

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
colors,status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
white,up,2.010797,-0.799066,-1.215616,1.362278
white,down,-0.642272,-0.642475,-0.720428,-0.600966
red,up,0.720305,-0.911771,0.44079,0.756192
red,down,-1.090109,-0.307934,0.467733,0.846901


In [169]:
#-------------Summary Statistic by Level
# Many descriptive statistics and summary statistics performed on a DataFrame or on a Series have a level
# option, with which you can determine at what level the descriptive and summary statistics should be 
# determined.

In [171]:
mframe.sum(level="colors")

objects,pen,pen,paper,paper
id,1,2,1,2
colors,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
white,1.368525,-1.441541,-1.936044,0.761312
red,-0.369804,-1.219705,0.908524,1.603093


In [173]:
#If you want to make a statistic for a given level of the column, for example, the id, you must specify the 
# second axis as argument through the axis option set to 1.
mframe.sum(level="id",axis=1)

Unnamed: 0_level_0,id,1,2
colors,status,Unnamed: 2_level_1,Unnamed: 3_level_1
white,up,0.795181,0.563212
white,down,-1.3627,-1.243441
red,up,1.161095,-0.155579
red,down,-0.622375,0.538967
