In [2]:
import numpy as np
import pandas as pd
import random
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Datasets
mtcars = pd.read_csv('datasets/mtcars.csv')
salaries = pd.read_csv('datasets/Salaries.csv')
beml = pd.read_csv('datasets/BEML.csv')
glaxo = pd.read_csv('datasets/GLAXO.csv')
titanic = pd.read_csv('datasets/train.csv')

In [4]:
# The manipulation of the data has the purpose of preparing the data so that they can be more easily subjected to analysis
#The three phases of data manipulation :-
# Data preparation
# Data transformation
# Data aggregation

In [5]:
#----------------------Data Preparation----------------------
#The different procedures for data preparation
# loading
# assembling :- (merging,concatenating,combining)
# reshaping (pivoting)
# removing

In [6]:
#The data contained in the pandas objects can be assembled together in different ways:
#Merging—the pandas.merge( ) function connects the rows in a DataFrame based on one or more keys. 
# This mode is very familiar to those who are confident with the SQL language, since it also implements join operations.

#Concatenating—the pandas.concat() function concatenates the objects along an axis.

#Combining—the pandas.DataFrame.combine_first( ) function is a method that allows you to connect overlapped
# data in order to fill in missing values in a data structure by taking data from another structure.

In [7]:
#-------------------------------------Merging--------------------------------------
# The merging operation, which corresponds to the JOIN operation for those who are familiar with SQL, 
# consists of a combination of data through the connection of rows using one or more keys.

In [14]:
df1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],'price': [12.33,11.44,33.21,13.23,33.62]})
df2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],'color': ['white','red','red','black']})

#Carry out the merging applying the merge( ) function to the two DataFrame objects.
pd.merge(df1,df2)

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [15]:
#In this case you used the merge( ) function without specifying any column explicitly. In fact, in most 
# cases you need to decide which is the column on which to base the merging.
# To do this, add the on option with the column name as the key for the merging.
pd.merge(df1,df2,on="id")

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [45]:
df1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                     'color': ['white','red','red','black','green'],
                     'brand': ['OMG','ABC','ABC','POD','POD']})

df2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],'brand': ['OMG','POD','ABC','POD']})

#Now in this case you have two DataFrame having columns with the same name.So if you launch a merging you do not get any results.
pd.merge(df1,df2)

#So it is necessary to explicitly define the criterion of merging that pandas must follow, specifying the 
# name of the key column in the on option.
pd.merge(df1,df2,on="brand")

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,pencil
1,pencil,red,ABC,ball
2,pen,red,ABC,ball
3,mug,black,POD,pencil
4,mug,black,POD,pen
5,ashtray,green,POD,pencil
6,ashtray,green,POD,pen


In [44]:
#Often, however, the opposite problem arises, that is, to have two DataFrames in which the key columns 
# do not have the same name. To remedy this situation, you have to use the left_on and right_on options that 
# specify the key column for the first and for the second DataFrame.

# pd.merge(df1,df2,left_on="id",right_on="sid")

Unnamed: 0,id,color,brand_x,sid,brand_y
0,ball,white,OMG,ball,ABC
1,pencil,red,ABC,pencil,OMG
2,pencil,red,ABC,pencil,POD
3,pen,red,ABC,pen,POD


In [49]:
#By default, the merge( ) function performs an inner join; the keys in the result are the result of an intersection.
#Other possible options are the left join, the right join, and the outer join. The outer join produces the 
# union of all keys, combining the effect of a left join with a right join. To select the type of join you have to use 
# the how option.
pd.merge(df1,df2,on="id",how="left")

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [51]:
#---------------------Merging on Index-------------------------
# In some cases, instead of considering the columns of a DataFrame as keys, the indexes could be used as 
# keys on which to make the criteria for merging. Then in order to decide which indexes to consider, set the 
# left_index or right_index options to True to activate them, with the ability to activate them both.

In [52]:
pd.merge(df1,df2,left_index=True,right_index=True)

Unnamed: 0,id_x,color,brand_x,id_y,brand_y
0,ball,white,OMG,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,ball,ABC
3,mug,black,POD,pen,POD


In [54]:
#But the DataFrame objects have a join() function which is much more convenient when you want to do the merging by indexes.
#Note
#It can also be used to combine many DataFrame objects having the same or the same indexes but with columns not overlapping.
df1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                     'color': ['white','red','red','black','green'],
                     'brand': ['OMG','ABC','ABC','POD','POD']})

df2 = pd.DataFrame( {'id2':['pencil','pencil','ball','pen'],'brand2': ['OMG','POD','ABC','POD']})
df1.join(df2)

Unnamed: 0,id,color,brand,id2,brand2
0,ball,white,OMG,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,ball,ABC
3,mug,black,POD,pen,POD
4,ashtray,green,POD,,


In [59]:
#----------------------- Concatenating---------------------------
# arr1 = np.zeros((3,3))
# arr2 = np.ones((3,3))
#Horizontal concat
# np.hstack((arr1,arr2))
# np.concatenate([arr1,arr2],axis=1)

#Vertical concat
# np.vstack((arr1,arr2))
# np.concatenate([arr1,arr2],axis=0)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [61]:
# As regards the pandas library and its data structures like Series and DataFrame, the fact of having 
# labeled axes allows you to further generalize the concatenation of arrays. The concat() function is provided 
# by pandas for this kind of operation.
ser1 = pd.Series(np.random.rand(4), index=[1,2,3,4])
ser2 = pd.Series(np.random.rand(4), index=[5,6,7,8])
pd.concat([ser1,ser2])
#By default, the concat() function works on axis = 0, having as returned object a Series. If you set the 
# axis = 1, then the result will be a DataFrame.
pd.concat([ser1,ser2],axis=1)

Unnamed: 0,0,1
1,0.11887,
2,0.363299,
3,0.055001,
4,0.141797,
5,,0.946396
6,,0.4396
7,,0.086734
8,,0.585958


In [67]:
#From the result you can see that there is no overlap of data, therefore what you have just done is an 
# outer join. This can be changed by setting the join option to ‘inner’.
pd.concat([ser1,ser2],axis=1,join='inner')

Unnamed: 0,0,1


In [71]:
#A problem in this kind of operation is that the concatenated parts are not identifiable in the result. For 
# example, you want to create a hierarchical index on the axis of concatenation. To do this you have to use the keys option.
pd.concat([ser1,ser2],keys=[1,2])
#In the case of combinations between Series along the axis = 1 the keys become the column headers of the DataFrame.
# pd.concat([ser1,ser2],axis=1,keys=[1,2])

1  1    0.118870
   2    0.363299
   3    0.055001
   4    0.141797
2  5    0.946396
   6    0.439600
   7    0.086734
   8    0.585958
dtype: float64

In [79]:
#So far you have seen the concatenation applied to the Series, but the same logic can be applied to the DataFrame.
df1 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[1,2,3],columns=['A','B','C'])
df2 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[4,5,6],columns=['A','B','C'])
pd.concat([df1,df2],keys=[1,2])
pd.concat([df1,df2],axis=1,keys=[1,2])

Unnamed: 0_level_0,1,1,1,2,2,2
Unnamed: 0_level_1,A,B,C,A,B,C
1,0.838015,0.188206,0.967874,,,
2,0.064729,0.07402,0.15973,,,
3,0.41832,0.495272,0.965704,,,
4,,,,0.28861,0.208123,0.440579
5,,,,0.296753,0.701392,0.901718
6,,,,0.76775,0.462325,0.172295


In [81]:
#---------------------------Combining------------------------------------
# Take the case in which you want the two datasets to have indexes that overlap in their entirety or at least partially.

In [82]:
ser1 = pd.Series(np.random.rand(5),index=[1,2,3,4,5])
ser2 = pd.Series(np.random.rand(4),index=[2,4,5,6])
print(ser1)
print(ser2)

1    0.065931
2    0.545201
3    0.337924
4    0.730333
5    0.657621
dtype: float64
2    0.186057
4    0.020596
5    0.244903
6    0.569636
dtype: float64


In [83]:
ser1.combine_first(ser2)

1    0.065931
2    0.545201
3    0.337924
4    0.730333
5    0.657621
6    0.569636
dtype: float64

In [84]:
ser2.combine_first(ser1)

1    0.065931
2    0.186057
3    0.337924
4    0.020596
5    0.244903
6    0.569636
dtype: float64

In [85]:
#Instead, if you want a partial overlap, you can specify only the portion of the Series you want to overlap.
ser1[:3].combine_first(ser2[:3])

1    0.065931
2    0.545201
3    0.337924
4    0.020596
5    0.244903
dtype: float64

In [86]:
#---------------------------Pivoting---------------------------------------
#Arrangement of the values by row or by column is not always suited to your goals. 
# Sometimes you would like to rearrange the data carrying column values on rows or vice versa.

In [89]:
#-----Pivoting with Hierarchical Indexing
#stacking: rotates or pivots the data structure converting columns to rows
#unstacking: converts rows into columns
df = pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
df.stack().unstack()

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [94]:
#You can also do the unstack on a different level, specifying the number of levels or its name as the argument of the function.
#ser5.unstack(0)

In [98]:
#---------------------Pivoting from “Long” to “Wide” Format---------------------------
longdf = pd.DataFrame({ 'color':['white','white','white','red','red','red','black','black','black'],
                          'item':['ball','pen','mug','ball','pen','mug','ball','pen','mug'],'value': np.random.rand(9)})
longdf
#Instead of the long format, there is another way to arrange the data in a table that is called wide.

Unnamed: 0,color,item,value
0,white,ball,0.354224
1,white,pen,0.003888
2,white,mug,0.875945
3,red,ball,0.111473
4,red,pen,0.16463
5,red,mug,0.685881
6,black,ball,0.783497
7,black,pen,0.647339
8,black,mug,0.36212


In [99]:
# In this regard, pandas gives you a function that allows you to make a transformation of a DataFrame 
# from the long type to the wide type. This function is pivot() and it accepts as arguments the column, or 
# columns, which will assume the role of key.
widedf = longdf.pivot("color","item")
widedf
#As you can now see, in this format, the DataFrame is much more compact and data contained in it are much more readable.

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.783497,0.36212,0.647339
red,0.111473,0.685881,0.16463
white,0.354224,0.875945,0.003888


In [100]:
#-----------------------Removing----------------------

In [106]:
df = pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
#In order to remove a column, you have to simply use the del command applied to the DataFrame with the column name specified.
del df["pen"]
#remove an unwanted row, you have to use the drop() function with the label of the corresponding index as argument.
df.drop("white")

Unnamed: 0,ball,pencil
black,3,5
red,6,8


In [3]:
#-------------------------Data Transformation------------------------------------

In [4]:
#---------Removing Duplicates---------------

In [15]:
df = pd.DataFrame({ 'color': ['white','white','red','red','white'],'value': [2,1,3,3,2]})
df
#The duplicated() function applied to a DataFrame can detect the rows which appear to be duplicated. 
# It returns a Series of Booleans where each element corresponds to a row, with True if the row is duplicated 
# (i.e., only the other occurrences, not the first), and with False if there are no duplicates in the previous elements.
df[df.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


In [21]:
#Generally, all duplicated rows are to be deleted from the DataFrame; to do that, pandas provides the 
# drop_duplicates() function, which returns the DataFrame without duplicate rows.
df.drop_duplicates()

color  value
red    3        1
white  1        1
       2        1
dtype: int64

In [22]:
#----------Mapping----------------------
#The mapping is nothing more than the creation of a list of matches between two 
# different values, with the ability to bind a value to a particular label or string.
# replace(): replaces values
# map(): creates a new column
# rename(): replaces the index values

array(['white', 'white', 'red', 'red', 'white'], dtype=object)

In [29]:
#----Replacing Values via Mapping
df = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                   'color':['white','rosso','verde','black','yellow'],
                   'price':[5.56,4.20,1.30,0.56,2.75]})
# To be able to replace the incorrect values in new values is necessary to define a mapping of 
# correspondences, containing as key to replace the old values and values as the new ones.
#dict = {old_value : new_value} -> df.replace(dict)
newcolors = {"rosso":"red","verde":"green"}
#Now the only thing you can do is to use the replace() function with the mapping as an argument.
df.replace(newcolors)

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [38]:
# A common case, for example, is the replacement of the NaN values with another value, for example 0.
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
# replace_NaN = {np.NaN:0}
ser.replace(np.NaN,0)

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

In [39]:
#-----------Adding Values via Mapping--------------
#The mapping will always be defined separately.

In [40]:
df = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],'color':['white','red','green','black','yellow']})
df

Unnamed: 0,item,color
0,ball,white
1,mug,red
2,pen,green
3,pencil,black
4,ashtray,yellow


In [42]:
# Let’s suppose you want to add a column to indicate the price of the item shown in the DataFrame. 
# Before you do this, it is assumed that you have a price list available somewhere, in which the price for each 
# type of item is described. Define then a dict object that contains a list of prices for each type of item.

price = {'ball' : 5.56,'mug' : 4.20,'bottle' : 1.30,'scissors' : 3.41,'pen' : 1.30,'pencil' : 0.56,'ashtray' : 2.75}

# The map() function applied to a Series or to a column of a DataFrame accepts a function or an object 
# containing a dict with mapping. So in your case you can apply the mapping of the prices on the column item, 
# making sure to add a column to the price data frame.
df["price"] = df.item.map(price)
df

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [45]:
df = pd.DataFrame(np.array([random.randint(11,21) for i in range(10)]).reshape(5,2),columns=["x","y"])
def f(x):
    return x*2
df["y*2"] = df.y.map(f)
df

Unnamed: 0,x,y,y*2
0,20,19,38
1,15,11,22
2,17,21,42
3,18,13,26
4,14,12,24


In [53]:
#-------------Rename the Indexes of the Axes-----------------
#To replace the label indexes,pandas provides the rename() function,which takes the mapping as argument, that is, a dict object.
df = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                   'color':['white','red','green','black','yellow'],
                   'price':[5.56,4.20,1.30,0.56,2.75]})
reindex = {0: 'first',1: 'second',2: 'third',3: 'fourth',4: 'fifth'}
recolumn = {'item':'object','price': 'value'}
df.rename(reindex)
#As you can see, by default, the indexes are renamed. If you want to rename columns you must use 
# the columns option. Thus this time you assign various mapping explicitly to the two index and columns options.
df.rename(index=reindex,columns=recolumn)

Unnamed: 0,object,color,value
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [54]:
# So far you have seen that the rename() function returns a DataFrame with the changes, leaving unchanged the original DataFrame.
# If you want the changes to take effect on the object on which you call the function, you will set the inplace option to True.
df.rename(index=reindex,columns=recolumn,inplace=True)

In [62]:
#---------------------Discretization and Binning----------------------
#In applied mathematics, discretization is the process of transferring continuous functions, models, variables, 
# and equations into discrete counterparts.

#Binning in data mining is a data preprocessing technique that involves grouping data into smaller,more manageable categories
# or bins. 

array([10, 12, 14, 16, 18, 20])

In [105]:
results = [34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]
# You know that the experimental values have a range from 0 to 100; therefore you can uniformly divide 
# this interval, for example, into four equal parts, i.e., bins. The first contains the values between 0 and 25, the 
# second between 26 and 50, the third between 51 and 75, and the last between 76 and 100.

#To do this binning with pandas, first you have to define an array containing the values of separation of bin.

bins = [0,25,50,75,100]
#Then there is a special function called cut() and apply it to the array of results also passing the bins
cat = pd.cut(results,bins)
cat

[(25, 50], (50, 75], (50, 75], (25, 50], (75, 100], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 16
Categories (4, interval[int64, right]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [106]:
#The object returned by the cut() function is a special object of Categorical type. You can consider it as 
# an array of strings indicating the name of the bin. Internally it contains a levels array indicating the names 
# of the different internal categories and a labels array that contains a list of numbers equal to the elements of 
# results (i.e., the array subjected to binning). The number corresponds to the bin to which the corresponding 
# element of results is assigned.

# cat.levels
# cat.labels

#Finally to know the occurrences for each bin, that is, how many results fall into each category, you have 
# to use the value_counts() function.
pd.value_counts(cat)


(75, 100]    5
(25, 50]     4
(50, 75]     4
(0, 25]      3
dtype: int64

In [107]:
#You can give names to various bins by calling them first in an array of strings and then assigning to the 
# labels options inside the cut() function that you have used to create the Categorical object.
bin_names = ['unlikely','less likely','likely','highly likely']
cat = pd.cut(results,bins,labels=bin_names)
cat

['less likely', 'likely', 'likely', 'less likely', 'highly likely', ..., 'highly likely', 'unlikely', 'less likely', 'highly likely', 'highly likely']
Length: 16
Categories (4, object): ['unlikely' < 'less likely' < 'likely' < 'highly likely']

In [108]:
#If the cut() function is passed as an argument to an integer instead of explicating the bin edges, this will 
# divide the range of values of the array in many intervals as specified by the number.
cat = pd.cut(results,4)
cat
#The limits of the interval will be taken by the minimum and maximum of the sample data, namely, the array subjected to binning.

[(27.0, 51.0], (51.0, 75.0], (51.0, 75.0], (27.0, 51.0], (75.0, 99.0], ..., (75.0, 99.0], (2.904, 27.0], (27.0, 51.0], (75.0, 99.0], (75.0, 99.0]]
Length: 16
Categories (4, interval[float64, right]): [(2.904, 27.0] < (27.0, 51.0] < (51.0, 75.0] < (75.0, 99.0]]

In [109]:
#In addition to cut(), pandas provides another method for binning: qcut(). This function divides the 
# sample directly into quintiles. In fact, depending on the distribution of the data sample, using cut() rightly 
# you will have a different number of occurrences for each bin. Instead qcut() will ensure that the number of 
# occurrences for each bin is equal, but the edges of each bin to vary.

cat = pd.qcut(results,4)
cat

[(32.5, 55.5], (55.5, 87.0], (32.5, 55.5], (2.999, 32.5], (87.0, 99.0], ..., (55.5, 87.0], (2.999, 32.5], (32.5, 55.5], (87.0, 99.0], (55.5, 87.0]]
Length: 16
Categories (4, interval[float64, right]): [(2.999, 32.5] < (32.5, 55.5] < (55.5, 87.0] < (87.0, 99.0]]

In [110]:
pd.value_counts(cat)

(55.5, 87.0]     5
(2.999, 32.5]    4
(32.5, 55.5]     4
(87.0, 99.0]     3
dtype: int64

In [111]:
#-------------------Detecting and Filtering Outliers-------------------------

In [119]:
df = pd.DataFrame(np.random.randn(1000,3))
#With the describe() function you can see the statistics for each column.
df.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,-0.027058,0.004905,0.012357
std,1.014259,0.971072,1.002523
min,-4.276392,-3.206674,-3.65662
25%,-0.694336,-0.656155,-0.671517
50%,-0.019211,-0.027414,-0.011935
75%,0.702971,0.634252,0.675814
max,2.964191,3.26661,3.20054


In [126]:
#Now you apply the filtering of all the values of the DataFrame, applying the corresponding standard 
# deviation for each column. Thanks to the any() function, you can apply the filter on each column.
df[(np.abs(df) > (3*df.std())).any(axis=1)]

Unnamed: 0,0,1,2
15,-4.276392,0.151476,-0.379446
39,0.671356,-0.515858,-3.160311
91,-0.151103,-0.78007,-3.078215
166,-0.326163,3.26661,2.385071
298,0.039791,-3.206674,0.226296
374,-0.428442,3.011424,0.287412
429,-0.914424,0.404599,-3.65662
608,-4.095649,-0.610908,0.227338
999,-0.699441,-0.487413,3.20054


In [127]:
#---------------------------Permutation--------------------------
#The operations of permutation (random reordering) of a Series or the rows of a DataFrame are easy to do 
# using the numpy.random.permutation() function.

In [3]:
df = pd.DataFrame(np.arange(25).reshape(5,5))

#Now create an array of five integers from 0 to 4 arranged in random order with the permutation()
# function. This will be the new order in which to set the values of a row of DataFrame.
new_order = np.random.permutation(5)
print(new_order)
df

[3 4 1 0 2]


Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [4]:
#Now apply it to the DataFrame on all lines, using the take() function.
df.take(new_order,axis=1)
#As you can see, the order of the rows has been changed; now the indices follow the same order as 
# indicated in the new_order array.

Unnamed: 0,3,4,1,0,2
0,3,4,1,0,2
1,8,9,6,5,7
2,13,14,11,10,12
3,18,19,16,15,17
4,23,24,21,20,22


In [138]:
#You can submit even a portion of the entire DataFrame to a permutation. It generates an array that has a 
# sequence limited to a certain range, for example, in our case from 2 to 4.
new_order = np.random.permutation([2,3,4])
new_order

array([3, 4, 2])

In [139]:
df.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


In [140]:
#--------------------------Random Sampling---------------------

In [152]:
np.random.randint(0,100,size=5)
#As you can see from this random sampling you can get the same sample even more times.

array([71, 15, 71, 73, 94])

In [153]:
#----------------------String Manipulation---------------------

In [161]:
#Built-in Methods for Manipulation of Strings
#The split() function allows us to separate parts of a text, taking as a reference point a separator, for example a comma.
name1 = "Manish,Rai".split(",")
name2 = "Avinash,Rai".split(",")
name3 = "Vikas,Rai".split(",")

df = pd.DataFrame(np.row_stack([name1,name2,name3]),columns=["First_name","last_name"])
df

Unnamed: 0,First_name,last_name
0,Manish,Rai
1,Avinash,Rai
2,Vikas,Rai


In [166]:
#As we can see in the first element, you have a string with a space character at the end. To overcome this 
# problem and often a frequent problem, you have to use the split() function along with the strip() function 
# that takes care of doing the trim of whitespace (including newlines).
name1 = "Manish,Rai".split(",")
#If the parts to be concatenated are much more, a more practical approach in this case will be to use the join() function 
# assigned to the separator character, with which you want to join the various strings between them.
",".join(name1)
# Another category of operations that can be performed on the string is the search for pieces of text 
# in them, i.e., substrings.
"Manish" in name1

#However, there are two functions that could serve to this purpose: index() and find().
"Manish Rai".index("Manish")
"Manish Rai".find('Manish')
#In fact, the index() function returns an error message, and find() returns -1 if the substring is not found.

#In the same area, you can know how many times a character or combination of characters (substring) occurs 
# within a text. The count() function provides you with this number.
"Manish Rai".count("a")

#Another operation that can be performed on strings is the replacement or elimination of a substring 
# (or a single character). In both cases you will use the replace() function, where if you are prompted 
# to replace a substring with a blank character, the operation will be equivalent to the elimination of the 
# substring from the text.

2

In [167]:
#-----------------------Regular Expressions---------------------

In [3]:
#The regular expressions provide a very flexible way to search and match string patterns within a text. A single 
# expression, generically called regex, is a string formed according to the regular expression language. There 
# is a built-in Python module called re, which is responsible for the operation of the regex.

#The re module provides a set of functions that can be divided into three different categories:
# pattern matching
# substitution
# splitting

In [4]:
#There is a split() function even for the re module that performs the same operations, only it is able to accept
# a regex pattern as the criterion of separation, which makes it considerably more flexible.
text = "This is an\t odd \n text!"
re.split("\s+",text)

['This', 'is', 'an', 'odd', 'text!']

In [6]:
#You can compile the regex function with the re.compile() function, thus obtaining a reusable object regex and 
# so gaining in terms of CPU cycles.
regex = re.compile("\s+")
re.split(regex,text)

['This', 'is', 'an', 'odd', 'text!']

In [7]:
#So if you make an regex object with the compile() function, you can apply split() directly to it in the following way.
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [8]:
#As regards matching a regex pattern with any other business substrings in the text, you can use the 
# findall() function. It returns a list of all the substrings in the text that meet the requirements of the regex.
text = 'This is my address: 16 Bolton Avenue, Boston'
regex = re.compile("[A,a]\w+")
regex.findall(text)

['address', 'Avenue']

In [15]:
#While findall()returns all matches within a list, the function search() returns only the first match. Furthermore, the object 
# returned by this function is a particular object:
search = regex.search(text)
#This object does not contain the value of the substring that responds to the regex pattern, but its start 
# and end positions within the string.
search.start()
search.end()
text[search.start():search.end()]

'address'

In [17]:
#The match() function performs the matching only at the beginning of the string; if there is no match with the first character,
# it goes no further in research within the string. If you do not find any match then it will not return any objects.
re.match('[A,a]\w+',text)
#If match() has a response, then it returns an object identical to what you saw for the search() function.
match = re.match("T\w+",text)
text[match.start():match.end()]

'This'

In [18]:
#-----------------------Data Aggregation-------------------------

In [19]:
#The last stage of data manipulation is data aggregation. For data aggregation you generally mean a 
# transformation that produces a single integer from an array.
#sum(), mean(), count().

In [25]:
#GroupBy
#if you are working with two-dimensional objects such as the DataFrame, the grouping criterion may be applied both to the line 
# (axis = 0) for that column (axis = 1).
df = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                   'object': ['pen','pencil','pencil','ashtray','pen'],
                   'price1' : [5.56,4.20,1.30,0.56,2.75],
                   'price2' : [4.75,4.12,1.60,0.75,3.15]})
df

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [29]:
group = df.price1.groupby(df.color)
#The object that we got is a GroupBy object.
#To analyze in detail how the division into groups of rows of DataFrame was made, you call the attribute groups GroupBy object.
group.groups
#calculation part
group.mean()
group.sum()

color
green    4.05
red      4.76
white    5.56
Name: price1, dtype: float64

In [34]:
#------------------Hierarchical Grouping-----------------------------
#How to group the data according to the values of a column as a key choice. The same thing can be extended to multiple columns,
# i.e., make a grouping of multiple keys hierarchical.
ggroup = df['price1'].groupby([df['color'],df['object']])
ggroup.groups

{('green', 'pen'): [4], ('green', 'pencil'): [2], ('red', 'ashtray'): [3], ('red', 'pencil'): [1], ('white', 'pen'): [0]}

In [41]:
ggroup = df.price1.groupby([df.color,df.object])
df

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [42]:
ggroup.sum()
#So far you have applied the grouping to a single column of data, but in reality it can be extended to 
# multiple columns or the entire data frame.

color  object 
green  pen        2.75
       pencil     1.30
red    ashtray    0.56
       pencil     4.20
white  pen        5.56
Name: price1, dtype: float64

In [44]:
#if you do not need to reuse the object GroupBy several times, it is convenient to combine in a single passing 
# all of the grouping and calculation to be done, without defining any intermediate variable.
group = df[["price1","price2"]].groupby(df.color)
group.mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [48]:
#Exercise
salaries.head(2)


Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000


In [53]:
group = salaries["salary"].groupby(salaries["rank"])
group.mean()

rank
AssocProf     91786.230769
AsstProf      81362.789474
Prof         123624.804348
Name: salary, dtype: float64

In [57]:
group = mtcars["mpg"].groupby(mtcars["cyl"])
group.groups
group.std()

cyl
4    4.509828
6    1.453567
8    2.560048
Name: mpg, dtype: float64

In [58]:
#------------------------------Group Iteration-----------------------------

In [59]:
#The GroupBy object supports the operation of an iteration for generating a sequence of 2-tuples containing 
# the name of the group together with the data portion.

In [61]:
df = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                   'object': ['pen','pencil','pencil','ashtray','pen'],
                   'price1' : [5.56,4.20,1.30,0.56,2.75],
                   'price2' : [4.75,4.12,1.60,0.75,3.15]})
df

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [64]:
for name,group in df.groupby(df["color"]):
    print(group)
    print(name)    

   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
green
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
red
   color object  price1  price2
0  white    pen    5.56    4.75
white


In [72]:
#-----------------------Chain of Transformations----------------------------
#for each grouping, when subjected to some function calculation or other operations in general, regardless of how
# it was obtained and the selection criteria, the result will be a data structure Series (if we selected a single column data)
# or DataFrame, which then retains the index system and the name of the columns.

#For single column
results1 = df["price1"].groupby(df["color"]).mean()
results1

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [73]:
#For DataFrame
results2 = df.groupby(df["color"]).mean()
results2

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [76]:
#So it is possible to select a single column at any point in the various phases of this process. Here are 
# three cases in which the selection of a single column in three different stages of the process applies.
df["price1"].groupby(df["color"]).mean()
df.groupby(df["color"])["price1"].mean()
(df.groupby(df["color"]).mean())["price1"]

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [82]:
#Exercise
titanic["Age"].groupby([titanic["Survived"],titanic["Sex"]]).min().unstack()

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.0,1.0
1,0.75,0.42


In [84]:
#Add prefix to column names
#In addition, after an operation of aggregation the names of some columns may not be very meaningful in certain cases.
# In fact it is often useful to add a prefix to the column name that describes the type of business combination.
df.groupby(df["color"]).mean().add_prefix("mean_")

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [85]:
#--------------------Functions on Groups-----------------

In [89]:
df.groupby(df["color"])['price1'].quantile()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [90]:
#You can also define their own aggregation functions. Define the function separately and then you 
# pass as an argument to the agg() function.
def range(Series):
    return Series.max() - Series.min()
df.groupby(df["color"])['price1'].agg(range)

#The agg() function() allows you to use aggregate functions on an entire DataFrame.

color
green    1.45
red      3.64
white    0.00
Name: price1, dtype: float64

In [91]:
#Also you can use more aggregate functions at the same time always with the mark() function passing an 
# array containing the list of operations to be done, which will become the new columns.
df.groupby(df["color"])['price1'].agg(["mean","std",range])

Unnamed: 0_level_0,mean,std,range
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,2.025,1.025305,1.45
red,2.38,2.573869,3.64
white,5.56,,0.0


In [93]:
#---------------------Advanced Data Aggregation-------------------------
# In this section you will be introduced to transform() and apply() functions, which will allow you to perform 
# many kinds of group operations, some very complex.
df = pd.DataFrame({ 'color':['white','red','green','red','green'],
                   'price1':[5.56,4.20,1.30,0.56,2.75],
                   'price2':[4.75,4.12,1.60,0.75,3.15]})
df

Unnamed: 0,color,price1,price2
0,white,5.56,4.75
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [99]:
#we want to bring together in the same DataFrame the following: (i) the DataFrame of origin (the one containing the data)
# and (ii) that obtained by the calculation of group aggregation, for example, the sum.
sums = df.groupby("color").sum().rename(columns = {"price1":"total_price1","price2":"total_price2"})

In [102]:
#So thanks to the merge(), you managed to add the results of a calculation of aggregation in each line 
# of the data frame to start.
pd.merge(df,sums,on="color")

Unnamed: 0,color,price1,price2,total_price1,total_price2
0,white,5.56,4.75,5.56,4.75
1,red,4.2,4.12,4.76,4.87
2,red,0.56,0.75,4.76,4.87
3,green,1.3,1.6,4.05,4.75
4,green,2.75,3.15,4.05,4.75


In [107]:
#----------Transform()------------------------

#But actually there is another way to do this type of operation. That is by using the 
# transform(). This function performs the calculation of aggregation as you have seen before, but at the same 
# time shows the values calculated based on the key value on each line of the data frame to start.
sums =df.groupby(df.color).transform(np.sum).add_prefix("tot_")

#As you can see the transform() method is a more specialized function that has very specific 
# requirements: the function passed as an argument must produce a single scalar value (aggregation) to be broadcasted.

In [108]:
df.join(sums)

Unnamed: 0,color,price1,price2,tot_price1,tot_price2
0,white,5.56,4.75,5.56,4.75
1,red,4.2,4.12,4.76,4.87
2,green,1.3,1.6,4.05,4.75
3,red,0.56,0.75,4.76,4.87
4,green,2.75,3.15,4.05,4.75


In [109]:
#---------------apply()-------------------------
df = pd.DataFrame( { 'color':['white','black','white','white','black','black'],
                    'status':['up','up','down','down','down','up'],
                    'value1':[12.33,14.55,22.34,27.84,23.40,18.33],
                    'value2':[11.23,31.80,29.99,31.18,18.25,22.44]})
df

Unnamed: 0,color,status,value1,value2
0,white,up,12.33,11.23
1,black,up,14.55,31.8
2,white,down,22.34,29.99
3,white,down,27.84,31.18
4,black,down,23.4,18.25
5,black,up,18.33,22.44


In [112]:
df.groupby(["color","status"]).apply(lambda x: x.max())

Unnamed: 0_level_0,Unnamed: 1_level_0,color,status,value1,value2
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
black,down,black,down,23.4,18.25
black,up,black,up,18.33,31.8
white,down,white,down,27.84,31.18
white,up,white,up,12.33,11.23
