#### Pandas is a module to basically manipulate data

In [1]:
# Pandas
import pandas as pd

In [5]:
iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
iris
# our data had 150 rows but for some reason it had taken the first row as a column header as we don't have a header in the official file
print(type(iris))

# Dataframe is a data type for 2D data
# pandas has other data types for other dimensional data

<class 'pandas.core.frame.DataFrame'>


In [209]:
# Accessing Data in Pandas
# doing df = iris makes both df and iris point to the same object, hence any changes done by df will get reflected in iris
df = iris.copy()    # copies the data of iris into df(dataframe)

In [210]:
df.head(3)     # it gives an idea of how the data looks like with just a few entries we can pass an integer as to how many rows we want to see

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa


In [211]:
# print(df.columns)    # to see what columns we are using
df.columns = ['sl','sw','pl','pw','flower_type']
print(df.columns)

Index(['sl', 'sw', 'pl', 'pw', 'flower_type'], dtype='object')


In [109]:
print(df.shape)
print(df.dtypes)

(149, 5)
sl             float64
sw             float64
pl             float64
pw             float64
flower_type     object
dtype: object


In [110]:
df.describe()
# it describes the data for us. if the data has int or float attributes then it describes it for us with the values like
# count, mean, std etc. That's why here instead of flower_type all the columns got described

# if the data has NaN then the count will decrease as count only shows the valid number of entries for the rows

# min and max values are also there and the spread between them shown by 25 percentile, 50 percentile, 75 percentile
# 25 percentile means what is the value which has 25% of the entries less than it

Unnamed: 0,sl,sw,pl,pw
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [111]:
df.sl      # df.column_name used to access that particular column
df["sl"]   # another way to acces

0      4.9
1      4.7
2      4.6
3      5.0
4      5.4
5      4.6
6      5.0
7      4.4
8      4.9
9      5.4
10     4.8
11     4.8
12     4.3
13     5.8
14     5.7
15     5.4
16     5.1
17     5.7
18     5.1
19     5.4
20     5.1
21     4.6
22     5.1
23     4.8
24     5.0
25     5.0
26     5.2
27     5.2
28     4.7
29     4.8
      ... 
119    6.9
120    5.6
121    7.7
122    6.3
123    6.7
124    7.2
125    6.2
126    6.1
127    6.4
128    7.2
129    7.4
130    7.9
131    6.4
132    6.3
133    6.1
134    7.7
135    6.3
136    6.4
137    6.0
138    6.9
139    6.7
140    6.9
141    5.8
142    6.8
143    6.7
144    6.7
145    6.3
146    6.5
147    6.2
148    5.9
Name: sl, Length: 149, dtype: float64

In [112]:
df.isnull()        # shows the null entries. if null then shows true, false accordingly
# a better way to look at how many null entries are there would be
df.isnull().sum()

sl             0
sw             0
pl             0
pw             0
flower_type    0
dtype: int64

In [113]:
df.iloc[1:4, 2:4]    # to access data from somewhere in between

Unnamed: 0,pl,pw
1,1.3,0.2
2,1.5,0.2
3,1.4,0.2


In [114]:
# Manipulating Data in Data Frame

# drop by label
a = df.drop(0)    # it doesn't do it in-place, it makes a copy of it without the 0th row
df.head()
a.head()          # so a is without the 0th row
# df = df.drop(2)
df.drop(0,inplace = True)       # this drops the oth row out of df
df.head()

# WE ARE NOT SAYING THAT REMOVE THE 0th ROW. WE'RE SAYING REMOVE THE ROW WHICH HAVE THE LABEL AS 0. SO THE WHOLE 0TH ROW GETS
# DROPPED DOWN WITH THE LABEL

Unnamed: 0,sl,sw,pl,pw,flower_type
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa


In [115]:
df.index    # uesd to look at the lables i.e. row index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            139, 140, 141, 142, 143, 144, 145, 146, 147, 148],
           dtype='int64', length=148)

In [116]:
df.index[0], df.index[3]

(1, 4)

In [127]:
# drop by index
print(df.index[0])
df.drop(df.index[0],inplace = True)      # to remove the row present at 0th index. label also gets dropped
df.drop(df.index[[0,1]],inplace = True)   # if we want to emove more than one row then pass indexes of that row in an array
df.head()

12


Unnamed: 0,sl,sw,pl,pw,flower_type
15,5.4,3.9,1.3,0.4,Iris-setosa
16,5.1,3.5,1.4,0.3,Iris-setosa
17,5.7,3.8,1.7,0.3,Iris-setosa
18,5.1,3.8,1.5,0.3,Iris-setosa
19,5.4,3.4,1.7,0.2,Iris-setosa


In [134]:
df.sl > 5      # tells us how many rows have this condition as True and how many of the rows have it as False

15      True
16      True
17      True
18      True
19      True
20      True
21     False
22      True
23     False
24     False
25     False
26      True
27      True
28     False
29     False
30      True
31      True
32      True
33     False
34     False
35      True
36     False
37     False
38      True
39     False
40     False
41     False
42     False
43      True
44     False
       ...  
119     True
120     True
121     True
122     True
123     True
124     True
125     True
126     True
127     True
128     True
129     True
130     True
131     True
132     True
133     True
134     True
135     True
136     True
137     True
138     True
139     True
140     True
141     True
142     True
143     True
144     True
145     True
146     True
147     True
148     True
Name: sl, Length: 134, dtype: bool

In [137]:
df[df.sl > 5]   # it gives us only those entries in df whose condition is True. the table will print only those rows with conditiong being True

Unnamed: 0,sl,sw,pl,pw,flower_type
15,5.4,3.9,1.3,0.4,Iris-setosa
16,5.1,3.5,1.4,0.3,Iris-setosa
17,5.7,3.8,1.7,0.3,Iris-setosa
18,5.1,3.8,1.5,0.3,Iris-setosa
19,5.4,3.4,1.7,0.2,Iris-setosa
20,5.1,3.7,1.5,0.4,Iris-setosa
22,5.1,3.3,1.7,0.5,Iris-setosa
26,5.2,3.5,1.5,0.2,Iris-setosa
27,5.2,3.4,1.4,0.2,Iris-setosa
30,5.4,3.4,1.5,0.4,Iris-setosa


In [138]:
df[df.flower_type == 'Iris-setosa']    # gives only those entries for which flower_type is Iris-setosa

Unnamed: 0,sl,sw,pl,pw,flower_type
15,5.4,3.9,1.3,0.4,Iris-setosa
16,5.1,3.5,1.4,0.3,Iris-setosa
17,5.7,3.8,1.7,0.3,Iris-setosa
18,5.1,3.8,1.5,0.3,Iris-setosa
19,5.4,3.4,1.7,0.2,Iris-setosa
20,5.1,3.7,1.5,0.4,Iris-setosa
21,4.6,3.6,1.0,0.2,Iris-setosa
22,5.1,3.3,1.7,0.5,Iris-setosa
23,4.8,3.4,1.9,0.2,Iris-setosa
24,5.0,3.0,1.6,0.2,Iris-setosa


In [142]:
df[df.flower_type == 'Iris-setosa'].describe()   # describes data only for 'Iris-setosa' flower type

Unnamed: 0,sl,sw,pl,pw
count,34.0,34.0,34.0,34.0
mean,5.026471,3.420588,1.482353,0.258824
std,0.313638,0.365795,0.184999,0.113131
min,4.4,2.3,1.0,0.1
25%,4.825,3.2,1.4,0.2
50%,5.05,3.4,1.5,0.2
75%,5.2,3.675,1.6,0.3
max,5.7,4.2,1.9,0.6


In [148]:
# iloc vs loc
print(df.head())
print(df.iloc[0])      # prints according to the index / position
print(df.loc[15])      # labeled output. it won't print 0th index row because it sees the label and prints that
# here we don't have a 0th label so we are not able to print that

     sl   sw   pl   pw  flower_type
15  5.4  3.9  1.3  0.4  Iris-setosa
16  5.1  3.5  1.4  0.3  Iris-setosa
17  5.7  3.8  1.7  0.3  Iris-setosa
18  5.1  3.8  1.5  0.3  Iris-setosa
19  5.4  3.4  1.7  0.2  Iris-setosa
sl                     5.4
sw                     3.9
pl                     1.3
pw                     0.4
flower_type    Iris-setosa
Name: 15, dtype: object
sl                     5.4
sw                     3.9
pl                     1.3
pw                     0.4
flower_type    Iris-setosa
Name: 15, dtype: object


In [191]:
df.loc[0] = [1,2,3,4,"Iris-setosa"]    # it got added to the tail

In [192]:
df.tail()
# to keep the numbering correct. use the last numbering
df.loc[149] = [1,2,3,4,"Iris-setosa"]
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica
149,1.0,2.0,3.0,4.0,Iris-setosa
0,1.0,2.0,3.0,4.0,Iris-setosa


In [193]:
df.drop(0,inplace = True)
df.tail()
df.index

Int64Index([ 15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
            ...
            140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
           dtype='int64', length=135)

In [194]:
df.reset_index()    # resets index and creates a new column called index consisting of old indices which we don't want

Unnamed: 0,index,sl,sw,pl,pw,flower_type
0,15,5.4,3.9,1.3,0.4,Iris-setosa
1,16,5.1,3.5,1.4,0.3,Iris-setosa
2,17,5.7,3.8,1.7,0.3,Iris-setosa
3,18,5.1,3.8,1.5,0.3,Iris-setosa
4,19,5.4,3.4,1.7,0.2,Iris-setosa
5,20,5.1,3.7,1.5,0.4,Iris-setosa
6,21,4.6,3.6,1,0.2,Iris-setosa
7,22,5.1,3.3,1.7,0.5,Iris-setosa
8,23,4.8,3.4,1.9,0.2,Iris-setosa
9,24,5.0,3.0,1.6,0.2,Iris-setosa


In [212]:
df.reset_index(drop = True,inplace = True)    # resets index and does not add the index column
df.index

RangeIndex(start=0, stop=149, step=1)

In [213]:
# df.drop('sl')    # gives an error an no rows labeled 'sl'
df.drop('sl',axis = 1,inplace = True)  # looks column wise and delete the column named 'sl'
df.head()

Unnamed: 0,sw,pl,pw,flower_type
0,3.0,1.4,0.2,Iris-setosa
1,3.2,1.3,0.2,Iris-setosa
2,3.1,1.5,0.2,Iris-setosa
3,3.6,1.4,0.2,Iris-setosa
4,3.9,1.7,0.4,Iris-setosa


In [214]:
del df['sw']

In [216]:
df.describe()
df.head()

Unnamed: 0,pl,pw,flower_type
0,1.4,0.2,Iris-setosa
1,1.3,0.2,Iris-setosa
2,1.5,0.2,Iris-setosa
3,1.4,0.2,Iris-setosa
4,1.7,0.4,Iris-setosa


In [219]:
df = iris.copy()
df.columns = ['sl','sw','pl','pw','flower_type']
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [223]:
df["diff_pl_pw"] = df["pl"] - df["pw"]    # adds a new column which is the difference of pl and pw
df["abc"] = 1                             # makes the column of all ones
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw,abc
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2,1
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1,1
2,4.6,3.1,1.5,0.2,Iris-setosa,1.3,1
3,5.0,3.6,1.4,0.2,Iris-setosa,1.2,1
4,5.4,3.9,1.7,0.4,Iris-setosa,1.3,1
