In [1]:
import pandas as pd

# Load the ToothGrowth sample CSV straight from GitHub (needs network access).
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/ToothGrowth.csv')
# Preview only the first five rows instead of printing the whole frame.
df.head()

Unnamed: 0,len,supp,dose
0,4.2,VC,0.5
1,11.5,VC,0.5
2,7.3,VC,0.5
3,5.8,VC,0.5
4,6.4,VC,0.5


##### 3. Series and its relation with dataframe

A Series is a one-dimensional data structure that stores a single column of values. You can think of a Series as one column extracted from a DataFrame.

A Series is very similar to a NumPy array; the main difference is that it carries an index label for each observation.

In [3]:
import pandas as pd
import numpy as np


#### Relationship between series and a DataFrame

If you extract any single column from a DataFrame, the resulting object is a Series.

In [4]:
# Build a 5x4 frame of random integers drawn from [1, 100), one column per letter.
# No seed is set in this cell, so the values differ from run to run.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=100, size=(5, 4)),
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
0,93,95,71,18
1,23,94,85,85
2,64,7,39,52
3,19,88,28,22
4,49,43,37,22


In [5]:
# Selecting one column with [] returns that column as a Series (index labels + values).
df['a']

0    93
1    23
2    64
3    19
4    49
Name: a, dtype: int32

In [6]:
# Confirm the type of the extracted column.
type(df['a'])

pandas.core.series.Series

In [15]:
# First three entries of column 'a' as a plain Python list,
# going through the underlying NumPy array (.values) first.
df['a'][0: 3].values.tolist()

[93, 23, 64]

In [12]:
# Same list, obtained directly from the Series with to_list().
df['a'][0: 3].to_list()

[93, 23, 64]

#### Create a standalone Series Object

In [17]:
# Ten consecutive integers (0..9) plus ten letter labels to pair with them.
data = np.arange(0, 10)
index = list('abcdefghij')

# Only the values are passed here, so pandas assigns the default integer index;
# `index` is not used in this cell. The `name` argument is optional.
ser = pd.Series(data, name='numbers')
ser

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: numbers, dtype: int32

In [20]:
# Rebuild the Series from the same values, this time passing the letter labels
# as the index, so each value is addressed by 'a'..'j' instead of 0..9.
ser = pd.Series(data=data, index=index, name = 'numbers') # name is optional
ser

a    0
b    1
c    2
d    3
e    4
f    5
g    6
h    7
i    8
j    9
Name: numbers, dtype: int32

###### Series are vectorized by default. Ex: to multiply every item by 2, you don't have to write a for loop; just multiply the Series by 2.


In [21]:
# Element-wise multiplication: every value is doubled and the index labels are kept.
ser * 2


a     0
b     2
c     4
d     6
e     8
f    10
g    12
h    14
i    16
j    18
Name: numbers, dtype: int32

Extract an item

In [22]:
# Look up a single value by its index label.
ser['b']

1

To extract more than one item, put all the item labels in a list and pass that list as the argument.

Passing the labels separated only by commas, as in `ser['a','b']`, won't work: a Series is a one-dimensional object, so its indexer accepts only one argument.


In [23]:
# This won't work: the two labels are not wrapped in a list.
# ser['a','b']

# So, pass all the labels inside one list (note the double square brackets).

ser[['a','b']]

a    0
b    1
Name: numbers, dtype: int32

You can extract the index as well.


In [24]:
# method 1: read the labels from the .index attribute
ser.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [25]:
# method 2: call .keys(), which returns the same Index object contents
ser.keys()

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

Also, if you extract a single column from a DataFrame it becomes a Series, so you can think of a DataFrame as a 'column-wise arrangement of Series'.

In [26]:
# Two label -> value mappings that share only the labels 'b' and 'c'.
d1 = dict(a=0, b=1, c=3)
d2 = dict(b=0, c=1, d=3)

In [27]:
# Build a Series from each dict: the dict keys become the index labels
# and the dict values become the data.
ser1 = pd.Series(d1)
ser2 = pd.Series(d2)
ser1

a    0
b    1
c    3
dtype: int64

In [28]:
# Display the second Series; its labels (b, c, d) only partly overlap ser1's (a, b, c).
ser2

b    0
c    1
d    3
dtype: int64

In [30]:
# Arithmetic between Series aligns on index labels, not on position.
# Labels present in only one operand ('a' and 'd' here) come out as NaN,
# and the result is float64 because NaN cannot be stored in an integer Series.

ser1 + ser2

a    NaN
b    1.0
c    4.0
d    NaN
dtype: float64

In [32]:
# Adding the underlying NumPy arrays avoids NaN, but it adds by POSITION and
# ignores the labels entirely (a+b, b+c, c+d). It only works when both Series
# have the same length, and the result is a bare array with no index.

ser1.values + ser2.values

array([0, 2, 6], dtype=int64)

In [34]:
# Label-aligned addition without NaN: with fill_value=0, a label that is missing
# from one Series is treated as 0 there, so 'a' and 'd' keep their own values.

ser1.add(ser2, fill_value=0)

a    0.0
b    1.0
c    4.0
d    3.0
dtype: float64

In [36]:
# Recall: "Inspecting DataFrames" (5 of 53).
# The snippet is stored in a string, so it acts as a cheat sheet and is not
# executed by this cell. Two mistakes in the recorded snippet are corrected:
#   * the pandas method is `memory_usage`, not `memory_usuage`;
#   * the dtype conversion is assigned back to the same column it reads from,
#     so the column is actually converted instead of a new one being created.
recall = """
import numpy as np
import pandas as pd

df =pd.read_csv('dataset/chumk.csv')
df

df.shape  # returns the tuple(rows, columns
len(df)  # length / number of rows
df.head(5) # returns the top 5 rows
df.tail(10) # returns bottom 10 rows

df.info()  # dataframe into provides the datatypes, # non null records and memory usage.  dtype = object(str), int64, float64, bool

df.memory_usage(deep = True) #Know Memory usuage of each columns

df.dtypes  # check only the datatypes

df['churk'] = df['churk'].astype('int')  # change Boolean to integer datatype for 'churk'
df.info()   # check again.
"""

#### 2. Approaches to Renaming Columns

In [38]:
import pandas as pd
import numpy as np

# Re-load the ToothGrowth CSV from GitHub; this rebinds `df`.
# The download needs network access (the recorded run of this cell ended in a URLError).
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/ToothGrowth.csv')

df

URLError: <urlopen error [Errno 11001] getaddrinfo failed>


### Get column names

df

KeyboardInterrupt: 

False


3
2
1
0


6
