# Series

In [None]:
import numpy as np 
import pandas as pd

## Creating a Series
You can convert a list, NumPy array, tuple, or dictionary into a Series:

In [None]:
# Sample inputs reused by the Series-construction examples that follow:
# the same three values as a plain list, a NumPy array and a label->value dict.
my_list = list(range(10, 40, 10))
labels = list('abc')
arr = np.array(my_list)
d = dict(zip(labels, my_list))

**Using Lists**

In [None]:
pd.Series(data = my_list)

0    10
1    20
2    30
dtype: int64

In [None]:
pd.Series(data = arr)

0    10
1    20
2    30
dtype: int64

In [None]:
pd.Series(arr, index= labels)

a    10
b    20
c    30
dtype: int64

In [None]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [None]:
pd.Series([1, 2, 3,4, 5, 6], index = [11, 22, 33, 44, 55, 66], dtype = np.int32, name = 'random series')

11    1
22    2
33    3
44    4
55    5
66    6
Name: random series, dtype: int32

In [None]:
pd.Series([len , all, abs])

0    <built-in function len>
1    <built-in function all>
2    <built-in function abs>
dtype: object

## Not a Number: NaN

`NaN` is how pandas represents null values or missing data.

## Using Index

In [None]:
# Four-country Series: the dict keys become the index labels (insertion order
# is preserved) and the dict values become the data.
ser1 = pd.Series({'USA': 1, 'Germany': 2, 'USSR': 3, 'Japan': 4})
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [None]:
# Second Series sharing three labels with ser1 ('Italy' replaces 'USSR'), so
# label-aligned arithmetic between the two yields NaN for the unmatched labels.
ser2 = pd.Series({'USA': 1, 'Germany': 2, 'Italy': 5, 'Japan': 4})
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [None]:
ser2['Germany']

2

In [None]:
ser2[['Germany','Japan']]

Germany    2
Japan      4
dtype: int64

### Aggregate Functions

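Commonly used Series attributes and operations: `index`, `name` (attributes); `sum`, `mean`, `median`, `mode`, `std` (aggregations); `add`, `sub`, `mul`, `div` (element-wise arithmetic, aligned on the index); `sqrt`, `exp`, `log`, `cos`, `sin`, `tan` (element-wise NumPy functions).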

In [None]:
ser1.index

Index(['USA', 'Germany', 'USSR', 'Japan'], dtype='object')

In [None]:
ser2.index

Index(['USA', 'Germany', 'Italy', 'Japan'], dtype='object')

In [None]:
ser1.add(ser2)

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [None]:
# Element-wise product aligned on the index labels; labels present in only
# one of the two Series come out as NaN.
ser1.mul(ser2)

Germany     4.0
Italy       NaN
Japan      16.0
USA         1.0
USSR        NaN
dtype: float64

In [None]:
ser1.std()

1.2909944487358056

In [None]:
ser1.max()

4

In [None]:
# Five uniform random draws in [0, 1). A locally seeded Generator makes the
# values identical on every "Restart & Run All" and leaves NumPy's global
# random state untouched.
ser4 = pd.Series(np.random.default_rng(42).random(5))

In [None]:
ser4

0    0.011203
1    0.684586
2    0.412322
3    0.172761
4    0.505440
dtype: float64

In [None]:
# Multiply two scalars looked up by label: label 0 of ser4 and 'Germany' of ser2.
ser4.loc[0] * ser2.loc['Germany']

0.022405854503296352

In [None]:
# scipy.stats is imported in this cell; within the visible notebook it is
# only used for the mode computation on the next line.
from scipy import stats
# NOTE(review): every value of ser1 occurs exactly once, so the reported mode
# (count 1 in the output below) is only a tie-break, not a meaningful mode.
stats.mode(ser1)

ModeResult(mode=array([1]), count=array([1]))

In [None]:
# Total and maximum of ser1 as a tuple. The Series' own vectorized reductions
# are used instead of the Python builtins sum()/max(), which iterate the
# Series element by element. ser1 holds no missing values, so both forms
# give the same numbers.
ser1.sum(), ser1.max()

(10, 4)

In [None]:
ser1.median()

2.5

In [None]:
np.cos(ser2)

USA        0.540302
Germany   -0.416147
Italy      0.283662
Japan     -0.653644
dtype: float64

# Data Frames

In [None]:
from numpy.random import randn

In [None]:
# 5x4 frame of standard-normal draws with row labels A-E and column labels
# w-z. A locally seeded Generator keeps the values identical across kernel
# restarts, so the label-based selection examples later in the notebook always
# show the same numbers.
df1 = pd.DataFrame(
    data=np.random.default_rng(42).standard_normal((5, 4)),
    index='A B C D E'.split(),
    columns='w x y z'.split(),
)

In [None]:
df1

Unnamed: 0,w,x,y,z
A,0.040653,0.129842,-0.60177,-0.606664
B,0.262595,-0.15214,-0.283796,-1.239393
C,0.653475,0.328819,-0.113489,-0.341411
D,-1.007972,0.399544,0.146413,-0.954584
E,-0.333404,-1.024035,1.946649,0.4057


In [None]:
d

{'a': 10, 'b': 20, 'c': 30}

In [None]:
# Parallel lists describing five players; position i in every list refers to
# the same player. These lists become the columns of the cricket DataFrame.
# The last name previously carried a stray leading space (' RA Jadeja'), which
# would break exact-match lookups on the player column.
player_names = ['J Bumrah', 'MA Agrawal', 'R Ashwin', 'V Kohli', 'RA Jadeja']
mat = [20, 14, 79, 92, 52]
runs = [43, 1052, 2865, 7547, 1985]
SR = [21.93, 54.47, 53.76, 61.47, 56.85]

In [None]:
# Column-name -> values mapping that is turned into a DataFrame in the next cell.
cricket = dict(player=player_names, mat=mat, Runs=runs, SR=SR)

In [None]:
# Build the frame from the dict: each key becomes a column name and each list
# becomes that column's values; rows get the default 0..n-1 index.
cr = pd.DataFrame(cricket)
cr

Unnamed: 0,player,mat,Runs,SR
0,J Bumrah,20,43,21.93
1,MA Agrawal,14,1052,54.47
2,R Ashwin,79,2865,53.76
3,V Kohli,92,7547,61.47
4,RA Jadeja,52,1985,56.85


# Data I/O

__reading csv file__

In [None]:
# Load the life-expectancy dataset from a CSV file given by a relative path.
# NOTE(review): df.columns (printed further down) lists 'Life expectancy ' with
# a trailing space, so that column must be selected with the space included.
df = pd.read_csv('LifeExpectancy.csv')
df

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.259210,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0
2,Afghanistan,2013,59.9,631.744976,31731688.0
3,Afghanistan,2012,59.5,669.959000,3696958.0
4,Afghanistan,2011,59.2,63.537231,2978599.0
...,...,...,...,...,...
2933,Zimbabwe,2004,44.3,454.366654,12777511.0
2934,Zimbabwe,2003,44.5,453.351155,12633897.0
2935,Zimbabwe,2002,44.8,57.348340,125525.0
2936,Zimbabwe,2001,45.3,548.587312,12366165.0


__exporting to csv__

In [None]:
# Write the cricket frame to CSV. index=False leaves the row labels out of the
# file, so reading it back does not add an extra unnamed index column.
cr.to_csv('sample_cricket_data.csv', index = False)

In [None]:
cr = pd.read_csv('sample_cricket_data.csv')

__reading excel file__

In [None]:
excel = pd.read_excel('IBM-313 Marks.xlsx', sheet_name= 'Sheet1')

In [None]:
excel

Unnamed: 0,S.No.,MTE (25),Mini Project (25),Total (50),ETE (50),Total
0,1,5.00,20,25.00,12.0,37.00
1,2,11.05,20,31.05,26.0,57.05
2,3,8.10,20,28.10,14.0,42.10
3,4,6.00,10,16.00,13.0,29.00
4,5,11.35,20,31.35,17.0,48.35
...,...,...,...,...,...,...
74,75,12.05,10,22.05,20.0,42.05
75,76,12.25,10,22.25,28.0,50.25
76,77,1.75,10,11.75,,0.00
77,78,3.00,10,13.00,,0.00


In [None]:
# Export to a separate workbook rather than overwriting the input file that
# read_excel loaded earlier: overwriting it would change the raw data on every
# run. index=False keeps the row labels out of the sheet, so reading the
# export back yields the same columns instead of an extra unnamed index
# column (this matches the to_csv export of the cricket data).
excel.to_excel('IBM-313 Marks_export.xlsx', sheet_name='Sheet1', index=False)

In [None]:
df

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.259210,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0
2,Afghanistan,2013,59.9,631.744976,31731688.0
3,Afghanistan,2012,59.5,669.959000,3696958.0
4,Afghanistan,2011,59.2,63.537231,2978599.0
...,...,...,...,...,...
2933,Zimbabwe,2004,44.3,454.366654,12777511.0
2934,Zimbabwe,2003,44.5,453.351155,12633897.0
2935,Zimbabwe,2002,44.8,57.348340,125525.0
2936,Zimbabwe,2001,45.3,548.587312,12366165.0


# Retrieving Information about Series / Data Frames

`Basic Information`

shape, index, columns, info, count

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           2938 non-null   object 
 1   Year              2938 non-null   int64  
 2   Life expectancy   2928 non-null   float64
 3   GDP               2490 non-null   float64
 4   Population        2286 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 114.9+ KB


In [None]:
df.shape

(2938, 5)

In [None]:
df.index

RangeIndex(start=0, stop=2938, step=1)

In [None]:
df.columns

Index(['Country', 'Year', 'Life expectancy ', 'GDP', 'Population'], dtype='object')

In [None]:
df.count()  # non - null counts

Country             2938
Year                2938
Life expectancy     2928
GDP                 2490
Population          2286
dtype: int64

`Summary`

sum, cumsum, min, max, idxmin, idxmax, describe, mean, median

In [None]:
# Column-wise totals. 'Country' holds strings, so its "sum" is the
# concatenation of every country name (see the first row of the output).
df.sum()

Country             AfghanistanAfghanistanAfghanistanAfghanistanAf...
Year                                                          5898090
Life expectancy                                              202690.6
GDP                                                   18633064.588155
Population                                         29154215524.440002
dtype: object

In [None]:
df.min()

Country             Afghanistan
Year                       2000
Life expectancy            36.3
GDP                     1.68135
Population                 34.0
dtype: object

In [None]:
df.max()

Country                 Zimbabwe
Year                        2015
Life expectancy             89.0
GDP                  119172.7418
Population          1293859294.0
dtype: object

In [None]:
# Column means over the numeric columns only. numeric_only=True is needed on
# pandas >= 2.0, where the default no longer silently drops the string
# 'Country' column and the call raises TypeError instead. The result is the
# same four numeric columns shown in the output.
df.mean(numeric_only=True)

Year                2.007519e+03
Life expectancy     6.922493e+01
GDP                 7.483158e+03
Population          1.275338e+07
dtype: float64

In [None]:
# Column medians over the numeric columns only. numeric_only=True is needed on
# pandas >= 2.0, where the default no longer silently drops the string
# 'Country' column and the call raises TypeError instead. The result is the
# same four numeric columns shown in the output.
df.median(numeric_only=True)

Year                2.008000e+03
Life expectancy     7.210000e+01
GDP                 1.766948e+03
Population          1.386542e+06
dtype: float64

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.259210,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0
2,Afghanistan,2013,59.9,631.744976,31731688.0
3,Afghanistan,2012,59.5,669.959000,3696958.0
4,Afghanistan,2011,59.2,63.537231,2978599.0
...,...,...,...,...,...
2933,Zimbabwe,2004,44.3,454.366654,12777511.0
2934,Zimbabwe,2003,44.5,453.351155,12633897.0
2935,Zimbabwe,2002,44.8,57.348340,125525.0
2936,Zimbabwe,2001,45.3,548.587312,12366165.0


In [None]:
excel.idxmax()

S.No.                78
MTE (25)             25
Mini Project (25)    12
Total (50)           25
ETE (50)             24
Total                24
dtype: int64

In [None]:
excel.idxmin()

S.No.                 0
MTE (25)             48
Mini Project (25)     3
Total (50)           71
ETE (50)             73
Total                76
dtype: int64

In [None]:
df.cumsum()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,5.842592e+02,3.373649e+07
1,AfghanistanAfghanistan,4029,124.9,1.196956e+03,3.406408e+07
2,AfghanistanAfghanistanAfghanistan,6042,184.8,1.828701e+03,6.579576e+07
3,AfghanistanAfghanistanAfghanistanAfghanistan,8054,244.3,2.498660e+03,6.949272e+07
4,AfghanistanAfghanistanAfghanistanAfghanistanAf...,10065,303.5,2.562197e+03,7.247132e+07
...,...,...,...,...,...
2933,AfghanistanAfghanistanAfghanistanAfghanistanAf...,5890084,202510.0,1.863146e+07,2.911687e+10
2934,AfghanistanAfghanistanAfghanistanAfghanistanAf...,5892087,202554.5,1.863191e+07,2.912950e+10
2935,AfghanistanAfghanistanAfghanistanAfghanistanAf...,5894089,202599.3,1.863197e+07,2.912963e+10
2936,AfghanistanAfghanistanAfghanistanAfghanistanAf...,5896090,202644.6,1.863252e+07,2.914199e+10


In [None]:
df.describe()

Unnamed: 0,Year,Life expectancy,GDP,Population
count,2938.0,2928.0,2490.0,2286.0
mean,2007.51872,69.224932,7483.158469,12753380.0
std,4.613841,9.523867,14270.169342,61012100.0
min,2000.0,36.3,1.68135,34.0
25%,2004.0,63.1,463.935626,195793.2
50%,2008.0,72.1,1766.947595,1386542.0
75%,2012.0,75.7,5910.806335,7420359.0
max,2015.0,89.0,119172.7418,1293859000.0


# Selecting and Indexing

In [None]:
df.columns

Index(['Country', 'Year', 'Life expectancy ', 'GDP', 'Population'], dtype='object')

### First 5 rows : `head` 

In [None]:
df.head()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.25921,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0
2,Afghanistan,2013,59.9,631.744976,31731688.0
3,Afghanistan,2012,59.5,669.959,3696958.0
4,Afghanistan,2011,59.2,63.537231,2978599.0


In [None]:
df.head(2)

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.25921,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0


### Last 5 rows: `tail`

In [None]:
df.tail()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
2933,Zimbabwe,2004,44.3,454.366654,12777511.0
2934,Zimbabwe,2003,44.5,453.351155,12633897.0
2935,Zimbabwe,2002,44.8,57.34834,125525.0
2936,Zimbabwe,2001,45.3,548.587312,12366165.0
2937,Zimbabwe,2000,46.0,547.358878,12222251.0


In [None]:
df.tail(3)

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
2935,Zimbabwe,2002,44.8,57.34834,125525.0
2936,Zimbabwe,2001,45.3,548.587312,12366165.0
2937,Zimbabwe,2000,46.0,547.358878,12222251.0


### Selecting a range of rows by position

In [None]:
# Rows at positions 25 through 30 (the end of the slice is exclusive).
df.iloc[25:31]

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
25,Albania,2006,74.2,35.1293,2992547.0
26,Albania,2005,73.5,279.142931,311487.0
27,Albania,2004,73.0,2416.588235,326939.0
28,Albania,2003,72.8,189.681557,339616.0
29,Albania,2002,73.3,1453.642777,3511.0
30,Albania,2001,73.6,1326.97339,36173.0


### Random rows: `sample`

In [None]:
df.sample()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
2667,Tunisia,2013,74.9,4199.47253,1114558.0


In [None]:
df.sample(8)

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
1208,Indonesia,2009,67.9,2254.44566,23934478.0
2933,Zimbabwe,2004,44.3,454.366654,12777511.0
1371,Kenya,2006,55.3,697.66385,37525.0
482,Cameroon,2013,56.4,1365.34419,21655715.0
1468,Lebanon,2005,73.9,5339.441291,3986852.0
17,Albania,2014,77.5,4575.763787,288914.0
2280,Serbia,2000,72.6,87.136525,7516346.0
59,Angola,2004,47.1,141.86844,18865716.0


### Selecting Columns

In [None]:
# First three rows of a two-column subset; a list of column names keeps the
# result a DataFrame.
df.loc[:, ['GDP', 'Population']].head(3)

Unnamed: 0,GDP,Population
0,584.25921,33736494.0
1,612.696514,327582.0
2,631.744976,31731688.0


### Data frame columns as Series

In [None]:
# A single column selected by name is a Series; show its first ten values.
df['GDP'].head(10)

0    584.259210
1    612.696514
2    631.744976
3    669.959000
4     63.537231
5    553.328940
6    445.893298
7    373.361116
8    369.835796
9    272.563770
Name: GDP, dtype: float64

In [None]:
# One randomly chosen value from the GDP column (still returned as a Series).
df['GDP'].sample(n=1)

1208    2254.44566
Name: GDP, dtype: float64

### Creating a new column in `df`

In [None]:
# Add a derived column: row-wise product of GDP and Population.
df['new_col'] = df['GDP'].mul(df['Population'])

In [None]:
df.head()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population,new_col
0,Afghanistan,2015,65.0,584.25921,33736494.0,19710860000.0
1,Afghanistan,2014,59.9,612.696514,327582.0,200708300.0
2,Afghanistan,2013,59.9,631.744976,31731688.0,20046330000.0
3,Afghanistan,2012,59.5,669.959,3696958.0,2476810000.0
4,Afghanistan,2011,59.2,63.537231,2978599.0,189251900.0


In [None]:
# Remove the demo column again, in place, so df is back to its original five
# columns for the cells that follow.
del df['new_col']

In [None]:
df.head()

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
0,Afghanistan,2015,65.0,584.25921,33736494.0
1,Afghanistan,2014,59.9,612.696514,327582.0
2,Afghanistan,2013,59.9,631.744976,31731688.0
3,Afghanistan,2012,59.5,669.959,3696958.0
4,Afghanistan,2011,59.2,63.537231,2978599.0


### Selecting Rows

`loc`

In [None]:
df.index.values

array([   0,    1,    2, ..., 2935, 2936, 2937])

In [None]:
df.loc[287]

Country                  Benin
Year                      2000
Life expectancy           55.4
GDP                 374.192394
Population           6865951.0
Name: 287, dtype: object

In [None]:
# Passing a list of labels ([287]) instead of a scalar label returns a one-row
# DataFrame (2-D) rather than a Series.
df.loc[[287]]

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
287,Benin,2000,55.4,374.192394,6865951.0


`iloc`

In [None]:
df.iloc[287]

Country                  Benin
Year                      2000
Life expectancy           55.4
GDP                 374.192394
Population           6865951.0
Name: 287, dtype: object

In [None]:
df.iloc[[287]]

Unnamed: 0,Country,Year,Life expectancy,GDP,Population
287,Benin,2000,55.4,374.192394,6865951.0


In [None]:
df1

Unnamed: 0,w,x,y,z
A,0.040653,0.129842,-0.60177,-0.606664
B,0.262595,-0.15214,-0.283796,-1.239393
C,0.653475,0.328819,-0.113489,-0.341411
D,-1.007972,0.399544,0.146413,-0.954584
E,-0.333404,-1.024035,1.946649,0.4057


In [None]:
df1.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [None]:
df1.iloc[[1]]

Unnamed: 0,w,x,y,z
B,0.262595,-0.15214,-0.283796,-1.239393


In [None]:
df1.loc[['B']]

Unnamed: 0,w,x,y,z
B,0.262595,-0.15214,-0.283796,-1.239393


In [None]:
df1.loc['B']

w    0.262595
x   -0.152140
y   -0.283796
z   -1.239393
Name: B, dtype: float64

In [None]:
df1.iloc[1]

w    0.262595
x   -0.152140
y   -0.283796
z   -1.239393
Name: B, dtype: float64

In [None]:
df1.loc[['B','E']]

Unnamed: 0,w,x,y,z
B,0.262595,-0.15214,-0.283796,-1.239393
E,-0.333404,-1.024035,1.946649,0.4057


### Subsetting Rows and Cols

df1.loc[[rows], [cols]]

In [None]:
df1.loc[['B','E'], ['x','y']]

Unnamed: 0,x,y
B,-0.15214,-0.283796
E,-1.024035,1.946649


In [None]:
df.loc[[23, 87],['Country','Population','GDP']]

Unnamed: 0,Country,Population,GDP
23,Albania,2947314.0,437.539647
87,Argentina,4382389.0,8953.359275


### Boolean Indexing

In [None]:
df['GDP'] > 1893

0       False
1       False
2       False
3       False
4       False
        ...  
2933    False
2934    False
2935    False
2936    False
2937    False
Name: GDP, Length: 2938, dtype: bool

In [None]:
# Number of rows whose GDP exceeds 1893 (rows with a missing GDP never match).
len(df.loc[df['GDP'].gt(1893)])

1212

In [None]:
excel.head()

Unnamed: 0,S.No.,MTE (25),Mini Project (25),Total (50),ETE (50),Total
0,1,5.0,20,25.0,12.0,37.0
1,2,11.05,20,31.05,26.0,57.05
2,3,8.1,20,28.1,14.0,42.1
3,4,6.0,10,16.0,13.0,29.0
4,5,11.35,20,31.35,17.0,48.35


In [None]:
# Keep only the rows that clear both thresholds: MTE above 13 and ETE above 17.
shorted_list = excel.loc[excel['MTE (25)'].gt(13.0) & excel['ETE (50)'].gt(17.0)]

In [None]:
# From the shortlist, show the rows whose overall Total is above 90.
shorted_list.loc[shorted_list['Total'].gt(90.0)]

Unnamed: 0,S.No.,MTE (25),Mini Project (25),Total (50),ETE (50),Total
24,26,22.5,22,44.5,50.0,94.5
25,27,23.5,22,45.5,47.0,92.5
