# Introduction to NumPy and pandas

## numpy

In [1]:
import numpy as np
arr = np.array([11,22,33])  # 1D
arr

array([11, 22, 33])

In [2]:
type(arr)

numpy.ndarray

In [4]:
# 2D array
list_data = [[11, 22, 33], [44,55,66]]
arr = np.array(list_data)
arr

array([[11, 22, 33],
       [44, 55, 66]])

In [5]:
type(arr)

numpy.ndarray

In [6]:
arr.shape  #(rows, columns)

(2, 3)

In [8]:
# list
list(range(10,20,2))

[10, 12, 14, 16, 18]

In [10]:
# generate an array
np.arange(10,20,2)  # arange(start, stop, step)

array([10, 12, 14, 16, 18])

In [11]:
np.zeros(4)

array([0., 0., 0., 0.])

In [12]:
np.zeros((3,4))  # zeros(shape)

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [14]:
# np.linspace(start, stop, no_of_values): linearly (equally) spaced vector.
# Returns `no_of_values` samples spread evenly from start to stop;
# both endpoints are included (here: 4 values from 0 to 25).
vector = np.linspace(0,25,4)
vector

array([ 0.        ,  8.33333333, 16.66666667, 25.        ])

In [15]:
vector = np.linspace(0,20,4)
vector

array([ 0.        ,  6.66666667, 13.33333333, 20.        ])

In [17]:
vector = np.linspace(0,20,5)
vector

array([ 0.,  5., 10., 15., 20.])

In [18]:
# how to find the data type of an array
# arr.dtype
vector.dtype

dtype('float64')

In [23]:
# Reshape a flat array: the 8 elements are laid out as a 2 x 2 x 2 block.
arr_1d = np.zeros(8)
arr_3d = arr_1d.reshape(2, 2, 2)
print(arr_1d, arr_3d, sep="\n")

[0. 0. 0. 0. 0. 0. 0. 0.]
[[[0. 0.]
  [0. 0.]]

 [[0. 0.]
  [0. 0.]]]


In [25]:
arr_1d = np.zeros(8)
print(arr_1d)
print("#"*20)
arr_2d = arr_1d.reshape((2,4))
print(arr_2d)

[0. 0. 0. 0. 0. 0. 0. 0.]
####################
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [26]:
# flatten the array
print(arr_2d)
print("*"*20)
arr_1d = arr_2d.ravel()
print(arr_1d)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]
********************
[0. 0. 0. 0. 0. 0. 0. 0.]


In [28]:
# 3D -> 2D: a reshape may change the number of dimensions as long as
# the total element count (8 here) stays the same.
arr_1d = np.zeros(8)
arr_3d = arr_1d.reshape(2, 2, 2)
arr_2d = arr_3d.reshape(4, 2)

print(arr_1d, arr_3d, "#######", arr_2d, sep="\n")

[0. 0. 0. 0. 0. 0. 0. 0.]
[[[0. 0.]
  [0. 0.]]

 [[0. 0.]
  [0. 0.]]]
#######
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


### Indexing and Slicing

In [29]:
arr = np.array([11,22, 33, 44, 55, 66, 77, 88, 99])
arr

array([11, 22, 33, 44, 55, 66, 77, 88, 99])

In [30]:
arr[2]

33

In [31]:
arr[1:7]  # 1 to 6


array([22, 33, 44, 55, 66, 77])

In [32]:
arr[:3]

array([11, 22, 33])

In [33]:
arr = np.array([11,22, 33, 44, 55, 66, 77, 88, 99])
# slice(start, stop, step) builds a reusable slice object; arr[slice_obj] is the same as arr[1:10:2].
# stop=10 is past the last index (8), so the slice simply runs to the end of the array.
slice_obj = slice(1,10,2)
arr[slice_obj]

array([22, 44, 66, 88])

In [35]:
# accessing multiple values together
list_of_indexes = [1,2,3]
arr[list_of_indexes]

array([22, 33, 44])

In [36]:
arr[[3,4,6]]

array([44, 55, 77])

### slicing 2D array


In [37]:
arr = np.array([[11,22,33,44],[55, 66, 77, 88], [1, 2, 3, 4], [100, 200, 300, 400]])
arr

array([[ 11,  22,  33,  44],
       [ 55,  66,  77,  88],
       [  1,   2,   3,   4],
       [100, 200, 300, 400]])

In [38]:
# syntax: arr[row_slice, column_slice]
arr[0:3,0:2]

array([[11, 22],
       [55, 66],
       [ 1,  2]])

In [39]:
arr[0:3,0:3]

array([[11, 22, 33],
       [55, 66, 77],
       [ 1,  2,  3]])

In [40]:
arr = np.array([11,22, 33, 44, 55, 66, 77, 88, 99])
arr

array([11, 22, 33, 44, 55, 66, 77, 88, 99])

In [44]:
# slicing using negative indexes
arr[-1:-10:-3] # arr[8], arr[5], arr[2] -> walks backwards from the last element in steps of 3

array([99, 66, 33])

## array attributes

In [47]:
arr = np.array([11, 22, 33])
print(arr.dtype)

int64


In [48]:
# choosing the dtype
arr = np.array([11, 22, 33], dtype=np.int8)
print(arr)
print(arr.dtype)

[11 22 33]
int8


In [49]:
# choosing the dtype
arr = np.array([11, 22, 33], dtype=np.float16)
print(arr)
print(arr.dtype)

[11. 22. 33.]
float16


In [52]:
arr = np.array([[2,3, 4], [4,5, 6]])
# shape
arr.shape

(2, 3)

In [53]:
# dimensions
arr.ndim

2

In [56]:
# size of each element in bytes
arr.itemsize

8

In [55]:
arr.dtype

dtype('int64')

### empty array

In [64]:
arr = np.empty([3,4])
arr  # returns an uninitialised array: the values are arbitrary, not guaranteed zeros
# most of the time the elements happen to be 0, but do not rely on it.

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Reading and Writing from text files

In [70]:
# reading a text file and loading it in array
arr = np.loadtxt('filex.txt', dtype=np.int8)
arr

array([[ 1,  2,  3,  4,  5,  6,  7],
       [ 8,  3,  4,  5,  5,  1,  8],
       [ 6,  3,  4,  5,  5,  1,  8],
       [ 6,  3,  4,  5,  5,  1,  8],
       [ 4,  3,  4,  5,  5,  1, 10]], dtype=int8)

In [79]:
# export an array to a text file
# fmt="%d" writes each value as an integer
np.savetxt('out_integers.txt', arr,fmt="%d")

In [80]:
# fmt="%.2f" writes each value as a float with 2 decimal places
np.savetxt('out_float.txt', arr,fmt="%.2f")

### Exporting to a CSV file: add delimiter=','


In [81]:
np.savetxt('out_int.csv', arr,fmt="%d", delimiter=',')

In [82]:
np.savetxt('out_dollar.txt', arr,fmt="%d", delimiter='$')

In [84]:
# loading data from CSV file
# 1. np.genfromtxt()
arr = np.genfromtxt('out_int.csv', delimiter=',')
arr

array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
       [ 8.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 6.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 6.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 4.,  3.,  4.,  5.,  5.,  1., 10.]])

In [85]:
# loading data from CSV file
# 2. np.loadtxt()
arr = np.loadtxt('out_int.csv', delimiter=',')
arr

array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
       [ 8.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 6.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 6.,  3.,  4.,  5.,  5.,  1.,  8.],
       [ 4.,  3.,  4.,  5.,  5.,  1., 10.]])

# pandas

In [86]:
import pandas as pd
#1 using list
data = [11, 22, 33, 44]
ser = pd.Series(data)
ser

0    11
1    22
2    33
3    44
dtype: int64

In [87]:
type(ser)

pandas.core.series.Series

In [88]:
# Empty Series. Pass the dtype explicitly: with no data pandas cannot infer one,
# and the default dtype of an empty Series differs between pandas versions
# (float64 in older releases, object in newer ones; some releases also warn).
empty_ser = pd.Series(dtype='float64')
empty_ser

Series([], dtype: float64)

In [89]:
# 2. using array
arr = np.array([100, 200, 300, 400])
pd.Series(arr)

0    100
1    200
2    300
3    400
dtype: int64

In [90]:
# Choosing the index labels via a dict:
# the keys become the index, the values become the data.
data = dict(a=100, b=200, c=400)
ser = pd.Series(data)
ser

a    100
b    200
c    400
dtype: int64

In [91]:
ser['a'] # accessing value using index

100

In [94]:
# choosing index by providing list of index names
data = [11, 22, 33]
ser = pd.Series(data, index=['aa', 'bb', 'cc'])
ser

aa    11
bb    22
cc    33
dtype: int64

In [93]:
ser['aa']

11

In [95]:
# slicing
ser = pd.Series([11, 22, 33, 44, 55, 66, 77, 88])
ser[1:5]  #1,2,3,4

1    22
2    33
3    44
4    55
dtype: int64

In [96]:
ser[:3]  # 0,1,2

0    11
1    22
2    33
dtype: int64

In [97]:
ser[4:] # from index 4 till the end

4    55
5    66
6    77
7    88
dtype: int64

## DataFrame

In [98]:
data = [10, 20, 30, 40]
df = pd.DataFrame(data)
df

Unnamed: 0,0
0,10
1,20
2,30
3,40


In [99]:
# list of dictionaries
data = [{'a':1, 'b':2}, {'a':10, 'b':20, 'c':30}]
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,
1,10,20,30.0


In [100]:
# NaN: not a number

In [101]:
# giving user defined row index 
data = [{'a':1, 'b':2}, {'a':10, 'b':20, 'c':30}]
df = pd.DataFrame(data, index=['first', 'second'])
df

Unnamed: 0,a,b,c
first,1,2,
second,10,20,30.0


In [102]:
# building a DataFrame from a dict of Series: each key becomes a column,
# rows are aligned on the union of the Series indexes, and a label missing
# from a Series (here 'd' in column 'one') shows up as NaN
data = {'one':pd.Series([10,20,30], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,10.0,10
b,20.0,20
c,30.0,30
d,,40


In [103]:
# column addition
data = {'one':pd.Series([10,20,30], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,10.0,10
b,20.0,20
c,30.0,30
d,,40


In [104]:
# adding new column
df['srinivas'] = pd.Series([11, 22, 33], index=['a','b', 'c'])
df

Unnamed: 0,one,two,srinivas
a,10.0,10,11.0
b,20.0,20,22.0
c,30.0,30,33.0
d,,40,


In [105]:
df['gaurav'] = [100, 200, 300, 400]
df

Unnamed: 0,one,two,srinivas,gaurav
a,10.0,10,11.0,100
b,20.0,20,22.0,200
c,30.0,30,33.0,300
d,,40,,400


In [106]:
# access a column: df[<column_name>]
df['gaurav']

a    100
b    200
c    300
d    400
Name: gaurav, dtype: int64

In [107]:
# deleting a column
# 1. using "del"
del df['one']
df

Unnamed: 0,two,srinivas,gaurav
a,10,11.0,100
b,20,22.0,200
c,30,33.0,300
d,40,,400


In [112]:
# 2. using pop(): removes the column from df in place and returns it as a Series
data = {'one':pd.Series([10,20,30], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
# (a bare `df` here would display nothing: only the last expression of a cell is shown)
print("deleted column")
print(df.pop('two'))
print("#"*20)
# df now holds only the remaining column
print(df)

deleted column
a    10
b    20
c    30
d    40
Name: two, dtype: int64
####################
    one
a  10.0
b  20.0
c  30.0
d   NaN


## Row selection in a DataFrame
- loc[Row_label]: Row label is the user-defined name of the row
- iloc[Row_Position]: Row position is position of row from the top of the dataframe starting from 0.

In [114]:
data = {'one':pd.Series([10,20,30], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,10.0,10
b,20.0,20
c,30.0,30
d,,40


In [115]:
df.loc['a'] # returns row with the label 'a' ie first row

one    10.0
two    10.0
Name: a, dtype: float64

In [116]:
df.loc['d'] # returns row with the label 'd' i.e last row

one     NaN
two    40.0
Name: d, dtype: float64

In [117]:
# using iloc
df.iloc[0]  # equivalent to -> df.loc['a'] 

one    10.0
two    10.0
Name: a, dtype: float64

In [118]:
df.iloc[3]  # equivalent to -> df.loc['d'] 

one     NaN
two    40.0
Name: d, dtype: float64

### Row addition

In [122]:
data = {'one':pd.Series([1,2,3], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd']),
       'three':pd.Series([10,20,30], index=['a', 'b', 'c'])}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,one,two,three
a,1.0,10,10.0
b,2.0,20,20.0
c,3.0,30,30.0
d,,40,


In [123]:
# Second frame, built column by column: it only has 'two' and 'three'
# and gets the default 0..1 row index.
df2 = pd.DataFrame({'two': [11, 13], 'three': [12, 14]})
df2

Unnamed: 0,two,three
0,11,12
1,13,14


In [125]:
# Append the rows of df2 to df1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. sort=True orders the combined columns alphabetically,
# and the row labels of both frames are kept as they are (no ignore_index).
new_df = pd.concat([df1, df2], sort=True)
new_df

Unnamed: 0,one,three,two
a,1.0,10.0,10
b,2.0,20.0,20
c,3.0,30.0,30
d,,,40
0,,12.0,11
1,,14.0,13


In [127]:
new_df.loc['a']

one       1.0
three    10.0
two      10.0
Name: a, dtype: float64

In [128]:
new_df.loc[0]

one       NaN
three    12.0
two      11.0
Name: 0, dtype: float64

In [129]:
new_df.iloc[0]

one       1.0
three    10.0
two      10.0
Name: a, dtype: float64

In [132]:
new_df.iloc[0]

one       1.0
three    10.0
two      10.0
Name: a, dtype: float64

In [133]:
new_df.iloc[0]['one']

1.0

In [134]:
new_df.iloc[0]['three']

10.0

In [135]:
new_df.iloc[4]

one       NaN
three    12.0
two      11.0
Name: 0, dtype: float64

#### deleting a row
- df.drop(row_label OR row_index)
- The drop method can also be used to delete a column. Syntax: df.drop(column_name, axis=1)

In [137]:
data = {'one':pd.Series([10,20,30], index=['a', 'b', 'c']),
       'two':pd.Series([10,20,30,40], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two
a,10.0,10
b,20.0,20
c,30.0,30
d,,40


In [139]:
new_df = df.drop('a')  # axis=0 by default
new_df

Unnamed: 0,one,two
b,20.0,20
c,30.0,30
d,,40


In [140]:
df = df.drop('two', axis=1)
df

Unnamed: 0,one
a,10.0
b,20.0
c,30.0
d,


## Loading a CSV file into a DataFrame

In [144]:
# NOTE(review): this is an absolute path on one user's machine, so the cell fails anywhere else.
# Consider a path relative to the notebook (or a configurable data directory) -- TODO confirm
# where AllCountries.csv is meant to live.
df = pd.read_csv(r'/Users/gyanendrakumar/PycharmProjects/education_DS/Module_5/AllCountries.csv')
df
        

Unnamed: 0,Country,LandArea,Population,GDP,Rural,Internet,BirthRate,DeathRate,ElderlyPop,LifeExpectancy,FemaleLabor,Unemployment
0,Afghanistan,652.860,30.552,665.0,74.1,5.9,34.1,7.8,2.3,60.9,16.2,
1,Albania,27.400,2.897,4460.0,44.6,60.1,12.9,6.9,10.7,77.5,51.7,13.4
2,Algeria,2381.740,39.208,5361.0,30.5,16.5,24.3,5.9,4.6,71.0,16.2,11.0
3,American Samoa,0.200,0.055,,12.7,,,,,,,
4,Andorra,0.470,0.079,,13.8,94.0,,,,,,
5,Angola,1246.700,21.472,5783.0,57.5,19.1,44.1,13.9,2.4,51.9,64.5,
6,Antigua and Barbuda,0.440,0.090,13342.0,75.4,63.4,16.5,6.1,7.1,75.8,,
7,Argentina,2736.690,41.446,14715.0,8.5,59.9,16.8,7.7,10.9,76.2,55.1,7.2
8,Armenia,28.470,2.977,3505.0,37.0,46.3,13.7,8.7,10.3,74.5,58.4,
9,Aruba,0.180,0.103,,57.9,78.9,10.3,8.4,11.3,75.3,,


In [146]:
df.head() # by default first 5 rows

Unnamed: 0,Country,LandArea,Population,GDP,Rural,Internet,BirthRate,DeathRate,ElderlyPop,LifeExpectancy,FemaleLabor,Unemployment
0,Afghanistan,652.86,30.552,665.0,74.1,5.9,34.1,7.8,2.3,60.9,16.2,
1,Albania,27.4,2.897,4460.0,44.6,60.1,12.9,6.9,10.7,77.5,51.7,13.4
2,Algeria,2381.74,39.208,5361.0,30.5,16.5,24.3,5.9,4.6,71.0,16.2,11.0
3,American Samoa,0.2,0.055,,12.7,,,,,,,
4,Andorra,0.47,0.079,,13.8,94.0,,,,,,


In [148]:
df.head(2)

Unnamed: 0,Country,LandArea,Population,GDP,Rural,Internet,BirthRate,DeathRate,ElderlyPop,LifeExpectancy,FemaleLabor,Unemployment
0,Afghanistan,652.86,30.552,665.0,74.1,5.9,34.1,7.8,2.3,60.9,16.2,
1,Albania,27.4,2.897,4460.0,44.6,60.1,12.9,6.9,10.7,77.5,51.7,13.4


In [149]:
df['GDP']

0        665.0
1       4460.0
2       5361.0
3          NaN
4          NaN
5       5783.0
6      13342.0
7      14715.0
8       3505.0
9          NaN
10     67463.0
11     50511.0
12      7812.0
13     22312.0
14     24689.0
15       958.0
16         NaN
17      7575.0
18     46930.0
19      4894.0
20       805.0
21         NaN
22      2363.0
23      2868.0
24      4662.0
25      7315.0
26     11208.0
27     38563.0
28      7499.0
29       761.0
        ...   
185    60381.0
186    84748.0
187        NaN
188     1037.0
189      913.0
190     5779.0
191        NaN
192      636.0
193     4427.0
194    18373.0
195     4317.0
196    10972.0
197     7987.0
198        NaN
199     3880.0
200      657.0
201     3900.0
202    43049.0
203    41781.0
204    53042.0
205    16351.0
206     1878.0
207     3277.0
208    14415.0
209     1911.0
210        NaN
211        NaN
212     1473.0
213     1845.0
214      953.0
Name: GDP, Length: 215, dtype: float64

In [150]:
df[['Country','GDP']]

Unnamed: 0,Country,GDP
0,Afghanistan,665.0
1,Albania,4460.0
2,Algeria,5361.0
3,American Samoa,
4,Andorra,
5,Angola,5783.0
6,Antigua and Barbuda,13342.0
7,Argentina,14715.0
8,Armenia,3505.0
9,Aruba,


In [151]:
df[['Country','GDP']].head()

Unnamed: 0,Country,GDP
0,Afghanistan,665.0
1,Albania,4460.0
2,Algeria,5361.0
3,American Samoa,
4,Andorra,


In [152]:
# reading non-csv file
df = pd.read_csv('anand.txt', sep='#')
df

Unnamed: 0,1,2,3,4
0,1,2,3,4
1,1,2,3,4
2,1,2,3,4


In [155]:
# header=None: the file has no header row, so the first line is read as data
# and the columns are numbered 0, 1, 2, ...
df = pd.read_csv('anand.txt', sep='#', header=None)
df

Unnamed: 0,0,1,2,3
0,10,20,30,40
1,10,20,30,40
2,10,20,30,40
3,10,20,30,40


In [158]:
df.to_csv('sujata.csv')

In [159]:
# index=False leaves the row index out of the file
# (the parameter is a boolean; None only worked because it is falsy)
df.to_csv('sujata_without_rowindex.csv', index=False)

In [160]:
# index=False and header=False: write only the data values,
# with neither row labels nor column names (use booleans rather than None)
df.to_csv('sujata_1.csv', index=False, header=False)