# A Simple Walk-through with Pandas for Data Science, Part 1

Complete tutorial accessible via: https://neuraspike.com/blog/simple-walk-through-with-pandas-for-data-science-part1/

In [None]:
import pandas as pd

# Show which pandas release is installed. The value recorded below is the one
# the author noted; newer releases may format some outputs differently.
print(pd.__version__)

# ------ output -------
# 1.1.4

## Creating Pandas Objects

In [None]:
import numpy as np

### pandas.Series()

#### Python Lists

In [None]:
# A Series pairs every value with a label. The values come from a plain Python
# list and the labels are supplied explicitly through `index`.
data = pd.Series(
    [0, 1, 2, 3],
    index=['first', 'second', 'third', 'fourth'],
)

print(data)

# ------ output -------
# first     0
# second    1
# third     2
# fourth    3
# dtype: int64

In [None]:
print(data.index) # the row labels, returned as a pandas Index object

# ------ output -------
# Index(['first', 'second', 'third', 'fourth'], dtype='object')

In [None]:
print(data.values) # the stored values on their own, without the labels

# ------ output -------
# [0 1 2 3]

In [None]:
# Position-based access goes through `.iloc`. A bare integer key such as
# `data[0]` on a Series with string labels only works through a positional
# fallback that pandas deprecated in 2.1, so the positional indexer is used
# explicitly here.
print(data.iloc[0]) # value stored at position 0
# ------ output -------
# 0

print(data.iloc[0:2]) # positions 0 and 1; the stop position is excluded

# ------ output -------
# first     0
# second    1
# dtype: int64

In [None]:
print(data['first']) # look up a single value by its label
# ------ output -------
# 0

print(data['first':'second']) # label slice: the end label 'second' is included in the result

# ------ output -------
# first     0
# second    1
# dtype: int64

#### Dictionaries

In [None]:
# Plain Python mapping of label -> value, listed in the order it will be printed.
data_dict = dict(first=0, second=1, third=2, fourth=3)

print(data_dict)

# ------ output -------
# {'first': 0, 'second': 1, 'third': 2, 'fourth': 3}

In [None]:
# Build a Series straight from the dict: as the output shows, the keys become
# the index labels and the dict values become the data.
data = pd.Series(data_dict)
print(data)

# ------ output -------
# first     0
# second    1
# third     2
# fourth    3
# dtype: int64

#### NumPy Array

In [None]:
# Start from a NumPy array holding the integers 0 through 3.
array = np.arange(0, 4)
print(array)

# ------ output -------
# [0 1 2 3]


# No labels are supplied, so the Series gets the default 0..n-1 integer index.
data = pd.Series(data=array)
print(data)

# ------ output -------
# 0    0
# 1    1
# 2    2
# 3    3
# dtype: int64

#### Scalar Value or Constant

In [None]:
# A single scalar is repeated for every label given in `index`.
print(pd.Series(10, index=[1, 2, 3, 4, 5]))

# ------ output -------
# 1    10
# 2    10
# 3    10
# 4    10
# 5    10
# dtype: int64

### pandas.DataFrame()

In [None]:
# One record per company, each kept as a plain dict with the same two keys.
company_info1 = dict(name='Google', year=1998)
print(company_info1)

# ------ output -------
# {'name': 'Google', 'year': 1998}


company_info2 = dict(name='Facebook', year=2004)
print(company_info2)

# ------ output -------
# {'name': 'Facebook', 'year': 2004}

In [None]:
# Two parallel lists: position i in each list describes the same company.
tech_companies = ['Google', 'Facebook', 'Nvidia', 'Microsoft']
year_founded = [1998, 2004, 1993, 1975]

# Each key becomes a column name and each list becomes that column's values;
# the rows receive the default 0..n-1 index.
companies_info = pd.DataFrame(data=dict(name=tech_companies, year=year_founded))

print(companies_info)

# ------ output -------
#         name  year
# 0     Google  1998
# 1   Facebook  2004
# 2     Nvidia  1993
# 3  Microsoft  1975

In [None]:
# Row labels: no index was supplied, so this is the default integer range.
print(companies_info.index)

# ------ output -------
# RangeIndex(start=0, stop=4, step=1)


# Column labels: also held in an Index object.
print(companies_info.columns)

# ------ output -------
# Index(['name', 'year'], dtype='object')

In [None]:
print(companies_info['name']) # select the 'name' column; the result is a Series

# ------ output -------
# 0       Google
# 1     Facebook
# 2       Nvidia
# 3    Microsoft
# Name: name, dtype: object


print(companies_info['year']) # select the 'year' column; the result is a Series

# ------ output -------
# 0    1998
# 1    2004
# 2    1993
# 3    1975
# Name: year, dtype: int64

### pandas.Index()

In [None]:
# An Index can also be built on its own, here from the integers 0 through 5.
index = pd.Index(data=np.arange(0, 6))
print(index)

# The repr below is the one recorded by the author; pandas 2.0 and later are
# expected to print `Index([...], dtype='int64')` instead - confirm on your
# installed version.
# ------ output -------
# Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [None]:
# A single integer position returns the element stored there.
print(index[2])

# ------ output -------
# 2

# A slice returns another Index; the stop position (4) is excluded.
print(index[2:4])

# ------ output -------
# Int64Index([2, 3], dtype='int64')

In [None]:
print(index.shape) # shape, as a tuple
print(index.size) # total number of elements
print(index.ndim) # number of dimensions
print(index.dtype) # data type of the elements

# ------ output -------
# (6,)
# 6
# 1
# int64

In [None]:
# An Index cannot be modified in place. The failing assignment is the point of
# this cell, so the error is caught and printed rather than left to abort a
# "Restart & Run All".
try:
    index[0] = 2
except TypeError as err:
    print(f"TypeError: {err}")

# ------ output -------
# TypeError: Index does not support mutable operations

## Data Indexing and Selection

#### Data Indexing and selection in Series

In [None]:
# Map each country (the label) to the continent it belongs to (the value).
continent = pd.Series(
    ['Africa', 'Europe', 'Asia', 'North America'],
    index=['Nigeria', 'Serbia', 'China', 'USA'],
)
print(continent)

# ------ output -------
# Nigeria           Africa
# Serbia            Europe
# China               Asia
# USA        North America
# dtype: object

In [None]:
# Assigning to a label that is not in the Series yet adds a new entry at the
# end, much like adding a key to a dict. This modifies `continent` in place.
continent['Suriname'] = "South America"
print(continent)

# ------ output -------
# Nigeria            Africa
# Serbia             Europe
# China                Asia
# USA         North America
# Suriname    South America
# dtype: object

In [None]:
# Slicing by label: the end label ('China') IS part of the result.
print(continent['Nigeria':'China'])

# ------ output -------
# Nigeria    Africa
# Serbia     Europe
# China        Asia
# dtype: object


# Slicing by position: the stop position (2) is NOT part of the result, so only
# the first two entries come back.
print(continent[0:2])

# ------ output -------
# Nigeria    Africa
# Serbia     Europe
# dtype: object

### Indexer: loc and iloc

In [None]:
# `.loc` always works with labels; a label slice includes both endpoints.
print(continent.loc['Serbia': 'Suriname'])

# ------ output -------
# Serbia             Europe
# China                Asia
# USA         North America
# Suriname    South America
# dtype: object

In [None]:
# `.iloc` always works with integer positions, whatever the labels are.
print(continent.iloc[0])

# ------ output -------
# Africa


# Positional slice: positions 0, 1 and 2; the stop position (3) is excluded.
print(continent.iloc[0:3])

# ------ output -------
# Nigeria    Africa
# Serbia     Europe
# China        Asia
# dtype: object

#### Data Indexing and selection in DataFrames

In [None]:
# Three Series that share the same country labels. `continent` is rebuilt here
# with its four original entries.
continent = pd.Series(
    ['Africa', 'Europe', 'Asia', 'North America'],
    index=['Nigeria', 'Serbia', 'China', 'USA'],
)

language = pd.Series(
    ['English', 'Serbian', 'Mandarin', 'English'],
    index=['Nigeria', 'Serbia', 'China', 'USA'],
)

population = pd.Series(
    [195_900_000, 6_964_000, 1_393_000_000, 328_200_000],
    index=['Nigeria', 'Serbia', 'China', 'USA'],
)

# Each Series becomes one column; the shared labels become the row index.
data = pd.DataFrame(dict(continent=continent,
                         language=language,
                         population=population))

print(data)

# ------ output -------
#              continent  language  population
# Nigeria         Africa   English   195900000
# Serbia          Europe   Serbian     6964000
# China             Asia  Mandarin  1393000000
# USA      North America   English   328200000

In [None]:
# `.values` exposes the table as an array; position 3 is its fourth row.
print(data.values[3])

# ------ output -------
# ['North America' 'English' 328200000]


In [None]:
# Select one column by name; the result is a Series keyed by the row labels.
print(data['language']) # OR print(data.language)

# ------ output -------
# Nigeria     English
# Serbia      Serbian
# China      Mandarin
# USA         English
# Name: language, dtype: object


In [None]:
# `.iloc[rows, columns]` by position: the first two rows and first two columns.
print(data.iloc[:2, :2])

# ------ output -------
#         continent language
# Nigeria    Africa  English
# Serbia     Europe  Serbian


# `.loc[rows, columns]` by label: every row up to and including 'China',
# restricted to the 'language' column.
print(data.loc[:'China', 'language'])

# ------ output -------
# Nigeria     English
# Serbia      Serbian
# China      Mandarin
# Name: language, dtype: object

In [None]:
# Threshold used to filter the countries.
ten_million = 10_000_000

# Indexing with a boolean Series keeps only the rows where it is True.
print(data[data['population'] > ten_million])

# ------ output -------
#              continent  language  population
# Nigeria         Africa   English   195900000
# China             Asia  Mandarin  1393000000
# USA      North America   English   328200000


In [None]:
# The comparison alone produces the boolean mask: one True/False per row.
print(data.population > ten_million)

# ------ output -------
# Nigeria     True
# Serbia     False
# China       True
# USA         True
# Name: population, dtype: bool

In [None]:

# Same threshold as before, restated so this cell reads on its own.
ten_million = 10 ** 7

# `.loc` accepts a boolean mask for the rows plus a list of column labels;
# passing the column as a list keeps the result a DataFrame.
print(data.loc[data['population'] > ten_million, ['continent']])

# ------ output -------
#              continent
# Nigeria         Africa
# China             Asia
# USA      North America

In [None]:
# Read a single cell by position: row 0, column 2.
print(data.iloc[0, 2] )

# ------ output -------
# 195900000


# The same indexer can be assigned to, which changes `data` in place. Because
# of that, running this cell a second time prints the updated value both times.
data.iloc[0, 2] = 195900123
print(data.iloc[0, 2] )

# ------ output -------
# 195900123