In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit the Kaggle input mount and print the full path of
# every file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Introduction


* Pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient in Python. Pandas is often used in tandem with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib. Pandas adopts significant parts of NumPy’s idiomatic style of array-based computing, especially array-based functions and a preference for data processing without for loops.

* While pandas adopts many coding idioms from NumPy, the biggest difference 
is that pandas is designed for working with tabular or heterogeneous data. NumPy, by 
contrast, is best suited for working with homogeneously typed numerical array data.

In [3]:
import numpy as np
import pandas as pd

### Series

A Series is a one-dimensional array-like object containing a sequence of values of the same type and an associated array of data labels, 
called its index. The simplest Series is formed from only an array of data:

In [4]:
# Build a Series from a plain list. No index is supplied, so pandas
# assigns the default integer labels 0 through N - 1.
obj = pd.Series(data=[4, 7, -5, 3])
print(obj)

# The values and the labels are exposed separately through the
# `array` and `index` attributes of the Series.
values_view = obj.array
labels_view = obj.index
print(values_view)
print(labels_view)

0    4
1    7
2   -5
3    3
dtype: int64
<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)


In [5]:
# Pass an explicit index to attach a label to every data point.
obj2 = pd.Series(data=[4, 7, -5, 3], index=["d", "b", "a", "c"])
print(obj2)

# Labels can then be used to read one value, to overwrite one value,
# or to pull out several values at once (in the order requested).
print(obj2["a"])
obj2["d"] = 6
wanted_labels = ["c", "a", "d"]
print(obj2[wanted_labels])

# NumPy functions are applied element-wise to the Series.
exponentiated = np.exp(obj2)
print(exponentiated)

d    4
b    7
a   -5
c    3
dtype: int64
-5
c    3
a   -5
d    6
dtype: int64
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [6]:
# Boolean-mask filtering, scalar multiplication and math functions are
# NumPy-style operations; each result keeps every value tied to its label.
is_positive = obj2 > 0
print(obj2[is_positive])

doubled = obj2 * 2
print(doubled)

d    6
b    7
c    3
dtype: int64
d    12
b    14
a   -10
c     6
dtype: int64


In [7]:
# A Series also behaves like a fixed-length, ordered dictionary that maps
# index labels to values, so `in` tests whether a label is present.
for label in ("b", "e"):
    print(label in obj2)

True
False


In [8]:
# A Python dictionary can be passed straight to the Series constructor:
# the keys become the index labels and the values become the data.
sdata = {
    "Ohio": 35000,
    "Texas": 71000,
    "Oregon": 16000,
    "Utah": 5000,
}
print(sdata)

obj3 = pd.Series(data=sdata)
print(obj3)

# to_dict performs the reverse conversion, Series -> dictionary.
round_tripped = obj3.to_dict()
print(round_tripped)

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}


In [9]:
# Supplying an index together with a dictionary controls which keys appear
# in the Series and in what order. Labels without a matching key
# ("California") become NaN; keys not listed in the index ("Utah") are left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(data=sdata, index=states)
print(obj4)

explanation = "Here, three values found in sdata were placed in the appropriate locations, \n but since no value for 'California' was found, it appears as NaN (Not a Number)"
print(explanation)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
Here, three values found in sdata were placed in the appropriate locations, 
 but since no value for 'California' was found, it appears as NaN (Not a Number)


In [10]:
# Missing data is detected with the top-level pandas functions isna and
# notna; each returns a Boolean Series with the same labels as its input.
for detector in (pd.isna, pd.notna):
    print(detector(obj4))

# The same check is also available as a method on the Series itself.
print("\n Instance function")
missing_mask = obj4.isna()
print(missing_mask)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

 Instance function
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [11]:
# Show both operands first, one after the other.
print(obj3, obj4, sep="\n")

print("\nArithmetic\n")
# Addition lines the two Series up by index label. A label that is missing
# (or NaN) in either operand -- California and Utah here -- is NaN in the sum.
total = obj3 + obj4
print(total)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

Arithmetic

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [12]:
# Both a Series and its index carry a `name` attribute. pandas shows them
# when the object is displayed, and other pandas features make use of them.
obj4.index.name = "state"
obj4.name = "population"

print(obj4)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [13]:
# Assigning to the `index` attribute relabels the Series in place.
# Print it before and after to show the change.
print(obj)

new_labels = ["Bob", "Steve", "Jeff", "Ryan"]
obj.index = new_labels

print(obj)

0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


### DataFrame

* A DataFrame represents a rectangular table of data and contains an ordered, named
collection of columns, each of which can be a different value type.

* The DataFrame has both a row and column index; it can be thought of
as a dictionary of Series all sharing the same index.

* There are many ways to construct a DataFrame, though one of the most common is
from a dictionary of equal-length lists or NumPy arrays:

In [14]:
# Column-oriented input: each key is a column name and each value is a
# list (all of equal length) holding that column's entries.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

# As with Series, the row index is assigned automatically; the columns
# follow the order of the keys in `data`.
frame = pd.DataFrame(data=data)
print(frame)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [15]:
# head() previews the top of a DataFrame -- the first five rows --
# which is handy when the table is large.
first_rows = frame.head()
print(first_rows)

# tail() is the counterpart for the end of the table: the last five rows.
last_rows = frame.tail()
print(last_rows)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [16]:
# An explicit `columns` sequence fixes the left-to-right order of the
# DataFrame's columns.
column_order = ["year", "state", "pop"]
pd.DataFrame(data, columns=column_order)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [17]:
# Requesting a column that has no key in the dictionary ("debt" here)
# creates that column filled with missing values.
frame2_columns = ["year", "state", "pop", "debt"]
frame2 = pd.DataFrame(data, columns=frame2_columns)

frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [18]:
# A single column comes back as a Series, whether it is looked up with
# dictionary-style brackets or with attribute (dot) access.
by_key = frame2["state"]
print(by_key)

by_attribute = frame2.state
print(by_attribute)

# The column labels themselves are available through `columns`.
print(frame2.columns)


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [19]:
# Rows are fetched with the special indexers: `loc` selects by index
# label (name) and `iloc` selects by integer position.
row_by_label = frame2.loc[1]
print(row_by_label)

row_by_position = frame2.iloc[2]
print(row_by_position)


year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object


In [23]:
# Assigning to a column overwrites it. A scalar is repeated on every row.
frame2["debt"] = 16.5
print(frame2)

# An array supplies one value per row instead (six values for six rows).
debt_values = np.arange(6.)
frame2["debt"] = debt_values
print(frame2)

   year   state  pop  debt
0  2000    Ohio  1.5  16.5
1  2001    Ohio  1.7  16.5
2  2002    Ohio  3.6  16.5
3  2001  Nevada  2.4  16.5
4  2002  Nevada  2.9  16.5
5  2003  Nevada  3.2  16.5
   year   state  pop  debt
0  2000    Ohio  1.5   0.0
1  2001    Ohio  1.7   1.0
2  2002    Ohio  3.6   2.0
3  2001  Nevada  2.4   3.0
4  2002  Nevada  2.9   4.0
5  2003  Nevada  3.2   5.0


In [26]:
# When you are assigning lists or arrays to a column, the value's length must match the
# length of the DataFrame. Assigning a Series works differently: its values are aligned
# to the DataFrame's index by label, and every row whose label is absent from the Series
# receives a missing value (NaN).
#
# NOTE: frame2 has the default integer index (0 through 5), while val is labelled
# "two", "four" and "five". None of those labels exist in frame2's index, so the
# assignment below replaces the entire "debt" column with NaN. To place the values on
# particular rows, val would need labels that occur in frame2.index (e.g. 1, 3, 4).

val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])

print(val)

print(frame2)

frame2["debt"] = val

print(frame2)

two    -1.2
four   -1.5
five   -1.7
dtype: float64
   year   state  pop  debt
0  2000    Ohio  1.5   0.0
1  2001    Ohio  1.7   1.0
2  2002    Ohio  3.6   2.0
3  2001  Nevada  2.4   3.0
4  2002  Nevada  2.9   4.0
5  2003  Nevada  3.2   5.0
   year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6   NaN
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9   NaN
5  2003  Nevada  3.2   NaN


In [28]:
# Add a Boolean column flagging the Ohio rows, then remove it again with
# the `del` keyword, which deletes a column the way it deletes a dict key.
is_ohio = frame2["state"] == "Ohio"
frame2["eastern"] = is_ohio
print(frame2)

del frame2["eastern"]
print(frame2)

# Confirm that "eastern" is gone from the column labels.
print(frame2.columns)

   year   state  pop  debt  eastern
0  2000    Ohio  1.5   NaN     True
1  2001    Ohio  1.7   NaN     True
2  2002    Ohio  3.6   NaN     True
3  2001  Nevada  2.4   NaN    False
4  2002  Nevada  2.9   NaN    False
5  2003  Nevada  3.2   NaN    False
   year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6   NaN
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9   NaN
5  2003  Nevada  3.2   NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')
