In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Introduction


* Pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient in  python. pandas is often used in tandem with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib. pandas adopts significant parts of NumPy’s idiomatic style of array-based computing, especially array-based functions and a preference for data processing without for loops

* While pandas adopts many coding idioms from NumPy, the biggestabout difference 
is that pandas is designed for working with tabular or heterogeneous data. NumPy, b 
contrast, is best suited for working with homogeneously typed numerical array data.

In [3]:
import numpy as np
import pandas as pd

### Series

A Series is a one-dimensional array-like object containing a sequence of values of the same type and an associated array of data labels, 
called its index. The simplest Series is formed from only an array of data:

In [4]:
obj = pd.Series([4, 7, -5, 3])
print(obj)

# Since we did not specify an index for the data, a
# default one consisting of the integers 0 through N - 1 is created.

# You can get the array representation and index object of the Series via
# its array and index attributes, respectively:

print(obj.array)
print(obj.index)

0    4
1    7
2   -5
3    3
dtype: int64
<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)


In [5]:
# Often, we’ll want to create a Series with an index identifying each data point with a
# label:

obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])

print(obj2)

# you can use labels in the index when selecting single
# values or a set of values

print(obj2["a"])
obj2["d"] = 6
print(obj2[["c", "a", "d"]])

print(np.exp(obj2))

d    4
b    7
a   -5
c    3
dtype: int64
-5
c    3
a   -5
d    6
dtype: int64
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [6]:
# Using NumPy-like operations, such as filtering with a Boolean
# array, scalar multiplication, or applying math functions, will preserve the index-value
# link:

print(obj2[obj2 > 0])

print( obj2 * 2)

d    6
b    7
c    3
dtype: int64
d    12
b    14
a   -10
c     6
dtype: int64


In [7]:
# Another way to think about a Series is as a fixed-length, ordered dictionary, as it is a
# mapping of index values to data values.
print("b" in obj2)
print("e" in obj2)

True
False


In [8]:
# Should you have data contained in a Python dictionary, you can create a Series from
# it by passing the dictionary:

sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}

print(sdata)

obj3 = pd.Series(sdata)

print(obj3)

# A Series can be converted back to a dictionary with its to_dict method:

print(obj3.to_dict())

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}


In [9]:
# You can override the order you want the keys to appear in the resulting Series:

states = ["California", "Ohio", "Oregon", "Texas"]

obj4 = pd.Series(sdata, index=states)

print(obj4)

print("Here, three values found in sdata were placed in the appropriate locations, \n but since no value for 'California' was found, it appears as NaN (Not a Number)")

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
Here, three values found in sdata were placed in the appropriate locations, 
 but since no value for 'California' was found, it appears as NaN (Not a Number)


In [10]:
# The isna and notna functions in pandas should be used to detect missing data:

print(pd.isna(obj4))
print(pd.notna(obj4))

# Series also has these as instance methods:
print("\n Instance function")
print(obj4.isna())

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

 Instance function
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [11]:
print(obj3)

print(obj4)

print("\nArithmetic\n")
print(obj3 + obj4)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

Arithmetic

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [12]:
# Series object and its index have a "name" attribute, which integrates with
# other areas of pandas functionality

obj4.name = "population"

obj4.index.name = "state"

print(obj4)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [13]:
# A Series’s index can be altered in place by assignment

print(obj)

obj.index = ["Bob", "Steve", "Jeff", "Ryan"]

print(obj)

0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


### DataFrame

* A DataFrame represents a rectangular table of data and contains an ordered, named
collection of columns, each of which can be a different value type.

* The DataFrame has both a row and column index; it can be thought of
as a dictionary of Series all sharing the same index.

* There are many ways to construct a DataFrame, though one of the most common is
from a dictionary of equal-length lists or NumPy arrays:

In [14]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
 "year": [2000, 2001, 2002, 2001, 2002, 2003],
 "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}


frame = pd.DataFrame(data)

print(frame)

# The resulting DataFrame will have its index assigned automatically, as with Series,
# and the columns are placed according to the order of the keys in data

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [15]:
# For large DataFrames, the head method selects only the first five rows:

print(frame.head())

# Similarly, tail returns the last five rows:

print(frame.tail())

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [16]:
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in
# that order:

pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [17]:
# If you pass a column that isn’t contained in the dictionary, it will appear with missing
# values in the result:

frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])

frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [18]:
# A column in a DataFrame can be retrieved as a Series either by dictionary-like
# notation or by using the dot attribute notation:

print(frame2['state'])

print(frame2.state)

print(frame2.columns) # Print all columns names


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [19]:
# Rows can also be retrieved by position or name with the special iloc and loc
# attributes

print(frame2.loc[1])

print(frame2.iloc[2])


year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object


In [23]:
# Columns can be modified by assignment.

frame2["debt"] = 16.5

print(frame2)

frame2["debt"] = np.arange(6.)

print(frame2)

   year   state  pop  debt
0  2000    Ohio  1.5  16.5
1  2001    Ohio  1.7  16.5
2  2002    Ohio  3.6  16.5
3  2001  Nevada  2.4  16.5
4  2002  Nevada  2.9  16.5
5  2003  Nevada  3.2  16.5
   year   state  pop  debt
0  2000    Ohio  1.5   0.0
1  2001    Ohio  1.7   1.0
2  2002    Ohio  3.6   2.0
3  2001  Nevada  2.4   3.0
4  2002  Nevada  2.9   4.0
5  2003  Nevada  3.2   5.0


In [26]:
# When you are assigning lists or arrays to a column, the value’s length must match the
# length of the DataFrame

val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])

print(val)

print(frame2)

frame2["debt"] = val

print(frame2)

two    -1.2
four   -1.5
five   -1.7
dtype: float64
   year   state  pop  debt
0  2000    Ohio  1.5   0.0
1  2001    Ohio  1.7   1.0
2  2002    Ohio  3.6   2.0
3  2001  Nevada  2.4   3.0
4  2002  Nevada  2.9   4.0
5  2003  Nevada  3.2   5.0
   year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6   NaN
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9   NaN
5  2003  Nevada  3.2   NaN


In [28]:
# The del keyword will delete columns like with a dictionary.

frame2["eastern"] = frame2["state"] == "Ohio"

print(frame2)

del frame2["eastern"]

print(frame2)

print(frame2.columns)

   year   state  pop  debt  eastern
0  2000    Ohio  1.5   NaN     True
1  2001    Ohio  1.7   NaN     True
2  2002    Ohio  3.6   NaN     True
3  2001  Nevada  2.4   NaN    False
4  2002  Nevada  2.9   NaN    False
5  2003  Nevada  3.2   NaN    False
   year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6   NaN
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9   NaN
5  2003  Nevada  3.2   NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [31]:
# Another common form of data is a nested dictionary of dictionaries

populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

# pandas will interpret the outer dictionary keys as the columns, 
# and the inner keys as the row indices:

frame3 = pd.DataFrame(populations)

print(frame3)

# You can transpose the DataFrame (swap rows and columns)

print(frame3.T)

      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4
2002   3.6     2.9
        2000  2001  2002
Ohio     1.5   1.7   3.6
Nevada   NaN   2.4   2.9


Possible data inputs to the DataFrame constructor

**2D ndarray** 

A matrix of data, passing optional row and column labels

**Dictionary of arrays, lists, ortuples**

Each sequence becomes a column in the DataFrame; all sequences must be the same length

**NumPy structured/recordarray**

Treated as the “dictionary of arrays” case

**Dictionary of Series** 

Each value becomes a column; indexes from each Series are unioned together to form the
result’s row index if no explicit index is passed

**Dictionary of dictionaries** 

Each inner dictionary becomes a column; keys are unioned to form the row index as in the
“dictionary of Series” case

**List of dictionaries or Series** 

Each item becomes a row in the DataFrame; unions of dictionary keys or Series indexes
become the DataFrame’s column labels

**List of lists or tuples** 

Treated as the “2D ndarray” case

**Another DataFrame** 

The DataFrame’s indexes are used unless different ones are passed

**NumPy MaskedArray** 

Like the “2D ndarray” case except masked values are missing in the DataFrame result


In [32]:
# Unlike Series, DataFrame does not have a name attribute. DataFrame’s to_numpy
# method returns the data contained in the DataFrame as a two-dimensional ndarray:

print(frame3.to_numpy())

[[1.5 nan]
 [1.7 2.4]
 [3.6 2.9]]


### Index Objects

* Pandas’s Index objects are responsible for holding the axis labels (including a Data
Frame’s column names) and other metadata (like the axis name or names).

*  Any arra 
or other sequence of labels you use when constructing a Series or DataFrame  s
internally converted to an Inex:

In [35]:
obj = pd.Series(np.arange(3), index=["a", "b", "c"])

index = obj.index

print(index)

# Index objects are immutable and thus can’t be modified by the user
index[1] = "d"

# Immutability makes it safer to share Index objects among data structures
# A pandas Index can contain duplicate labels
# Selections with duplicate labels will select all occurrences of that label.

Index(['a', 'b', 'c'], dtype='object')


TypeError: Index does not support mutable operations

Each Index has a number of methods and properties for set logic, which answer other
common questions about the data it contains. Some useful ones are

**append()**  Concatenate with additional Index objects, producing a new Index

**difference()** Compute set difference as an Index

**intersection()** Compute set intersection

**union()** Compute set union

**isin()** Compute Boolean array indicating whether each value is contained in the passed collection

**delete()** Compute new Index with element at Index i deleted

**drop()** Compute new Index by deleting passed values

**insert()** Compute new Index by inserting element at Index 

**is_monotonic** Returns True if each element is greater than or equal to the previous element

**is_unique** Returns True if the Index has no duplicate values

**unique()** Compute the array of unique values in the Index


 #### Reindexing

 An important method on pandas objects is reindex, which means to create a new
object with the values rearranged to align with the new index

In [38]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])

print(obj)

# Calling reindex on this Series rearranges the data according to the new index,
# introducing missing values if any index values were not already present:

obj2 = obj.reindex(["a", "b", "c", "d", "e"])

print(obj2)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [44]:
# For ordered data like time series, you may want to do some interpolation or filling of
# values when reindexing. The method option allows us to do this, using a method such
# as ffill, which forward-fills the values

obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])

print(obj3)

obj3.reindex(np.arange(6), method="ffill")

print(obj3)

# reindex can alter the (row) index, columns, or both. When passed
# only a sequence, it reindexes the rows in the result

frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"], columns=["Ohio", "Texas", "California"])

print(frame)

frame2 = frame.reindex(index = ['a', 'b', 'c', 'd'])

print(frame2)

# The columns can be reindexed with the columns keyword

states = ["Texas", "Utah", "California"]

print(frame.reindex(columns=states))

print('Because "Ohio" was not in states, the data for that column is dropped from the result.')

# Another way to reindex a particular axis is to pass the new axis labels as a positional
# argument and then specify the axis to reindex with the axis keyword

print(frame.reindex(states, axis="columns"))

0      blue
2    purple
4    yellow
dtype: object
0      blue
2    purple
4    yellow
dtype: object
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8
Because "Ohio" was not in states, the data for that column is dropped from the result.
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


#### reindex function arguments

* **labels** New sequence to use as an index. Can be Index instance or any other sequence-like Python data structure.
An Index will be used exactly as is without any copying.

* **index** Use the passed sequence as the new index labels.

* **columns** Use the passed sequence as the new column labels.

* **axis** The axis to reindex, whether "index" (rows) or "columns". The default is "index". You can
alternately do reindex(index=new_labels) or reindex(columns=new_labels).

* **method** Interpolation (fill) method; "ffill" fills forward, while "bfill" fills backward.

* **fill_value** Substitute value to use when introducing missing data by reindexing. Use fill_value="missing"
(the default behavior) when you want absent labels to have null values in the result.

* **limit** When forward filling or backfilling, the maximum size gap (in number of elements) to fill.

* **tolerance** When forward filling or backfilling, the maximum size gap (in absolute numeric distance) to fill for inexact
matches.

* **level** Match simple Index on level of MultiIndex; otherwise select subset of.

* **copy** If True, always copy underlying data even if the new index is equivalent to the old index; if False, do not
copy the data when the indexes are equivalent


#### Dropping Entries from an Axis

Dropping one or more entries from an axis is simple if you already have an index array or list without those entries, since you can use the reindex method or .loc based indexing

In [47]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])

print(obj)

new_obj = obj.drop("c")

print(new_obj)

print(obj.drop(["d", "c"]))

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64


In [54]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), 
                    index=["Ohio", "Colorado", "Utah", "New York"], 
                    columns=["one", "two", "three", "four"])

print(data)

print(data.drop(index=["Colorado", "Ohio"]))

print(data)

# To drop labels from the columns, instead use the columns keyword

print(data.drop(columns=["two"]))

# You can also drop values from the columns by passing axis=1 (which is like NumPy)
# or axis="columns"

print(data.drop("two", axis=1))

print(data.drop(["two", "four"], axis="columns"))

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14
