# What is Pandas?

Pandas is a Python library used for working with data sets.

It has functions for analyzing, cleaning, exploring, and manipulating data.

The name "Pandas" refers to both "Panel Data" and "Python Data Analysis". The library was created by Wes McKinney in 2008.

# Why Use Pandas?

Pandas allows us to analyze big data and draw conclusions based on statistical theories.

Pandas can clean messy data sets, and make them readable and relevant.

Relevant data is very important in data science.

In [2]:
import pandas as pd

In [10]:
# A DataFrame is the usual Pandas container for a data set: a table whose
# columns are built here from the keys of a dict of equal-length lists.

car_names = ["BMW", "Volvo", "Ford"]
passing_counts = [3, 7, 2]

data = {"cars": car_names, "passings": passing_counts}
df = pd.DataFrame(data)
print(df)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


# Locate Row (loc and iloc)

In [63]:
# Two indexers select rows and columns of a DataFrame:
#   loc  - primarily label based
#   iloc - primarily integer-position based

data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}
df = pd.DataFrame(data)
print(df, "\n")

# Each entry pairs the printed caption with the selection it describes.
examples = [
    ("\nReturn row 0       :\n", df.loc[0]),
    ("\nReturn row 0 and 1 :\n", df.loc[[0, 1]]),
    # Same slice, different result: loc includes the end label (rows 1 and 2),
    # iloc excludes the end position (row 1 only).
    ("\nReturn row 1 to 2 :\n", df.loc[1:2]),
    ("\nReturn row 1 to 2 :\n", df.iloc[1:2]),
    # iloc[rows, columns]: ":" keeps every row, 1 picks the second column.
    ("\nReturn all rows with 2nd column values:\n", df.iloc[:, 1]),
    ("\nReturn 1st rows 2nd column value:", df.iloc[0, 1]),
    ("Return 2nd rows 1st column value:", df.iloc[1, 0]),
]
for caption, selection in examples:
    print(caption, selection)

   calories  duration
0       420        50
1       380        40
2       390        45 


Return row 0       :
 calories    420
duration     50
Name: 0, dtype: int64

Return row 0 and 1 :
    calories  duration
0       420        50
1       380        40

Return row 1 to 2 :
    calories  duration
1       380        40
2       390        45

Return row 1 to 2 :
    calories  duration
1       380        40

Return all rows with 2nd column values:
 0    50
1    40
2    45
Name: duration, dtype: int64

Return 1st rows 2nd column value: 50
Return 2nd rows 1st column value: 380


In [45]:
# Pass `index` to give each row a name.

row_names = ["day1", "day2", "day3"]
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data, index=row_names)
print(df)

# loc accepts a single row name or a list of row names.
first_day = df.loc["day1"]
first_two_days = df.loc[["day1", "day2"]]

print("\nReturn row day1          :\n", first_day)
print("\nReturn row day1 and day2 :\n", first_two_days)

      calories  duration
day1       420        50
day2       380        40
day3       390        45

Return row day1          :
 calories    420
duration     50
Name: day1, dtype: int64

Return row day1 and day2 :
       calories  duration
day1       420        50
day2       380        40


# Reading from files

In [37]:
csv_path = "data.csv"
json_path = "data.json"

# Load the CSV file into a DataFrame and print it.
df = pd.read_csv(csv_path)
print(df)

# When a DataFrame has many rows, printing it shows only the first 5 and
# the last 5 rows.
#
# To print the whole table, convert it to a string first:
# print(df.to_string())
#
# Alternatively, raise the display limit so that large tables print in full:
# pd.options.display.max_rows = 9999
# print(df)

# JSON files are loaded the same way, with read_json.
df = pd.read_json(json_path)
print(df)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

# head and tail methods

In [42]:
df = pd.read_csv("data.csv")

# head(n) gives the first n rows and tail(n) the last n rows;
# with no argument both default to five rows.
previews = (
    df.head(),    # first five
    df.head(10),  # first ten
    df.tail(),    # last five
    df.tail(10),  # last ten
)
for preview in previews:
    print(preview)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
5        60    102       127     300.0
6        60    110       136     374.0
7        45    104       134     253.3
8        30    109       133     195.1
9        60     98       124     269.0
     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4
     Duration  Pulse  Maxpulse  Calories
159        30     80       120     240.9
160      

In [65]:
# Load data.json into a DataFrame.
df = pd.read_json('data.json')
# A bare expression on the last line of a notebook cell displays its value.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,2020/12/01,110,130,409.1
1,60.0,2020-12-02,117,145,479.0
2,60.0,2020/12/03,one hundred three,135,340.0
3,,2020/12/04,109,175,282.4
4,45.0,2020/12/05,117,148,406.0
5,60.0,12/06/2020,102,one twenty-seven,300.0
6,60.0,2020/12/07,110,136,
7,450.0,2020/12/08,104,134,253.3
8,-30.0,2020/12/09,109,133,195.1
9,60.0,12-10-2020,98,124,two sixty-nine


In [None]:
# Make 'Duration' numeric; values that cannot be parsed are coerced to NaN.
df["Duration"] = pd.to_numeric(df["Duration"], errors="coerce")

# Boolean masks, one entry per row, reused by the selections below.
duration = df["Duration"]
longer_than_40 = duration > 40
exactly_60 = duration == 60

# All rows where 'Duration' is greater than 40
rows = df.loc[longer_than_40]
# print(rows)

# All rows where 'Duration' is equal to 60
rows = df.loc[exactly_60]
# print(rows)

# Only the 'Maxpulse' column of the rows where 'Duration' is equal to 60
rows = df.loc[exactly_60, "Maxpulse"]
# print(rows)

# The 'Pulse' and 'Maxpulse' columns of the rows where 'Duration' is equal to 60
rows = df.loc[exactly_60, ["Pulse", "Maxpulse"]]
print(rows)

                Pulse          Maxpulse
0                 110               130
1                 117               145
2   one hundred three               135
5                 102  one twenty-seven
6                 110               136
9                  98               124
10                103               147
11                100               120
12                100               120
16               -100               120
17                100               300
19                 92               118
20                103               132
