# Pandas

## Series

It is a one-dimensional array holding data of any type.

In [None]:
import pandas as pd

a = [1, 7, 2]
print(pd.Series(a)) # with default labels/indexs

a = [1, 7, 2]
print(pd.Series(a, index = ["x", "y", "z"])) # with given labels/indexs

calories = {"day1": 420, "day2": 380, "day3": 390}
print(pd.Series(calories)) # transfer dict to series

## DataFrames

A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [5]:
import pandas as pd

data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data)
print(df) # with default labels/indexs

df = pd.DataFrame(data, index = ["day1", "day2", "day3"])
print(df) # with given labels/indexs

   calories  duration
0       420        50
1       380        40
2       390        45
      calories  duration
day1       420        50
day2       380        40
day3       390        45


### Input file as DataFrame

In [9]:
import pandas as pd

df = pd.read_csv('data.csv') # I didn't put data.csv

print(pd.options.display.max_rows) # Check the number of maximum returned rows
print(df.to_string()) # this will print all rows in the dataset
print(df) 

60
    calories  duration
0        420        50
1        380        40
2        390        45
3        420        50
4        380        40
5        390        45
6        420        50
7        380        40
8        390        45
9        420        50
10       380        40
11       390        45
12       420        50
13       380        40
14       390        45
    calories  duration
0        420        50
1        380        40
2        390        45
3        420        50
4        380        40
5        390        45
6        420        50
7        380        40
8        390        45
9        420        50
10       380        40
11       390        45
12       420        50
13       380        40
14       390        45


### Load a Python Dictionary into a DataFrame

In [10]:
import pandas as pd

data = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  },
  "Maxpulse":{
    "0":130,
    "1":145,
    "2":135,
    "3":175,
    "4":148,
    "5":127
  },
  "Calories":{
    "0":409,
    "1":479,
    "2":340,
    "3":282,
    "4":406,
    "5":300
  }
}

df = pd.DataFrame(data)

print(df) 

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


## Analyzing DataFrames

In [2]:
import pandas as pd

df = pd.read_csv('data.csv')

print(df.head(10))# Print the first 5 rows of the DataFrame:

print(df.tail(5)) # Print the last 5 rows of the DataFrame:

print(df.info()) # Print information about the data

   Duration          Date  Pulse  Maxpulse  Calories
0        60  '2020/12/01'    110       130     409.1
1        60  '2020/12/02'    117       145     479.0
2        60  '2020/12/03'    103       135     340.0
3        45  '2020/12/04'    109       175     282.4
4        45  '2020/12/05'    117       148     406.0
5        60  '2020/12/06'    102       127     300.0
6        60  '2020/12/07'    110       136     374.0
7       450  '2020/12/08'    104       134     253.3
8        30  '2020/12/09'    109       133     195.1
9        60  '2020/12/10'     98       124     269.0
    Duration          Date  Pulse  Maxpulse  Calories
27        60  '2020/12/27'     92       118     241.0
28        60  '2020/12/28'    103       132       NaN
29        60  '2020/12/29'    100       132     280.0
30        60  '2020/12/30'    102       129     380.3
31        60  '2020/12/31'     92       115     243.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 c

## Data Cleaning

### Remove Rows
One way to deal with empty cells is to remove rows that contain empty cells.

In [None]:
import pandas as pd

df = pd.read_csv('data.csv')

new_df = df.dropna()
df.dropna(inplace = True) #

print(new_df.to_string())

### Replace Empty Values
Another way of dealing with empty cells is to insert a new value instead.

In [15]:
import pandas as pd

df = pd.read_csv('data.csv')

# checking null value
df.isna().sum()

df["Calories"].fillna(130, inplace = True) # Only replace empty values for one column
df.fillna(130, inplace = True) # Replace NULL values with the number 130

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Calories"].fillna(130, inplace = True) # Only replace empty values for one column


A common way to replace empty cells, is to calculate the **mean**, **median** or **mode** value of the column.

In [6]:
import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].mean()
# x = df["Calories"].median()
# x = df["Calories"].mode()[0]

df["Calories"].fillna(x, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Calories"].fillna(x, inplace = True)


### Wrong Format Handling

In [None]:
import pandas as pd

df = pd.read_csv('data.csv')

df["Date"] = pd.to_datetime(df["Date"], errors="coerce")  # Replace invalid dates with NaT
df.dropna(subset=['Date'], inplace = True)

print(df.to_string())

### Fixing Wrong Data

In [13]:
# it is most likely a typo, and the value should be "45" instead of "450", and we could just insert "45" in row 7
df.loc[7, 'Duration'] = 45 # Set "Duration" = 45 in row 7

# If the value is higher than 120, set it to 120
for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.loc[x, "Duration"] = 120
    
# Delete rows where "Duration" is higher than 120
for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.drop(x, inplace = True)

### Removing Duplicates

In [16]:
# checking duplicate
df.duplicated().sum()

df.drop_duplicates(inplace = True)