# Pandas

In [2]:
import pandas as pd
pd.__version__

'2.0.0'

## Creating DataFrame

#### Creating Data as a Dictionary

In [2]:
data = pd.DataFrame(
    {
        "Name": [
            "Muzzammil",
            "Awais",
            "Imran",
            ],
        "Age": [20, 19, 51],
        "Sex": ["male", "male", "male"],
    }
)

print(data)

        Name  Age   Sex
0  Muzzammil   20  male
1      Awais   19  male
2      Imran   51  male


#### Creating Json data

In [3]:
data = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
    "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
    "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300}
}
df = pd.DataFrame(data)
print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


## Accessing Data

#### Access by index

In [4]:
data = pd.DataFrame(
    {
        "Name": [
            "Muzzammil",
            "Awais",
            "Imran",
        ],
        "Age": [20, 19, 51],
        "Sex": ["male", "male", "male"],
    }
)

print('Complete\n', data)
print('\nIndex 0:\n', data.loc[0])
print('\nIndex 0, 1:\n', data.loc[[0 , 2]])


Complete
         Name  Age   Sex
0  Muzzammil   20  male
1      Awais   19  male
2      Imran   51  male

Index 0:
 Name    Muzzammil
Age            20
Sex          male
Name: 0, dtype: object

Index 0, 1:
         Name  Age   Sex
0  Muzzammil   20  male
2      Imran   51  male


#### Access by key 

In [5]:
data = pd.DataFrame(
    {
        "Name": [
            "Muzzammil",
            "Awais",
            "Imran",
        ],
        "Age": [20, 19, 51],
        "Sex": ["male", "male", "male"],
    }
)

print(data["Name"])

0    Muzzammil
1        Awais
2        Imran
Name: Name, dtype: object


## Pandas Series

In [8]:
ages = pd.Series([22, 35, 58], name="Age")
print(ages)

0    22
1    35
2    58
Name: Age, dtype: int64


#### Describe

In [11]:
ages = pd.Series([22, 35, 58], name="Age")
print(ages.describe())

count     3.000000
mean     38.333333
std      18.230012
min      22.000000
25%      28.500000
50%      35.000000
75%      46.500000
max      58.000000
Name: Age, dtype: float64


#### Index Labels

In [16]:
a = [1, 7, 2]
myvar = pd.Series(a, index=["x", "y", "z"])
print("Full:\n", myvar)
print("\nIndex y:", myvar["y"])

Full:
 x    1
y    7
z    2
dtype: int64

Index y: 7


## Reading Files

#### Read CSV file

In [33]:
df = pd.read_csv('Data/data.csv')
print(df)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### Read Excel File

In [None]:
df = pd.read_excel('Data/excel_file.xlsx')
print(df)

### Read Json file

In [None]:
df = pd.read_json('Data/data.json')
print(df)

## Data frame Functions

#### Converting to string

In [26]:
df = pd.read_csv('Data/data.csv')
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### max_rows to show

In [57]:
df = pd.read_csv('Data/data.csv')
pd.options.display.max_rows = 5
print(pd.options.display.max_rows)
print(df)

5
     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    110       130     409.1
..        ...    ...       ...       ...
168        75    120       150     320.4
169        75    125       150     330.4

[170 rows x 4 columns]


#### Show first 5 rows

In [5]:
df = pd.read_csv('Data/data.csv')
print(df.head())

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0


#### Show last 5 rows

In [6]:
df = pd.read_csv('Data/data.csv')
print(df.tail())

     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


#### Show first/last n rows

In [12]:
n = 3
df = pd.read_csv('Data/data.csv')
print(df.head(n))
print(df.tail(n))

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
     Duration  Pulse  Maxpulse  Calories
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


#### Show information of data

In [13]:
df = pd.read_csv('Data/data.csv')
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
None


## Data Deleting And Replacing

#### Empty Cells Deleting And Making new data set

In [21]:
df = pd.read_csv('Data/data.csv')
new_df = df.dropna()
print(new_df.to_string())


     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45    105       132     246.0
24         60   

#### Empty Cells Deleting in original data set

In [22]:
df = pd.read_csv('Data/data.csv')
df.dropna(inplace=True)
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45    105       132     246.0
24         60   

#### Replace Only For Specified Columns

In [25]:
df = pd.read_csv('Data/data.csv')
df["Calories"].fillna(130, inplace=True)
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112     130.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### Replace Using Mean

In [28]:
df = pd.read_csv('Data/data.csv')
x = df["Calories"].mean()
df["Calories"].fillna(x, inplace=True)
print(df.to_string())

     Duration  Pulse  Maxpulse     Calories
0          60    110       130   409.100000
1          60    117       145   479.000000
2          60    103       135   340.000000
3          45    109       175   282.400000
4          45    117       148   406.000000
5          60    102       127   300.000000
6          60    110       136   374.000000
7          45    104       134   253.300000
8          30    109       133   195.100000
9          60     98       124   269.000000
10         60    103       147   329.300000
11         60    100       120   250.700000
12         60    106       128   345.300000
13         60    104       132   379.300000
14         60     98       123   275.000000
15         60     98       120   215.200000
16         60    100       120   300.000000
17         45     90       112   375.790244
18         60    103       123   323.000000
19         45     97       125   243.000000
20         60    108       131   364.200000
21         45    100       119  

#### Replace Using Median

In [29]:
df = pd.read_csv('Data/data.csv')
x = df["Calories"].median()
df["Calories"].fillna(x, inplace=True)
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112     318.6
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### Replace Using Mode

In [31]:
df = pd.read_csv('Data/data.csv')
x = df["Calories"].mode()[0]
df["Calories"].fillna(x, inplace=True)
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### Removing duplicate

In [44]:
df = pd.read_csv('Data/data.csv')
print(df.duplicated())
print("\n\nAfter del\n", df.drop_duplicates())


0      False
1       True
2      False
3      False
4      False
       ...  
165    False
166    False
167    False
168    False
169    False
Length: 170, dtype: bool


After del
      Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
2          60    117       145     479.0
3          60    103       135     340.0
4          45    109       175     282.4
5          45    117       148     406.0
..        ...    ...       ...       ...
165        60    105       140     290.8
166        60    110       145     300.0
167        60    115       145     310.2
168        75    120       150     320.4
169        75    125       150     330.4

[162 rows x 4 columns]


## Data Format issue

In [49]:
df = pd.read_csv('Data/data.csv')
df['Duration'] = pd.to_datetime(df['Duration'])
print(df.to_string())

                         Duration  Pulse  Maxpulse  Calories
0   1970-01-01 00:00:00.000000060    110       130     409.1
1   1970-01-01 00:00:00.000000060    110       130     409.1
2   1970-01-01 00:00:00.000000060    117       145     479.0
3   1970-01-01 00:00:00.000000060    103       135     340.0
4   1970-01-01 00:00:00.000000045    109       175     282.4
5   1970-01-01 00:00:00.000000045    117       148     406.0
6   1970-01-01 00:00:00.000000060    102       127     300.0
7   1970-01-01 00:00:00.000000060    110       136     374.0
8   1970-01-01 00:00:00.000000045    104       134     253.3
9   1970-01-01 00:00:00.000000030    109       133     195.1
10  1970-01-01 00:00:00.000000060     98       124     269.0
11  1970-01-01 00:00:00.000000060    103       147     329.3
12  1970-01-01 00:00:00.000000060    100       120     250.7
13  1970-01-01 00:00:00.000000060    106       128     345.3
14  1970-01-01 00:00:00.000000060    104       132     379.3
15  1970-01-01 00:00:00.

## Correlation in data 

In [45]:
df = pd.read_csv('Data/data.csv')
print(df.corr())

          Duration     Pulse  Maxpulse  Calories
Duration  1.000000 -0.155484  0.009533  0.922573
Pulse    -0.155484  1.000000  0.786068  0.025239
Maxpulse  0.009533  0.786068  1.000000  0.203566
Calories  0.922573  0.025239  0.203566  1.000000
