## Pandas Intro

In [1]:
import pandas as pd
import numpy as np

In [None]:
# build a DataFrame from a dictionary:
# each key becomes a column label and each list holds that column's values
students = dict(
    name=['fola', 'titi', 'folu', 'kemi'],
    age=[12, 23, 54, 43],
    gender=['f', 'm', 'm', 'f'],
)

# hand the dictionary to the DataFrame constructor
students_df = pd.DataFrame(students)
students_df

In [4]:
# build a DataFrame from a list of lists:
# every inner list is one row, so the column labels are supplied separately
column_names = ['club_name', 'no_of_ucl', 'best_player']

clubs = [
    ['man utd', 3, 'bruno'],
    ['chelsea', 2, 'palmer'],
    ['arsenal', 0, 'saka'],
    ['barca', 5, 'yamal'],
]

club_df = pd.DataFrame(clubs, columns=column_names)
club_df

Unnamed: 0,club_name,no_of_ucl,best_player
0,man utd,3,bruno
1,chelsea,2,palmer
2,arsenal,0,saka
3,barca,5,yamal


## PANDAS CONTINUED

In [3]:
import pandas as pd
import numpy as np

In [4]:
# read a CSV file into a DataFrame
# (Excel workbooks are read with pd.read_excel instead)

insurance_csv = 'insurance.csv'  # path is relative to the notebook's working directory
insurance_df = pd.read_csv(insurance_csv)
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### DATA WRANGLING

In [8]:
# .head() shows the first rows of the dataframe (5 when no count is given)

insurance_df.head(n=5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
# .tail() shows the last rows of the dataframe; here we ask for the final 7

insurance_df.tail(n=7)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1331,23,female,33.4,0,no,southwest,10795.93733
1332,52,female,44.7,3,no,southwest,11411.685
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [20]:
# select one column by its label (key reference, like a dictionary lookup);
# the result is a Series

selected_column = 'sex'
insurance_df[selected_column]

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1338, dtype: object

In [15]:
# select several columns at once by passing a list of labels;
# the result is a narrower DataFrame

selected_columns = ['sex', 'smoker', 'region']
insurance_df[selected_columns]

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest
...,...,...,...
1333,male,no,northwest
1334,female,no,northeast
1335,female,no,southeast
1336,female,no,southwest


In [21]:
# create a new column from an existing one with element-wise arithmetic

# year treated as "now" when turning an age into a birth year
reference_year = 2025

insurance_df['year of birth'] = reference_year - insurance_df['age']
insurance_df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth
0,19,female,27.9,0,yes,southwest,16884.924,2006
1,18,male,33.77,1,no,southeast,1725.5523,2007
2,28,male,33.0,3,no,southeast,4449.462,1997
3,33,male,22.705,0,no,northwest,21984.47061,1992
4,32,male,28.88,0,no,northwest,3866.8552,1993


In [31]:
# .describe() returns summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns

summary_stats = insurance_df.describe()
summary_stats

Unnamed: 0,age,bmi,children,charges,year of birth
count,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265,1985.792975
std,14.04996,6.098187,1.205493,12110.011237,14.04996
min,18.0,15.96,0.0,1121.8739,1961.0
25%,27.0,26.29625,0.0,4740.28715,1974.0
50%,39.0,30.4,1.0,9382.033,1986.0
75%,51.0,34.69375,2.0,16639.912515,1998.0
max,64.0,53.13,5.0,63770.42801,2007.0


In [32]:
## conditionals - building a column with a list comprehension

def age_to_group(age):
    """Return 'teenager' for ages under 20, 'adult' for ages under 60, else 'old'."""
    if age < 20:
        return 'teenager'
    if age < 60:
        return 'adult'
    return 'old'


# one label per row, in the same order as the 'age' column
insurance_df['age_group'] = [age_to_group(age) for age in insurance_df['age']]

insurance_df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth,age_group
0,19,female,27.9,0,yes,southwest,16884.924,2006,teenager
1,18,male,33.77,1,no,southeast,1725.5523,2007,teenager
2,28,male,33.0,3,no,southeast,4449.462,1997,adult
3,33,male,22.705,0,no,northwest,21984.47061,1992,adult
4,32,male,28.88,0,no,northwest,3866.8552,1993,adult


In [33]:
# how to filter our dataframe based on conditions

# the comparison yields a boolean mask: True for rows whose age_group is 'old'
is_old = insurance_df['age_group'] == 'old'
insurance_df[is_old]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth,age_group
9,60,female,25.840,0,no,northwest,28923.13692,1965,old
11,62,female,26.290,0,yes,southeast,27808.72510,1963,old
20,60,female,36.005,0,no,northeast,13228.84695,1965,old
26,63,female,23.085,0,no,northeast,14451.83515,1962,old
33,63,male,28.310,0,no,northwest,13770.09790,1962,old
...,...,...,...,...,...,...,...,...,...
1301,62,male,30.875,3,yes,northwest,46718.16325,1963,old
1321,62,male,26.695,0,yes,northeast,28101.33305,1963,old
1322,62,male,38.830,0,no,southeast,12981.34570,1963,old
1325,61,male,33.535,0,no,northeast,13143.33665,1964,old


In [35]:
# combining multiple conditions

# each comparison yields a boolean mask; | keeps rows where either mask is True
is_old = insurance_df['age_group'] == 'old'
is_smoker = insurance_df['smoker'] == "yes"
insurance_df[is_old | is_smoker]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth,age_group
0,19,female,27.900,0,yes,southwest,16884.92400,2006,teenager
9,60,female,25.840,0,no,northwest,28923.13692,1965,old
11,62,female,26.290,0,yes,southeast,27808.72510,1963,old
14,27,male,42.130,0,yes,southeast,39611.75770,1998,adult
19,30,male,35.300,0,yes,southwest,36837.46700,1995,adult
...,...,...,...,...,...,...,...,...,...
1321,62,male,26.695,0,yes,northeast,28101.33305,1963,old
1322,62,male,38.830,0,no,southeast,12981.34570,1963,old
1323,42,female,40.370,2,yes,southeast,43896.37630,1983,adult
1325,61,male,33.535,0,no,northeast,13143.33665,1964,old


In [38]:
# how to drop columns and rows with the .drop method

# columns
# .drop returns a new dataframe, so we re-assign the result instead of using
# inplace=True. errors='ignore' keeps this cell safe to re-run: once
# 'age_group' has been removed, a second run is a no-op rather than a KeyError.
# (Trade-off: a misspelled column name is also silently ignored.)
insurance_df = insurance_df.drop(columns=['age_group'], errors='ignore')

In [40]:
# display the dataframe to check which columns remain
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth
0,19,female,27.900,0,yes,southwest,16884.92400,2006
1,18,male,33.770,1,no,southeast,1725.55230,2007
2,28,male,33.000,3,no,southeast,4449.46200,1997
3,33,male,22.705,0,no,northwest,21984.47061,1992
4,32,male,28.880,0,no,northwest,3866.85520,1993
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,1975
1334,18,female,31.920,0,no,northeast,2205.98080,2007
1335,18,female,36.850,0,no,southeast,1629.83350,2007
1336,21,female,25.800,0,no,southwest,2007.94500,2004


In [42]:
# drop rows by index label

# labels 0..99; the result is only displayed here, insurance_df is not reassigned
rows_to_drop = list(range(100))
insurance_df.drop(index=rows_to_drop)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,year of birth
100,41,female,31.600,0,no,southwest,6186.1270,1984
101,30,male,25.460,0,no,northeast,3645.0894,1995
102,18,female,30.115,0,no,northeast,21344.8467,2007
103,61,female,29.920,3,yes,southeast,30942.1918,1964
104,34,female,27.500,1,no,southwest,5003.8530,1991
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.5483,1975
1334,18,female,31.920,0,no,northeast,2205.9808,2007
1335,18,female,36.850,0,no,southeast,1629.8335,2007
1336,21,female,25.800,0,no,southwest,2007.9450,2004


In [None]:
import plotly.express as px
import pandas as pd
import numpy as np



# load the tips dataset that ships with plotly express and preview its first rows
restaurant_df = px.data.tips()
restaurant_df.head(n=5)

In [None]:
# 1. inspect the head of the data
# 2. inspect the tail of the data
# 3. create a new column called final payment which is total bill + tip
# 4. drop the column named time.

In [29]:
# label each number as 'zero', 'even' or 'odd' using an explicit for-loop
numbers = [0, 1, 2, 3, 4, 5]

bucket = []
for number in numbers:
    # zero is checked first so it gets its own label instead of 'even'
    if number == 0:
        label = 'zero'
    elif number % 2:
        label = 'odd'
    else:
        label = 'even'
    bucket.append(label)

bucket

['zero', 'odd', 'even', 'odd', 'even', 'odd']

In [26]:
# filtering with a list comprehension: keep only the values divisible by 2
# NOTE(review): with numbers = [0, 1, 2, 3, 4, 5] this evaluates to [0, 2, 4]
# (0 % 2 == 0); a saved output of [2, 4] would come from an earlier definition
# of `numbers` - re-run the notebook top to bottom to refresh it.
[value for value in numbers if not value % 2]

[2, 4]

In [30]:
# the for-loop classification written as a single list comprehension
# with chained conditional expressions
[
    'zero' if value == 0
    else 'odd' if value % 2
    else 'even'
    for value in numbers
]

['zero', 'odd', 'even', 'odd', 'even', 'odd']