## INTRO TO PANDAS AND NUMPY FOR DATA ANALYSIS

In [1]:
# import the needed libraries

import pandas as pd
import numpy as np

In [3]:
# confirm which numpy / pandas releases this notebook is running against
print(np.__version__, pd.__version__, sep='\n')

1.26.2
2.1.4


### Creating a DataFrame from scratch

In [5]:
# build a DataFrame by hand from row-wise records (one inner list per student)

# column labels, listed in the same order as the values inside each record
column_name = ['name', 'age', 'gender', 'marital_status']

# one record per student
students_list = [
    ['tola', 23, 'f', 'single'],
    ['femi', 15, 'm', 'married'],
    ['sodiq', 35, 'f', 'divorced'],
    ['wale', 10, 'm', 'single'],
]

# assemble the records and labels into a DataFrame, then display it
student_df = pd.DataFrame(students_list, columns=column_name)
student_df


Unnamed: 0,name,age,gender,marital_status
0,tola,23,f,single
1,femi,15,m,married
2,sodiq,35,f,divorced
3,wale,10,m,single


In [6]:
# build a DataFrame from a dictionary:
# each key becomes a column name and each list holds that column's values

club_dict = dict(
    club_name=['utd', 'chelsea', 'city', 'arsenal', 'spurs'],
    ucl=[3, 2, 1, 0, 0],
    best_player=['bruno', 'palmer', 'Bernado', 'Saliba', 'Son'],
)

club_df = pd.DataFrame(club_dict)
club_df

Unnamed: 0,club_name,ucl,best_player
0,utd,3,bruno
1,chelsea,2,palmer
2,city,1,Bernado
3,arsenal,0,Saliba
4,spurs,0,Son


In [None]:
# exercise:
# create a dataframe of countries, capital and continent.


### Importing existing CSV files

In [7]:
# load insurance.csv into a DataFrame; the path is relative, so the file is
# looked up in the notebook's current working directory
insurance_df = pd.read_csv('insurance.csv')

#### Pandas Methods and attributes for data wrangling

In [11]:
# .head() method: preview the first rows of the DataFrame
# (5 rows when no count is passed)
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
# .tail() method: preview the last rows of the DataFrame (here the last 3)
insurance_df.tail(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [26]:
# .shape attribute: a (number_of_rows, number_of_columns) tuple
# it is an attribute, not a method, so there are no parentheses
insurance_df.shape

(1338, 7)

In [27]:
# .info() method: prints the index range, each column's non-null count and
# dtype, and the DataFrame's memory usage

insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# .isnull() marks every missing cell as True; chaining .sum() then counts
# the missing values in each column

insurance_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# checking for missing values using .isnull() and .any():
# the result is True for a column that contains at least one missing value
insurance_df.isnull().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [None]:
# creating new columns with static values:
# assigning a single scalar fills every row of the new column with that value

insurance_df['dumb_column'] = 4
insurance_df.head(4)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,dumb_column
0,19,female,27.9,0,yes,southwest,16884.924,4
1,18,male,33.77,1,no,southeast,1725.5523,4
2,28,male,33.0,3,no,southeast,4449.462,4
3,33,male,22.705,0,no,northwest,21984.47061,4


In [35]:
# creating new columns with basic arithmetic:
# the multiplication is applied row by row, so 'tax' holds 3% of each row's 'charges'

insurance_df['tax'] = insurance_df['charges'] * 0.03
insurance_df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,dumb_column,tax
0,19,female,27.9,0,yes,southwest,16884.924,4,506.54772
1,18,male,33.77,1,no,southeast,1725.5523,4,51.766569
2,28,male,33.0,3,no,southeast,4449.462,4,133.48386


In [None]:
# create a new column based on conditions
# np.select checks the conditions in order and uses the first one that holds:
#   age < 20 -> 'teenager', otherwise age < 60 -> 'adult', otherwise 'old'
# this is the vectorized equivalent of a chained if/else applied to every row
insurance_df['age_group'] = np.select(
    [insurance_df['age'] < 20, insurance_df['age'] < 60],
    ['teenager', 'adult'],
    default='old',
)
insurance_df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,dumb_column,tax,age_group
0,19,female,27.9,0,yes,southwest,16884.924,4,506.54772,teenager
1,18,male,33.77,1,no,southeast,1725.5523,4,51.766569,teenager
2,28,male,33.0,3,no,southeast,4449.462,4,133.48386,adult


In [None]:
# create a column based on multiple condition using .apply()

def young_smoker(data):
    """Flag a record as a young smoker.

    `data` is any object indexable by the keys 'age' and 'smoker'
    (for example a single DataFrame row).

    Returns 'yes' when the age is below 20 and smoker equals 'yes';
    returns 'No' in every other case. Note the mixed-case labels
    ('yes' / 'No').
    """
    # guard clause: a record that is not under 20 can never qualify,
    # so 'smoker' is only inspected for the under-20 records
    if not data['age'] < 20:
        return 'No'
    return 'yes' if data['smoker'] == 'yes' else 'No'
    
# axis = 1 makes .apply() call young_smoker once per row, passing the row in,
# so the function can read several columns ('age' and 'smoker') at once
insurance_df['young_smoker'] = insurance_df.apply(young_smoker, axis = 1)

In [43]:
# preview the first rows to check the newly added 'young_smoker' column
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,dumb_column,tax,age_group,young_smoker
0,19,female,27.9,0,yes,southwest,16884.924,4,506.54772,teenager,yes
1,18,male,33.77,1,no,southeast,1725.5523,4,51.766569,teenager,No
2,28,male,33.0,3,no,southeast,4449.462,4,133.48386,adult,No
3,33,male,22.705,0,no,northwest,21984.47061,4,659.534118,adult,No
4,32,male,28.88,0,no,northwest,3866.8552,4,116.005656,adult,No


In [47]:
# dropping columns with the .drop() method
# `columns=` states explicitly that the labels are column names;
# inplace=True modifies insurance_df itself instead of returning a copy
insurance_df.drop(columns=['tax', 'dumb_column'], inplace=True)


In [50]:
# drop rows: remove the rows whose index labels are 0 through 100 (101 rows)
# NOTE: this cell is not idempotent; once the index has been reset, running it
# again removes a further 101 rows, so execute it only once per fresh load
insurance_df.drop(index=list(range(101)), inplace=True)

In [55]:
# .reset_index() method: renumber the rows from 0 after rows have been dropped
# drop = True discards the old index labels instead of keeping them as a new column

insurance_df.reset_index(drop = True, inplace = True)
insurance_df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_group,young_smoker
0,30,male,25.46,0,no,northeast,3645.0894,adult,No
1,18,female,30.115,0,no,northeast,21344.8467,teenager,No
2,61,female,29.92,3,yes,southeast,30942.1918,old,No


In [56]:
# how to select a subset of columns from a DataFrame:
# pass a list of column names inside the indexing brackets (note the double brackets)

insurance_df[['age','bmi','region']]

Unnamed: 0,age,bmi,region
0,30,25.460,northeast
1,18,30.115,northeast
2,61,29.920,southeast
3,34,27.500,southwest
4,20,28.025,northwest
...,...,...,...
1232,50,30.970,northwest
1233,18,31.920,northeast
1234,18,36.850,southeast
1235,21,25.800,southwest


In [60]:
# .groupby() method for pivoting:
# split the rows by 'smoker', then count the non-null 'bmi' values in each group

insurance_df.groupby('smoker')[['bmi']].count()


Unnamed: 0_level_0,bmi
smoker,Unnamed: 1_level_1
no,990
yes,247


In [61]:
# row filtering based on a condition:
# .eq('male') builds a True/False mask, and .loc keeps only the True rows

insurance_df.loc[insurance_df['sex'].eq('male')]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_group,young_smoker
0,30,male,25.460,0,no,northeast,3645.08940,adult,No
4,20,male,28.025,1,yes,northwest,17560.37975,adult,No
6,26,male,30.875,2,no,northwest,3877.30425,adult,No
7,29,male,27.940,0,no,southeast,2867.11960,adult,No
8,63,male,35.090,0,yes,southeast,47055.53210,old,No
...,...,...,...,...,...,...,...,...,...
1223,31,male,25.935,1,no,northwest,4239.89265,adult,No
1224,61,male,33.535,0,no,northeast,13143.33665,old,No
1226,51,male,30.030,1,no,southeast,9377.90470,adult,No
1228,52,male,38.600,2,no,southwest,10325.20600,adult,No


In [66]:
# row filtering based on multiple conditions:
# the masks are combined with & (element-wise AND), so a row is kept only
# when it is male, from the northeast region, and a smoker

insurance_df.loc[
    insurance_df['sex'].eq('male')
    & insurance_df['region'].eq('northeast')
    & insurance_df['smoker'].eq('yes')
]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_group,young_smoker
22,44,male,31.35,1,yes,northeast,39556.4945,adult,No
56,18,male,25.175,0,yes,northeast,15518.18025,teenager,yes
84,36,male,41.895,3,yes,northeast,43753.33705,adult,No
106,35,male,27.74,2,yes,northeast,20984.0936,adult,No
149,18,male,17.29,2,yes,northeast,12829.4551,teenager,yes
153,50,male,31.825,0,yes,northeast,41097.16175,adult,No
161,52,male,24.32,3,yes,northeast,24869.8368,adult,No
180,54,male,40.565,3,yes,northeast,48549.17835,adult,No
237,50,male,32.3,1,yes,northeast,41919.097,adult,No
280,55,male,30.685,0,yes,northeast,42303.69215,adult,No


In [67]:
# .describe() method: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns

insurance_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1237.0,1237.0,1237.0,1237.0
mean,39.235247,30.647753,1.097817,13169.641634
std,13.990361,6.131709,1.203139,11997.23126
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.22,0.0,4753.6368
50%,40.0,30.305,1.0,9361.3268
75%,51.0,34.5,2.0,16450.8947
max,64.0,53.13,5.0,63770.42801


In [75]:
# .value_counts() method: how many times each distinct value occurs in the column
# ascending = True lists the least frequent value first

insurance_df['smoker'].value_counts(ascending = True)

smoker
yes    247
no     990
Name: count, dtype: int64

In [79]:
# frequency table of the 'smoker' values (least frequent first),
# converted from a Series into a one-column DataFrame
temp_df = insurance_df['smoker'].value_counts(ascending=True).to_frame()

In [85]:
# add each category's share of the total as a formatted percentage string
# - the total is computed once, outside the comprehension, rather than once per row
# - the f-string is delimited with double quotes so the cell also parses on
#   Python versions older than 3.12 (reusing the enclosing quote character
#   inside an f-string expression is only allowed from Python 3.12 onwards)
smoker_total = temp_df['count'].sum()
temp_df['percentage'] = [f"{round(x / smoker_total * 100, 3)}%"
                         for x in temp_df['count']]
temp_df

Unnamed: 0_level_0,count,percentage
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
yes,247,19.968%
no,990,80.032%


In [86]:
# .unique() method: returns an array of the distinct values in the column
# (the values themselves, not how many there are)

insurance_df['region'].unique()

array(['northeast', 'southeast', 'southwest', 'northwest'], dtype=object)

In [None]:
# .nunique() method: counts the number of distinct values in the column
insurance_df['region'].nunique()

4

##### Assignment

- Apply all the methods covered in class to any dataset of your choice.
- Go to the pandas documentation and apply any 5 new methods to your
dataset.
- Write a short Medium article on the new methods you just discovered.