In [2]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Report the library versions this notebook was executed with.
for label, module in (('Pandas version: ', pd), ('Numpy version: ', np)):
    print(label, module.__version__)

Pandas version:  2.2.0
Numpy version:  1.26.4


In [4]:
# A Series is a one-dimensional array that can hold values of any type.
values = [1, 2, 3, 4, 5]
var = pd.Series(data=values)
print(var)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [5]:
# When no labels are given, the values are indexed by position starting from 0.
# Custom labels can be attached through the `index` argument:
var = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(var)
# A value can be read either by its position or by its label. Positional access
# goes through `.iloc`: plain `var[2]` on a label-indexed Series is deprecated
# (pandas emits a FutureWarning) because integer keys will be treated as labels.
print(var.iloc[2])
print(var['a'])

a    1
b    2
c    3
d    4
e    5
dtype: int64
3
1


  print(var[2])


In [6]:
# A dictionary converts directly into a Series: each key becomes the label of its value.
my_dict = dict(day1=234, day2=500, day3=723)
var = pd.Series(data=my_dict)
print(var)

day1    234
day2    500
day3    723
dtype: int64


In [7]:
# A DataFrame is a two-dimensional, table-like data structure.
# Start from a plain dictionary that maps each column name to its values.
my_data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# By convention the DataFrame variable is called 'df'.
df = pd.DataFrame(data=my_data)
print(df)


   calories  duration
0       420        50
1       380        40
2       390        45


In [8]:
# `.loc` is a label-based indexer (square brackets, not a call) used to pick rows.
# It works with custom labels too, e.g. df.loc['my_label'].
second_row = df.loc[1]
print(second_row)
print()
# A label slice includes both endpoints, so this prints rows 1 and 2.
rows_one_to_two = df.loc[1:2]
print(rows_one_to_two)

calories    380
duration     40
Name: 1, dtype: int64

   calories  duration
1       380        40
2       390        45


In [9]:
# External data is loaded with the pandas reader that matches the file type (here: CSV).
bank_csv_path = 'Dataset/bank-full.csv'
df = pd.read_csv(bank_csv_path)

# to_string() renders the whole frame; a plain print(df) would instead be truncated
# to the first and last 5 rows once the frame exceeds pd.options.display.max_rows.
print(df.to_string())

       age            job   marital  education default  balance housing loan    contact  day month  duration  campaign  pdays  previous poutcome    y
0       58     management   married   tertiary      no     2143     yes   no    unknown    5   may       261         1     -1         0  unknown   no
1       44     technician    single  secondary      no       29     yes   no    unknown    5   may       151         1     -1         0  unknown   no
2       33   entrepreneur   married  secondary      no        2     yes  yes    unknown    5   may        76         1     -1         0  unknown   no
3       47    blue-collar   married    unknown      no     1506     yes   no    unknown    5   may        92         1     -1         0  unknown   no
4       33        unknown    single    unknown      no        1      no   no    unknown    5   may       198         1     -1         0  unknown   no
5       35     management   married   tertiary      no      231     yes   no    unknown    5   may  

In [10]:
# Maximum number of rows pandas prints before it truncates a frame; this option can be changed.
max_rows = pd.options.display.max_rows
print(max_rows)
    

60


In [11]:
# JSON files have the same structure as Python dictionaries, so they load straight into a DataFrame.
json_path = "Dataset/data.json"
df = pd.read_json(json_path)
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [12]:
# head() and tail() return the first and the last 5 rows by default.
print(df.head())
print()
print(df.tail())

# info() writes its summary straight to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0

     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.4
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4
<class 'pandas.core.frame.DataFrame'>
Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 6.6 KB
None


# Data cleaning / Preprocessing :
In any dataset we work on, we may need to correct wrongly formatted entries or handle null/non-existent values.
In this chapter, we'll explain how to fix those issues.

In [13]:
# Load the CSV dataset used in the cleaning examples and print it in full.
data_csv_path = "Dataset/data.csv"
df = pd.read_csv(data_csv_path)
print(df.to_string())

    Duration          Date  Pulse  Maxpulse  Calories
0         60  '2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45  '2020/12/05'    117       148     406.0
5         60  '2020/12/06'    102       127     300.0
6         60  '2020/12/07'    110       136     374.0
7        450  '2020/12/08'    104       134     253.3
8         30  '2020/12/09'    109       133     195.1
9         60  '2020/12/10'     98       124     269.0
10        60  '2020/12/11'    103       147     329.3
11        60  '2020/12/12'    100       120     250.7
12        60  '2020/12/12'    100       120     250.7
13        60  '2020/12/13'    106       128     345.3
14        60  '2020/12/14'    104       132     379.3
15        60  '2020/12/15'     98       123     275.0
16        60  '2020/12/16'     98       120     215.2
17        60  '2020/12/17'  

In [14]:
# info() prints the summary itself and returns None; calling it directly avoids a trailing "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB
None


Here we notice some null values in the 'Date' and 'Calories' columns, as well as some dates that are wrongly formatted in the dataset.

In [15]:
# Drop every row that contains at least one null value. dropna() returns a new
# frame; df.dropna(inplace=True) would modify df itself instead.
new_df = df.dropna()
# info() prints its own summary and returns None, so it is not wrapped in print()
# (doing so would emit a stray "None" line).
new_df.info()
print()
print(new_df.to_string())

# Fill every null value, in every column, with the same synthetic value.
new_df = df.fillna(130)
new_df.info()
print(new_df.to_string())

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  29 non-null     int64  
 1   Date      29 non-null     object 
 2   Pulse     29 non-null     int64  
 3   Maxpulse  29 non-null     int64  
 4   Calories  29 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB
None 

    Duration          Date  Pulse  Maxpulse  Calories
0         60  '2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45  '2020/12/05'    117       148     406.0
5         60  '2020/12/06'    102       127     300.0
6         60  '2020/12/07'    110       136     374.0
7        450  '2020/12/08'    104       134     253.3
8         30  '2020/12/09'    109       133     195.1
9         60  '2020/1

You may have noticed that when filling the null values in the dataframe, we gave the value '130' to ALL the null values across all columns in our dataframe.
That's why we need to be more specific when processing our data.

In [16]:
# Fill nulls in the 'Calories' column only: the dict maps a column name to its fill value.
# fillna() returns a new frame by default, so df itself is left untouched.
new_df = df.fillna({'Calories': 130})
# info() prints its own summary and returns None, so it is called without print().
new_df.info()
print(new_df.to_string())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  32 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB
None
    Duration          Date  Pulse  Maxpulse  Calories
0         60  '2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45  '2020/12/05'    117       148     406.0
5         60  '2020/12/06'    102       127     300.0
6         60  '2020/12/07'    110       136     374.0
7        450  '2020/12/08'    104       134     253.3
8         30  '2020/12/09'    109       133     195.1
9         60  '202

Sometimes, it's better to fill in the missing values with a calculated one, such as the mean, the median, or even the mode of the distribution. This way we get more 'natural', better-fitting values that will not disturb or alter our models.

In [21]:
# Candidate fill values for 'Calories'. mode() returns a Series (there can be several
# modes), whereas median() and mean() each return a single number.
df_mode = df['Calories'].mode()
print('the mode of the distribution is : ', df_mode)

df_median = df['Calories'].median()
print('the median of the distribution is : ', df_median)

df_mean = df['Calories'].mean()
print('the mean of the distribution is : ', df_mean)

the mode of the distribution is :  0    300.0
Name: Calories, dtype: float64
the median of the distribution is :  291.2
the mean of the distribution is :  304.68


In [34]:
# Replace the missing 'Calories' entries of df itself with the median of that column.
fill_values = {'Calories': df_median}
df.fillna(value=fill_values, inplace=True)
print(df.to_string())

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7         45 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90       112     291.2
19        60 2020-12-19    103 

Regarding the 'Date' column, we notice two problems:
    - One row contains a NaN value.
    - Another row contains a date entry that is wrongly formatted.
Luckily, we can easily fix this:

In [35]:
# Parse the 'Date' column into datetimes; format='mixed' infers the format of each
# entry individually, so differently formatted dates are all converted.
df['Date'] = pd.to_datetime(df['Date'], format='mixed')

print(df.to_string())

# Missing dates come back as NaT (Not a Time). They can only be replaced by a
# chosen value or removed; here every row without a date is dropped.
df.dropna(subset=['Date'], inplace=True)

# Concatenate instead of passing two arguments to print(): the default separator
# would insert a space before the table and shift its header row by one column.
print('\n\n' + df.to_string())

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7         45 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90       112     291.2
19        60 2020-12-19    103 

### Handling typos and wrong data
Sometimes, the data may just be wrong, and we may need to replace it. Here's how:

In [29]:
# Overwrite the single wrong cell (row label 7, column 'Duration') with the corrected value.
row_label, column_name = 7, 'Duration'
df.loc[row_label, column_name] = 45
print(df.head(n=10))

   Duration       Date  Pulse  Maxpulse  Calories
0        60 2020-12-01    110       130     409.1
1        60 2020-12-02    117       145     479.0
2        60 2020-12-03    103       135     340.0
3        45 2020-12-04    109       175     282.4
4        45 2020-12-05    117       148     406.0
5        60 2020-12-06    102       127     300.0
6        60 2020-12-07    110       136     374.0
7        45 2020-12-08    104       134     253.3
8        30 2020-12-09    109       133     195.1
9        60 2020-12-10     98       124     269.0


In [30]:
# duplicated() returns a boolean Series that is True for each row repeating an earlier one.
df.duplicated(keep='first')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool

In [36]:
# Remove the repeated rows from df itself, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)
print(df.head(n=20))

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7         45 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90       112     291.2
19        60 2020-12-19    103 

## Understanding Correlations with Pandas

Correlation analysis is a fundamental aspect of statistical analysis and data science. It helps us understand the relationship between variables in a dataset. In the context of data analysis using Python, the Pandas library provides powerful tools for computing and visualizing correlations.

### Theoretical Aspect of Correlation

Correlation refers to the degree of association or relationship between two variables. In statistics, correlation is typically quantified using correlation coefficients, such as Pearson correlation coefficient, Spearman rank correlation coefficient, and Kendall tau rank correlation coefficient.

- **Pearson Correlation Coefficient (r)**: Measures the linear relationship between two continuous variables. It ranges from -1 to 1, where:
  - r = 1 indicates a perfect positive linear relationship.
  - r = -1 indicates a perfect negative linear relationship.
  - r = 0 indicates no linear relationship.

- **Spearman Rank Correlation Coefficient**: Measures the strength and direction of association between two ranked variables. It is based on the ranks of the data values rather than their actual numerical values. Spearman correlation is more robust to outliers and does not assume linearity.

- **Kendall Tau Rank Correlation Coefficient**: Similar to Spearman correlation, Kendall tau correlation measures the strength and direction of association between two ranked variables. It also operates on the ranks of the data values and is particularly useful for monotonic non-linear relationships, small samples, and data that contain many tied values.

### Computing Correlations with Pandas

Pandas provides the `corr()` function, which computes pairwise correlation of columns, excluding NA/null values. It supports different correlation methods, including Pearson, Spearman, and Kendall. Here's a basic example of how to compute correlations using Pandas:


In [37]:
# Pairwise correlation matrices of df's columns, one per supported method.
pearson_corr = df.corr(method='pearson')
spearman_corr = df.corr(method='spearman')
kendall_corr = df.corr(method='kendall')

# Each heading is printed on its own line, followed by the matching matrix;
# the leading "\n" separates consecutive reports with a blank line.
reports = (
    ("Pearson Correlation Matrix:", pearson_corr),
    ("\nSpearman Correlation Matrix:", spearman_corr),
    ("\nKendall Correlation Matrix:", kendall_corr),
)
for heading, matrix in reports:
    print(heading)
    print(matrix)


Pearson Correlation Matrix:
          Duration      Date     Pulse  Maxpulse  Calories
Duration  1.000000  0.212084 -0.083417 -0.296585  0.343948
Date      0.212084  1.000000 -0.369328 -0.517827 -0.368510
Pulse    -0.083417 -0.369328  1.000000  0.261426  0.490624
Maxpulse -0.296585 -0.517827  0.261426  1.000000  0.336625
Calories  0.343948 -0.368510  0.490624  0.336625  1.000000

Spearman Correlation Matrix:
          Duration      Date     Pulse  Maxpulse  Calories
Duration  1.000000  0.202173 -0.146654 -0.297003  0.315709
Date      0.202173  1.000000 -0.498608 -0.538969 -0.359306
Pulse    -0.146654 -0.498608  1.000000  0.643129  0.578262
Maxpulse -0.297003 -0.538969  0.643129  1.000000  0.427201
Calories  0.315709 -0.359306  0.578262  0.427201  1.000000

Kendall Correlation Matrix:
          Duration      Date     Pulse  Maxpulse  Calories
Duration  1.000000  0.159539 -0.121995 -0.241082  0.264951
Date      0.159539  1.000000 -0.373219 -0.387442 -0.249715
Pulse    -0.121995 -0.373219