### *Importing Libraries and Loading Data*

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category/module filter this silences every warning
# for the rest of the session, including deprecation warnings from pandas,
# seaborn and matplotlib. Consider narrowing it to the specific warning
# categories that are actually noisy.
warnings.filterwarnings('ignore')

In [2]:
# Read the five source CSV files into DataFrames. Paths are relative to the
# notebook's working directory; the targets on the left line up one-to-one,
# in order, with the paths listed on the right.
(
    smokers_df,
    prescriptions_df,
    metrics_df,
    fatalities_df,
    admissions_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./smokers.csv",
        "./prescriptions.csv",
        "./metrics.csv",
        "./fatalities.csv",
        "./admissions.csv",
    )
)

In [3]:
# Preview the first five rows of the smokers data.
smokers_df.head()

Unnamed: 0,Year,Method,Sex,16 and Over,16-24,25-34,35-49,50-59,60 and Over
0,1974,Unweighted,,46,44,51,52,50,33
1,1976,Unweighted,,42,42,45,48,48,30
2,1978,Unweighted,,40,39,45,45,45,30
3,1980,Unweighted,,39,37,46,44,45,29
4,1982,Unweighted,,35,35,38,39,41,27


In [4]:
# Dimensions of the smokers data as (rows, columns).
smokers_df.shape

(84, 9)

In [5]:
# Column labels of the smokers data.
smokers_df.columns

Index(['Year', 'Method', 'Sex', '16 and Over', '16-24', '25-34', '35-49',
       '50-59', '60 and Over'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
smokers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         84 non-null     int64 
 1   Method       84 non-null     object
 2   Sex          56 non-null     object
 3   16 and Over  84 non-null     int64 
 4   16-24        84 non-null     int64 
 5   25-34        84 non-null     int64 
 6   35-49        84 non-null     int64 
 7   50-59        84 non-null     int64 
 8   60 and Over  84 non-null     int64 
dtypes: int64(7), object(2)
memory usage: 6.0+ KB


### *Exploratory Data Analysis*

In [7]:
# Summary statistics of the numerical columns
# (count, mean, std, min, quartiles, max).
smokers_df.describe()

Unnamed: 0,Year,16 and Over,16-24,25-34,35-49,50-59,60 and Over
count,84.0,84.0,84.0,84.0,84.0,84.0,84.0
mean,1997.25,27.892857,31.345238,33.678571,31.285714,28.952381,18.75
std,12.142601,7.525087,5.954636,7.287819,8.161873,9.376855,7.441134
min,1974.0,17.0,20.0,20.0,20.0,18.0,10.0
25%,1987.5,22.0,26.0,28.0,25.0,22.0,13.0
50%,2000.5,27.0,33.0,34.0,29.5,26.0,16.0
75%,2007.25,32.0,35.0,37.0,36.0,34.25,23.0
max,2014.0,51.0,47.0,55.0,55.0,53.0,44.0


In [8]:
# Summary statistics of the categorical (object-dtype) columns:
# non-null count, number of unique values, most frequent value and its frequency.
smokers_df.describe(include='object')

Unnamed: 0,Method,Sex
count,84,56
unique,2,2
top,Weighted,Male
freq,45,28


### *Handling Missing Values*

In [9]:
# Count the missing values in each column.
smokers_df.isnull().sum()

Year            0
Method          0
Sex            28
16 and Over     0
16-24           0
25-34           0
35-49           0
50-59           0
60 and Over     0
dtype: int64

In [10]:
# Fraction of missing values in each column (0.0 = none, 1.0 = all missing).
smokers_df.isnull().mean()

Year           0.000000
Method         0.000000
Sex            0.333333
16 and Over    0.000000
16-24          0.000000
25-34          0.000000
35-49          0.000000
50-59          0.000000
60 and Over    0.000000
dtype: float64

In [11]:
# Replace missing entries in the Sex column with the label "Unknown".
# NOTE(review): the 28 rows with a missing Sex match the 28 Male and 28 Female
# rows, so they are presumably all-persons aggregates rather than records of
# unknown sex -- confirm against the data source before relying on this label.
smokers_df.loc[smokers_df['Sex'].isna(), 'Sex'] = "Unknown"

In [12]:
# Inspect the Sex column after the missing entries have been filled.
smokers_df['Sex']

0     Unknown
1     Unknown
2     Unknown
3     Unknown
4     Unknown
       ...   
79     Female
80     Female
81     Female
82     Female
83     Female
Name: Sex, Length: 84, dtype: object

In [13]:
# Re-check the missing-value counts after the fill; every column should report 0.
smokers_df.isnull().sum()

Year           0
Method         0
Sex            0
16 and Over    0
16-24          0
25-34          0
35-49          0
50-59          0
60 and Over    0
dtype: int64

In [14]:
# Re-check the missing-value fractions after the fill; every column should be 0.0.
smokers_df.isnull().mean()

Year           0.0
Method         0.0
Sex            0.0
16 and Over    0.0
16-24          0.0
25-34          0.0
35-49          0.0
50-59          0.0
60 and Over    0.0
dtype: float64

### *Data Analysis and Visualizations*

In [None]:
# Correlation heatmap of the numeric columns (Year and the age-band columns).
# `Method` and `Sex` hold strings; on pandas >= 2.0 `DataFrame.corr()` no longer
# drops non-numeric columns silently and raises instead, so the frame is
# restricted to numeric dtypes before computing the correlation matrix.
plt.figure(figsize=(15,10))
sns.heatmap(smokers_df.select_dtypes(include='number').corr(), annot=True)
plt.title("Correlation of numeric columns in smokers_df")
plt.show()

In [4]:
# Preview the first five rows of the prescriptions data.
prescriptions_df.head()

Unnamed: 0,Year,All Pharmacotherapy Prescriptions,Nicotine Replacement Therapy (NRT) Prescriptions,Bupropion (Zyban) Prescriptions,Varenicline (Champix) Prescriptions,Net Ingredient Cost of All Pharmacotherapies,Net Ingredient Cost of Nicotine Replacement Therapies (NRT),Net Ingredient Cost of Bupropion (Zyban),Net Ingredient Cost of Varenicline (Champix)
0,2014/15,1348,766,21,561.0,38145,18208,807,19129.0
1,2013/14,1778,1059,22,697.0,48767,24257,865,23646.0
2,2012/13,2203,1318,26,859.0,58121,28069,994,29058.0
3,2011/12,2532,1545,30,957.0,64552,30951,1216,32385.0
4,2010/11,2564,1541,36,987.0,65883,30808,1581,33494.0


In [9]:
# Dimensions of the prescriptions data as (rows, columns).
prescriptions_df.shape

(11, 9)

In [5]:
# Preview the first five rows of the metrics data.
metrics_df.head()

Unnamed: 0,Year,Tobacco Price\nIndex,Retail Prices\nIndex,Tobacco Price Index Relative to Retail Price Index,Real Households' Disposable Income,Affordability of Tobacco Index,Household Expenditure on Tobacco,Household Expenditure Total,Expenditure on Tobacco as a Percentage of Expenditure
0,2015,1294.3,386.7,334.7,196.4,58.7,19252.0,1152387.0,1.7
1,2014,1226.0,383.0,320.1,190.0,59.4,19411.0,1118992.0,1.7
2,2013,1139.3,374.2,304.5,190.3,62.5,18683.0,1073106.0,1.7
3,2012,1057.8,363.1,291.3,192.9,66.2,18702.0,1029378.0,1.8
4,2011,974.9,351.9,277.1,189.3,68.3,18217.0,990828.0,1.8


In [10]:
# Dimensions of the metrics data as (rows, columns).
metrics_df.shape

(36, 9)

In [6]:
# Preview the first five rows of the fatalities data.
fatalities_df.head()

Unnamed: 0,Year,ICD10 Code,ICD10 Diagnosis,Diagnosis Type,Metric,Sex,Value
0,2014,All codes,All deaths,All deaths,Number of observed deaths,,459087
1,2014,C33-C34 & C00-C14 & C15 & C32 & C53 & C67 & C6...,All deaths which can be caused by smoking,All deaths which can be caused by smoking,Number of observed deaths,,235820
2,2014,C00-D48,All cancers,All cancers,Number of observed deaths,,136312
3,2014,J00-J99,All respiratory diseases,All respiratory diseases,Number of observed deaths,,61744
4,2014,I00-I99,All circulatory diseases,All circulatory diseases,Number of observed deaths,,126101


In [13]:
# Dimensions of the fatalities data as (rows, columns).
fatalities_df.shape

(1749, 7)

In [7]:
# Preview the first five rows of the admissions data.
admissions_df.head()

Unnamed: 0,Year,ICD10 Code,ICD10 Diagnosis,Diagnosis Type,Metric,Sex,Value
0,2014/15,All codes,All admissions,All admissions,Number of admissions,,11011882
1,2014/15,C33-C34 & C00-C14 & C15 & C32 & C53 & C67 & C6...,All diseases which can be caused by smoking,All diseases which can be caused by smoking,Number of admissions,,1713330
2,2014/15,C00-D48,All cancers,All cancers,Number of admissions,,1691035
3,2014/15,J00-J99,All respiratory diseases,All respiratory diseases,Number of admissions,,611002
4,2014/15,I00-I99,All circulatory diseases,All circulatory diseases,Number of admissions,,907157


In [14]:
# Dimensions of the admissions data as (rows, columns).
admissions_df.shape

(2079, 7)