In [25]:
# Load the raw smoking dataset from CSV into a DataFrame

import pandas as pd

df = pd.read_csv(filepath_or_buffer="Smoking_1980_2020.csv")

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,SMOKERS,TOT,PC_POP15,A,1964,43.0,
1,AUS,SMOKERS,TOT,PC_POP15,A,1966,40.0,
2,AUS,SMOKERS,TOT,PC_POP15,A,1969,37.0,
3,AUS,SMOKERS,TOT,PC_POP15,A,1974,37.8,
4,AUS,SMOKERS,TOT,PC_POP15,A,1976,38.4,
5,AUS,SMOKERS,TOT,PC_POP15,A,1980,36.0,
6,AUS,SMOKERS,TOT,PC_POP15,A,1983,35.4,
7,AUS,SMOKERS,TOT,PC_POP15,A,1986,30.6,
8,AUS,SMOKERS,TOT,PC_POP15,A,1989,28.6,
9,AUS,SMOKERS,TOT,PC_POP15,A,1992,26.0,


In [26]:
# Inspect the dtype pandas assigned to each column of df

df.dtypes

LOCATION       object
INDICATOR      object
SUBJECT        object
MEASURE        object
FREQUENCY      object
TIME            int64
Value         float64
Flag Codes     object
dtype: object

In [27]:
# Remove the 'Flag Codes' column, then preview the result

df = df.drop("Flag Codes", axis="columns")
df.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,AUS,SMOKERS,TOT,PC_POP15,A,1964,43.0
1,AUS,SMOKERS,TOT,PC_POP15,A,1966,40.0
2,AUS,SMOKERS,TOT,PC_POP15,A,1969,37.0
3,AUS,SMOKERS,TOT,PC_POP15,A,1974,37.8
4,AUS,SMOKERS,TOT,PC_POP15,A,1976,38.4


In [28]:
# Cast the TIME column to string

df = df.astype({"TIME": str})

In [29]:
# Display df's column dtypes again (TIME was just cast to string above)

df.dtypes

LOCATION      object
INDICATOR     object
SUBJECT       object
MEASURE       object
FREQUENCY     object
TIME          object
Value        float64
dtype: object

In [30]:
# Build a unique row identifier of the form "<LOCATION>-<TIME>", e.g. "AUS-1964".
# Vectorised string concatenation is used instead of a row-wise apply; the
# astype(str) keeps this working whether TIME is currently stored as int or str.
# Note: the DataFrame class on the pandas module must never be rebound to df,
# otherwise any later pd.DataFrame(...) call in this kernel fails.

df["LocationTime"] = df["LOCATION"] + "-" + df["TIME"].astype(str)
df.head(10)

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,LocationTime
0,AUS,SMOKERS,TOT,PC_POP15,A,1964,43.0,AUS-1964
1,AUS,SMOKERS,TOT,PC_POP15,A,1966,40.0,AUS-1966
2,AUS,SMOKERS,TOT,PC_POP15,A,1969,37.0,AUS-1969
3,AUS,SMOKERS,TOT,PC_POP15,A,1974,37.8,AUS-1974
4,AUS,SMOKERS,TOT,PC_POP15,A,1976,38.4,AUS-1976
5,AUS,SMOKERS,TOT,PC_POP15,A,1980,36.0,AUS-1980
6,AUS,SMOKERS,TOT,PC_POP15,A,1983,35.4,AUS-1983
7,AUS,SMOKERS,TOT,PC_POP15,A,1986,30.6,AUS-1986
8,AUS,SMOKERS,TOT,PC_POP15,A,1989,28.6,AUS-1989
9,AUS,SMOKERS,TOT,PC_POP15,A,1992,26.0,AUS-1992


In [31]:
# Restore TIME to a 64-bit integer column.
# "int64" is requested explicitly because plain `int` maps to the platform's
# default integer, which is 32-bit on Windows with NumPy < 2.

df["TIME"] = df["TIME"].astype("int64")
df.dtypes

LOCATION         object
INDICATOR        object
SUBJECT          object
MEASURE          object
FREQUENCY        object
TIME              int64
Value           float64
LocationTime     object
dtype: object

In [32]:
# Report, per column, whether any value is missing

df.isna().any()

LOCATION        False
INDICATOR       False
SUBJECT         False
MEASURE         False
FREQUENCY       False
TIME            False
Value           False
LocationTime    False
dtype: bool

In [33]:
# Discard the remaining columns that are not needed, then preview

df = df.drop(labels=["SUBJECT", "MEASURE", "FREQUENCY"], axis="columns")
df.head(n=10)

Unnamed: 0,LOCATION,INDICATOR,TIME,Value,LocationTime
0,AUS,SMOKERS,1964,43.0,AUS-1964
1,AUS,SMOKERS,1966,40.0,AUS-1966
2,AUS,SMOKERS,1969,37.0,AUS-1969
3,AUS,SMOKERS,1974,37.8,AUS-1974
4,AUS,SMOKERS,1976,38.4,AUS-1976
5,AUS,SMOKERS,1980,36.0,AUS-1980
6,AUS,SMOKERS,1983,35.4,AUS-1983
7,AUS,SMOKERS,1986,30.6,AUS-1986
8,AUS,SMOKERS,1989,28.6,AUS-1989
9,AUS,SMOKERS,1992,26.0,AUS-1992


In [34]:
# Print how many rows df holds

num_rows = df.shape[0]
print(num_rows)

2635


In [41]:
# Keep only rows whose TIME lies in 1990..2020 (both bounds inclusive)

filtered_df = df[df["TIME"].between(1990, 2020)]
filtered_df.head(n=10)

Unnamed: 0,LOCATION,INDICATOR,TIME,Value,LocationTime
9,AUS,SMOKERS,1992,26.0,AUS-1992
10,AUS,SMOKERS,1995,24.1,AUS-1995
11,AUS,SMOKERS,1998,22.1,AUS-1998
12,AUS,SMOKERS,2001,19.6,AUS-2001
13,AUS,SMOKERS,2004,17.8,AUS-2004
14,AUS,SMOKERS,2007,16.9,AUS-2007
15,AUS,SMOKERS,2010,15.3,AUS-2010
16,AUS,SMOKERS,2013,13.0,AUS-2013
17,AUS,SMOKERS,2016,12.4,AUS-2016
18,AUS,SMOKERS,2019,11.2,AUS-2019


In [42]:
# Print how many rows filtered_df holds

num_rows = filtered_df.shape[0]
print(num_rows)

1997


In [43]:
# Make LocationTime the first column: pop() removes it from the frame and
# returns it, and insert() puts it back at position 0

filtered_df.insert(0, "LocationTime", filtered_df.pop("LocationTime"))
filtered_df.head(n=10)

Unnamed: 0,LocationTime,LOCATION,INDICATOR,TIME,Value
9,AUS-1992,AUS,SMOKERS,1992,26.0
10,AUS-1995,AUS,SMOKERS,1995,24.1
11,AUS-1998,AUS,SMOKERS,1998,22.1
12,AUS-2001,AUS,SMOKERS,2001,19.6
13,AUS-2004,AUS,SMOKERS,2004,17.8
14,AUS-2007,AUS,SMOKERS,2007,16.9
15,AUS-2010,AUS,SMOKERS,2010,15.3
16,AUS-2013,AUS,SMOKERS,2013,13.0
17,AUS-2016,AUS,SMOKERS,2016,12.4
18,AUS-2019,AUS,SMOKERS,2019,11.2


In [44]:
# Print the row count of filtered_df once more

num_rows = len(filtered_df.index)
print(num_rows)

1997


In [45]:
# Write filtered_df to a new CSV file, leaving out the index column

filtered_df.to_csv(path_or_buf="smoking_clean.csv", index=False)