## Import Libraries

In [1]:
import pandas as pd
import sweetviz as sv
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import chardet
import datetime as dt

## Import Data

In [2]:
df1=pd.read_excel(r"C:\Users\drmat\OneDrive\Documents\Career Foundry\COVID_Suicide\02 Data\01 Raw\Suicide\2019-Public Health Impact Suicide.xlsx", sheet_name='Ethnicity_1', index_col=False)

In [3]:
df2=pd.read_excel(r"C:\Users\drmat\OneDrive\Documents\Career Foundry\COVID_Suicide\02 Data\01 Raw\Suicide\2020-Public Health Impact Suicide.xlsx", sheet_name='Ethnicity_1', index_col=False)

In [4]:
df3=pd.read_excel(r"C:\Users\drmat\OneDrive\Documents\Career Foundry\COVID_Suicide\02 Data\01 Raw\Suicide\2021-Public Health Impact Suicide.xlsx", sheet_name='Ethnicity_1', index_col=False)

## Merge Dataframes

In [5]:
# use concat to merge all 3 dataframes
age_frames=[df1,df2,df3]
df=pd.concat(age_frames, axis=0)
df.reset_index()

Unnamed: 0,index,Edition,Report Type,Measure Name,State Name,Rank,Value,Score,Lower CI,Upper CI,Source,Source Year
0,0,2019,2019 Annual,American Indian/Alaska Native,Alabama,,,,,,"CDC WONDER, Multiple Cause of Death Files",2017.0
1,1,2019,2019 Annual,American Indian/Alaska Native,Alaska,,54.2,,69.4,41.7,"CDC WONDER, Multiple Cause of Death Files",2017.0
2,2,2019,2019 Annual,American Indian/Alaska Native,Arizona,,26.6,,33.0,21.2,"CDC WONDER, Multiple Cause of Death Files",2017.0
3,3,2019,2019 Annual,American Indian/Alaska Native,Arkansas,,,,,,"CDC WONDER, Multiple Cause of Death Files",2017.0
4,4,2019,2019 Annual,American Indian/Alaska Native,California,,17.2,,23.8,12.0,"CDC WONDER, Multiple Cause of Death Files",2017.0
...,...,...,...,...,...,...,...,...,...,...,...,...
775,255,2021,2021 Annual,White,West Virginia,24.0,19.0,0.20,21.1,16.9,AHR data,
776,256,2021,2021 Annual,White,Wisconsin,10.0,15.4,-0.59,16.5,14.3,AHR data,
777,257,2021,2021 Annual,White,Wyoming,50.0,30.7,2.00,35.6,25.8,AHR data,
778,258,2021,2021 Annual,White,United States,,18.1,,18.3,17.9,AHR data,


## Inspect Dataframe

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 780 entries, 0 to 259
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Edition       780 non-null    int64  
 1   Report Type   780 non-null    object 
 2   Measure Name  780 non-null    object 
 3   State Name    780 non-null    object 
 4   Rank          150 non-null    float64
 5   Value         438 non-null    float64
 6   Score         50 non-null     float64
 7   Lower CI      525 non-null    float64
 8   Upper CI      525 non-null    float64
 9   Source        780 non-null    object 
 10  Source Year   260 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 73.1+ KB


## Clean Dataframe

In [7]:
#Drop/remove irrelevant column(s)  
df.drop(columns = ['Report Type','Rank','Score','Lower CI','Upper CI','Source','Source Year'],inplace=True)

In [8]:
df.head()

Unnamed: 0,Edition,Measure Name,State Name,Value
0,2019,American Indian/Alaska Native,Alabama,
1,2019,American Indian/Alaska Native,Alaska,54.2
2,2019,American Indian/Alaska Native,Arizona,26.6
3,2019,American Indian/Alaska Native,Arkansas,
4,2019,American Indian/Alaska Native,California,17.2


In [9]:
# rename columns to match other dataframes
df.rename(columns={df.columns[0]:'year',df.columns[1]:'ethnicity',df.columns[2]:'state',df.columns[3]:'suicide_deaths'},inplace=True)

In [10]:
# reorder dataframe to match other dataframes
df.iloc[:, [0,2,1,3]]

Unnamed: 0,year,state,ethnicity,suicide_deaths
0,2019,Alabama,American Indian/Alaska Native,
1,2019,Alaska,American Indian/Alaska Native,54.2
2,2019,Arizona,American Indian/Alaska Native,26.6
3,2019,Arkansas,American Indian/Alaska Native,
4,2019,California,American Indian/Alaska Native,17.2
...,...,...,...,...
255,2021,West Virginia,White,19.0
256,2021,Wisconsin,White,15.4
257,2021,Wyoming,White,30.7
258,2021,United States,White,18.1


In [11]:
#find rows with null vaues
df.isnull().sum()

year                0
ethnicity           0
state               0
suicide_deaths    342
dtype: int64

In [12]:
#find percent of null values per row
total_rows= len(df)
total_missing_rows = df.isnull().sum()
percent_missing_per_row = round((total_missing_rows/total_rows)* 100,2)
percent_missing_per_row[percent_missing_per_row > 0]

suicide_deaths    43.85
dtype: float64

In [13]:
# impute missing values with median
df['suicide_deaths'].fillna(df['suicide_deaths'].median(),inplace=True)

In [14]:
# recheck rows with null vaues
df.isnull().sum()

year              0
ethnicity         0
state             0
suicide_deaths    0
dtype: int64

In [15]:
# check unique value counts
df['year'].value_counts()

2019    260
2020    260
2021    260
Name: year, dtype: int64

In [16]:
# check unique value counts
df['state'].value_counts().sort_index(ascending=True)

Alabama                 15
Alaska                  15
Arizona                 15
Arkansas                15
California              15
Colorado                15
Connecticut             15
Delaware                15
District of Columbia    15
Florida                 15
Georgia                 15
Hawaii                  15
Idaho                   15
Illinois                15
Indiana                 15
Iowa                    15
Kansas                  15
Kentucky                15
Louisiana               15
Maine                   15
Maryland                15
Massachusetts           15
Michigan                15
Minnesota               15
Mississippi             15
Missouri                15
Montana                 15
Nebraska                15
Nevada                  15
New Hampshire           15
New Jersey              15
New Mexico              15
New York                15
North Carolina          15
North Dakota            15
Ohio                    15
Oklahoma                15
O

In [17]:
# check unique value counts
df['ethnicity'].value_counts().sort_index(ascending=True)

American Indian/Alaska Native    156
Asian/Pacific Islander           156
Black                            156
Hispanic                         156
White                            156
Name: ethnicity, dtype: int64

In [18]:
# check descriptive statistics of suicide_deaths column
round(df['suicide_deaths'].describe(),2)

count    780.00
mean      12.35
std        6.45
min        3.40
25%        9.50
50%       10.80
75%       11.92
max       66.80
Name: suicide_deaths, dtype: float64

In [19]:
# check for outliers
# create a function to find outliers using IQR

def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [20]:
# run defined function on suicide_deaths column
outliers = find_outliers_IQR(df['suicide_deaths'])

print('number of outliers: '+ str(len(outliers)))

print('max outlier value: '+ str(outliers.max()))

print('min outlier value: '+ str(outliers.min()))

outliers

number of outliers: 203
max outlier value: 66.8
min outlier value: 3.4


1      54.2
2      26.6
4      17.2
23     28.6
26     41.7
       ... 
253    16.9
254    18.9
255    19.0
257    30.7
258    18.1
Name: suicide_deaths, Length: 203, dtype: float64

In [21]:
# remove outliers by imputing with mean
# create a function to impute the mean
def impute_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   upper = df[~(df>(q3+1.5*IQR))].max()

   lower = df[~(df<(q1-1.5*IQR))].min()

   df = np.where(df > upper,

       df.mean(),

       np.where(

           df < lower,

           df.mean(),

           df

           )

       )

   return df

In [22]:
# run defined function on ouliers in suicide_deaths column
df['suicide_deaths'] = impute_outliers_IQR(df['suicide_deaths'])

In [23]:
# recheck descriptive statistics of suicide_deaths column
round(df['suicide_deaths'].describe(),2)

count    780.00
mean      10.69
std        1.86
min        5.90
25%       10.80
50%       10.80
75%       12.35
max       15.40
Name: suicide_deaths, dtype: float64

In [24]:
# create dataframe groups to match other dataframes
dfgroup=df.groupby(['year','state','ethnicity'])[['suicide_deaths']].agg('sum')

In [25]:
# round suicide_deaths to 2 decimal placed
dfgroup['suicide_deaths']=dfgroup['suicide_deaths'].round(2)

In [26]:
# check grouping dataframe
dfgroup.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,suicide_deaths
year,state,ethnicity,Unnamed: 3_level_1
2019,Alabama,American Indian/Alaska Native,10.8
2019,Alabama,Asian/Pacific Islander,10.8
2019,Alabama,Black,12.35
2019,Alabama,Hispanic,10.8
2019,Alabama,White,12.35
2019,Alaska,American Indian/Alaska Native,12.35
2019,Alaska,Asian/Pacific Islander,10.8
2019,Alaska,Black,10.8
2019,Alaska,Hispanic,10.8
2019,Alaska,White,12.35


In [27]:
#Use sweetviz to generate report
my_report = sv.analyze(df)
my_report.show_html('Ethnicity Suicides 2019-2021.html')

                                             |          | [  0%]   00:00 -> (? left)

Report Ethnicity Suicides 2019-2021.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
