In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
from ipywidgets import interactive
pd.set_option('display.max_columns', 500)

#### Abigail's Method for Reading Excel Files and Formatting Columns

In [None]:
crisis1 = pd.ExcelFile('../data/NSSCrisisv.1.xlsx')

In [None]:
crisis1.sheet_names

In [None]:
crisis1 = crisis1.parse('in')

In [None]:
crisis1.head(10)

In [None]:
crisis1.info()

In [None]:
# Trim leading/trailing whitespace from every text cell.
# The previous test `x.dtype == "str"` is False for object-dtype columns (how text is
# stored when read from Excel on pandas < 3), so nothing was ever stripped. Stripping
# cell by cell also leaves numbers, dates and NaN in mixed columns untouched, whereas
# `.str.strip()` would turn those non-string cells into NaN.
crisis1 = crisis1.apply(
    lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    if (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column))
    else column
)

In [None]:
header_row = 1
crisis1.columns = crisis1.iloc[header_row]
crisis1.head()

In [None]:
crisis1 = crisis1.drop(header_row)
crisis1 = crisis1.reset_index(drop=True)
crisis1.head()

In [None]:
crisis1.tail()

In [None]:
crisis2 = pd.ExcelFile('../data/NSSCrisisv.2.xlsx')

In [None]:
crisis2.sheet_names

In [None]:
crisis2 = crisis2.parse('in')

In [None]:
crisis2.head()

In [None]:
crisis2.tail()

In [None]:
crisis2.info()

In [None]:
# Discard columns, and then rows, that contain no values at all.
crisis2 = (
    crisis2
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)
crisis2.head()

In [None]:
# Trim leading/trailing whitespace from every text cell.
# The previous test `x.dtype == "str"` is False for object-dtype columns (how text is
# stored when read from Excel on pandas < 3), so nothing was ever stripped. Stripping
# cell by cell also leaves numbers, dates and NaN in mixed columns untouched, whereas
# `.str.strip()` would turn those non-string cells into NaN.
crisis2 = crisis2.apply(
    lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    if (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column))
    else column
)

In [None]:
# Promote the row at position `header_row` to be the column labels, then remove it.
crisis2.columns = crisis2.iloc[header_row]
# Drop that same row by looking up its index label from its position. The earlier
# dropna(axis=0) can leave gaps in the index, so the label `header_row` is not
# guaranteed to be the row at position `header_row`.
crisis2 = crisis2.drop(crisis2.index[header_row])
crisis2 = crisis2.reset_index(drop=True)
crisis2.head()

In [None]:
crisis = pd.concat([crisis2, crisis1])

In [None]:
crisis = crisis.reset_index(drop=True)
crisis.head()

In [None]:
crisis.shape

In [None]:
crisis.info()

#### EDA & Demographics (Occupation & Education Deep Dive)

In [None]:
crisis['CallDateAndTimeStart'] = crisis['CallDateAndTimeStart'].astype('str')

In [None]:
crisis['call_date'] = crisis.CallDateAndTimeStart.str[0:10]

In [None]:
crisis.head()

In [None]:
call_dates = crisis['call_date'].value_counts()
call_dates = pd.DataFrame(call_dates).reset_index()
call_dates.columns = ['date', 'number_calls']
call_dates.head()

In [None]:
crisis_issues = crisis.loc[:,['CRISIS Issues' in i for i in crisis.columns]]
crisis_issues.head()

In [None]:
crisis_notes = crisis_issues.count().to_frame().reset_index()

In [None]:
crisis_notes.columns = ['crisis_issue', 'call_volume']

In [None]:
crisis_notes = crisis_notes.sort_values(by='call_volume', ascending=False)
crisis_notes.head(20)

In [None]:
demographics = crisis.loc[:,['Demographics' in i for i in crisis.columns]]
demographics.head()

In [None]:
demographics.count()

In [None]:
crisis['call_year'] = crisis.CallDateAndTimeStart.str[0:4]

In [None]:
filter2020 = crisis['call_year'].isin(['2020'])

In [None]:
calls2020 = crisis[filter2020]
calls2020.head()

In [None]:
filter2021 = crisis['call_year'].isin(['2021'])

In [None]:
calls2021 = crisis[filter2021]
calls2021.head()

In [None]:
filter2022 = crisis['call_year'].isin(['2022'])

In [None]:
calls2022 = crisis[filter2022]
calls2022.head()

#### Now that the files are read in from Excel and the dates are formatted year-first, we should be able to pick up the previous demographic EDA work and look more closely at year and date trends for occupation and education calls.

##  Demographics Deep Dive - Occupation & Education

In [None]:
# Number of calls per recorded occupation (missing values are excluded by value_counts).
crisis['CRISIS Demographics - Occupation'].value_counts()

In [None]:
# The same tally expressed as a share of the non-missing occupation values.
crisis['CRISIS Demographics - Occupation'].value_counts(normalize=True)

In [None]:
# Occupation counts broken out within each education level.
crisis.groupby('CRISIS Demographics - Education Level')['CRISIS Demographics - Occupation'].value_counts()

In [None]:
goodwill = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'Works at Goodwill']
goodwill.head()

In [None]:
goodwill['CallerNum'].value_counts()
#Does this mean its the same caller?

In [None]:
top5occ = crisis.loc[crisis['CRISIS Demographics - Occupation'].isin(['Works at Goodwill', 'News Anchor', 'Construction Worker', 'bagger at Kroger', 'Substitute Teacher'])]
top5occ.head()

In [None]:
top5occ['CallerNum'].value_counts()
#Again, are we looking at a singular caller for each top occupation?

In [None]:
crisis['CallerNum'].value_counts()

In [None]:
crisis.groupby('CallerNum')['CRISIS Demographics - Occupation'].value_counts()

In [None]:
# Call-type mix for the top-5-occupation subset, as proportions.
top5occ['CRISIS Call Information - Call Type'].value_counts(normalize=True)

In [None]:
# Call-type mix for the top-5-occupation subset, as raw counts.
top5occ['CRISIS Call Information - Call Type'].value_counts()

In [None]:
# Age distribution of the subset, as proportions.
top5occ['CRISIS Demographics - Age'].value_counts(normalize=True)

In [None]:
# Age distribution of the subset, as raw counts.
top5occ['CRISIS Demographics - Age'].value_counts()

In [None]:
# Summary statistics of the CallLength column for the subset.
top5occ['CallLength'].describe()

In [None]:
# Share of the subset's calls falling in each call year.
top5occ['call_year'].value_counts(normalize=True)

In [None]:
# Number of the subset's calls in each call year.
top5occ['call_year'].value_counts()

In [None]:
# Occupation counts within each call year, across all calls.
crisis.groupby('call_year')['CRISIS Demographics - Occupation'].value_counts()

In [None]:
# Calls per year for each of the subset's occupations.
top5occ.groupby('CRISIS Demographics - Occupation')['call_year'].value_counts()

In [None]:
# The same breakdown as a within-occupation proportion.
top5occ.groupby('CRISIS Demographics - Occupation')['call_year'].value_counts(normalize=True)

In [None]:
# Number of 'Works at Goodwill' calls on each date.
goodwill['call_date'].value_counts()

In [None]:
newsanchor = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'News Anchor']
newsanchor.head()

In [None]:
construction = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'Construction Worker']
construction.head()

In [None]:
subteacher = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'Substitute Teacher']
subteacher.head()

In [None]:
subteacher = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'Substitute Teacher']
subteacher.head()

In [None]:
krogerbagger = crisis.loc[crisis['CRISIS Demographics - Occupation'] == 'bagger at Kroger']
krogerbagger.head()

In [None]:
# Summary statistics of CallLength for each single-occupation subset, one cell per
# occupation so the outputs can be compared side by side.
newsanchor['CallLength'].describe()

In [None]:
goodwill['CallLength'].describe()

In [None]:
construction['CallLength'].describe()

In [None]:
subteacher['CallLength'].describe()

In [None]:
krogerbagger['CallLength'].describe()

In [None]:
# Calls per postal code within the top-5-occupation subset.
top5occ['PostalCode'].value_counts()

In [None]:
# The same tally as proportions.
top5occ['PostalCode'].value_counts(normalize=True)

In [None]:
# Calls per date within the top-5-occupation subset.
top5occ['call_date'].value_counts()

In [None]:
# Education-level counts within each postal code, across all calls.
crisis.groupby('PostalCode')['CRISIS Demographics - Education Level'].value_counts()

In [None]:
top5zip = crisis.loc[crisis['PostalCode'].isin([37075, 37115, 38012, 37211, 37138])]
top5zip.head()

In [None]:
top5zip.groupby('PostalCode')['CRISIS Demographics - Education Level'].value_counts(normalize=True)

In [None]:
top5zip.groupby('PostalCode')['CRISIS Demographics - Education Level'].value_counts()

In [None]:
top5zip.groupby('PostalCode')['call_year'].value_counts(normalize=True)

In [None]:
top5zip.groupby('PostalCode')['call_year'].value_counts()

In [None]:
# Call-year mix within each education level, as proportions.
crisis.groupby('CRISIS Demographics - Education Level')['call_year'].value_counts(normalize=True)

In [None]:
# Call-year mix within each education level, as raw counts.
crisis.groupby('CRISIS Demographics - Education Level')['call_year'].value_counts()

In [None]:
# Occupation counts among the calls selected into calls2022.
calls2022['CRISIS Demographics - Occupation'].value_counts()

In [None]:
# Postal-code counts within each call year, across all calls.
crisis.groupby('call_year')['PostalCode'].value_counts()

In [None]:
# Postal-code counts among the calls selected into calls2022.
calls2022['PostalCode'].value_counts()

In [None]:
# Convert call_year from text to a number; values that cannot be parsed become NaN
# (errors='coerce').
# NOTE(review): subsets built in earlier cells (top5occ, top5zip, calls2020/2021/2022)
# were taken before this conversion and still hold the text years. Re-running the
# earlier `isin(['2020'])`-style filter cells after this one would compare numbers
# against strings and select no rows.
crisis['call_year'] = pd.to_numeric(crisis['call_year'], errors='coerce')

In [None]:
# One panel per top-5 occupation, showing how many calls were logged in each year.
top5_occupation_calls = crisis.loc[
    crisis['CRISIS Demographics - Occupation'].isin(
        ['Works at Goodwill', 'News Anchor', 'Construction Worker', 'bagger at Kroger', 'Substitute Teacher']
    )
]
# Use one shared, sorted list of years so every panel places the same year at the
# same x position even when an occupation has no calls in some year.
call_year_order = sorted(top5_occupation_calls['call_year'].dropna().unique())
g = sns.FacetGrid(
    top5_occupation_calls,
    col='CRISIS Demographics - Occupation',
    hue_kws={'color' : ['darkgray']},
)
# countplot tallies rows per call_year. The previous barplot(x='call_year', y='call_year')
# drew the mean of the year values themselves, so bar heights were years, not call volume.
g.map_dataframe(sns.countplot, x='call_year', order=call_year_order)
# The x axis carries the call year; the occupation is shown in each panel title.
g.set_axis_labels('Call Year', 'Call Volume')
g.set_titles(col_template='{col_name}')
plt.savefig('Top 5 Occupations - Call Volume Per Year.png');

In [None]:
yearlyocc_calls = top5occ.groupby('CRISIS Demographics - Occupation')['call_year'].value_counts().unstack()

In [None]:
yearlyocc_calls

In [None]:
ax = yearlyocc_calls.plot(kind='bar', figsize=(7,5), xlabel='Occupation', ylabel='Call Volume', rot=0, color=['#440154', '#3B528B'])
ax.legend(title='Call Year', bbox_to_anchor=(1,1), loc='upper left')
plt.xticks(rotation=60)
plt.tight_layout()
plt.savefig('Call Volume by Year for Top 5 Occupations')
plt.show();

In [None]:
phoneline = crisis.groupby('CRISIS Call Information - Phone Line')['call_year'].value_counts().unstack()

In [None]:
phoneline

In [None]:
ax = phoneline.plot(kind='bar', figsize=(10,10), xlabel='Phone Line', ylabel='Call Volume', rot=0, color=['#440154', '#3B528B', '#21918C'])
ax.legend(title='Call Year', bbox_to_anchor=(1,1), loc='upper left')
plt.xticks(rotation=60)
plt.tight_layout()
plt.savefig('Annual Call Volume by Phone Line')
plt.show();

# Line-chart view of the phoneline table: one line per call-year column.
plt.rcParams['figure.figsize'] = (15,9)
# Each line takes exactly one colour. Passing the whole list as `color=` to a single
# plt.plot call is rejected by matplotlib, so pick one entry per column, wrapping
# around if there are more columns than colours.
line_colors = ['#440154', '#3B528B', '#21918C']
for position, col in enumerate(phoneline.columns):
    plt.plot(phoneline[col], linewidth=2, label=col, color=line_colors[position % len(line_colors)])
plt.xlabel('Phone Line', fontsize=20)
# The plotted values are call counts; the call year is what the legend distinguishes.
plt.ylabel('Call Volume', fontsize=20)
plt.xticks(fontsize=18, rotation=60)
plt.yticks(fontsize=18)
plt.legend(fontsize=18)
#plt.set_cmap('viridis')
plt.show()

# Line chart of `cisgender_year` with a two-entry legend.
# NOTE(review): `cisgender_year` is not defined anywhere in the visible part of this
# notebook; unless it is created elsewhere this snippet raises NameError. Confirm
# whether this is reference code copied from another notebook.
ax_cis = cisgender_year.plot(kind='line', figsize=(10, 10), cmap="viridis")
# Legend entries are applied in plotting order; presumably the columns are ordered
# Female, Male. TODO confirm against the source frame.
cis_labels = ['Female', 'Male']
ax_cis.legend(title='Cisgender Individuals', loc='center left', labels=cis_labels)
plt.title('Cisgender Call Volume by Year')

# Hide the x-axis title.
ax_cis.set(xlabel=None)


In [None]:
# Demographic tallies for the top-5-occupation subset (raw counts per recorded value).
top5occ['CRISIS Demographics - Homeless?'].value_counts()

In [None]:
top5occ['CRISIS Demographics - Has a disability?'].value_counts()

In [None]:
top5occ['CRISIS Demographics - Ethnicity'].value_counts()

In [None]:
top5occ['CRISIS Demographics - Gender Identity'].value_counts()

In [None]:
top5occ['CRISIS Demographics - Income'].value_counts()

In [None]:
top5occ['CRISIS Demographics - Military'].value_counts()

In [None]:
# The same demographic tallies for the five-postal-code subset (no Ethnicity cell here).
top5zip['CRISIS Demographics - Homeless?'].value_counts()

In [None]:
top5zip['CRISIS Demographics - Has a disability?'].value_counts()

In [None]:
top5zip['CRISIS Demographics - Gender Identity'].value_counts()

In [None]:
top5zip['CRISIS Demographics - Income'].value_counts()

In [None]:
top5zip['CRISIS Demographics - Military'].value_counts()

In [None]:
# Which phone line the calls came in on, for each subset.
top5occ['CRISIS Call Information - Phone Line'].value_counts()

In [None]:
top5zip['CRISIS Call Information - Phone Line'].value_counts()

In [None]:
# Count calls per occupation, split by the phone line they came in on.
# barplot needs a numeric variable to aggregate, but the Phone Line column was passed
# as `y` and is treated as a category elsewhere in this notebook (value_counts,
# groupby). countplot with `hue` tallies the rows instead.
ax = sns.countplot(
    x='CRISIS Demographics - Occupation',
    hue='CRISIS Call Information - Phone Line',
    data=top5occ,
)
ax.set_ylabel('Call Volume');