In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from dython.nominal import associations
from dython.nominal import identify_nominal_columns
plt.rcParams['font.size'] = 18.0

%matplotlib inline

pd.set_option('display.max_columns', 500)

### reading in crisis1

In [None]:
#https://stackoverflow.com/questions/26521266/using-pandas-to-pd-read-excel-for-multiple-worksheets-of-the-same-workbook
#open the first crisis workbook as an ExcelFile so its sheet names can be inspected
#NOTE: the name `crisis1` is rebound to a DataFrame in a later cell
crisis1 = pd.ExcelFile('../data/NSSCrisisv.1.xlsx')

In [None]:
#list the worksheet names in the workbook so the right sheet can be chosen
crisis1.sheet_names

In [None]:
#load the 'in' worksheet as a DataFrame
#reading straight from the path (rather than calling .parse() on whatever `crisis1` currently holds)
#keeps this cell re-runnable: once `crisis1` is a DataFrame it no longer has a .parse() method
crisis1 = pd.read_excel('../data/NSSCrisisv.1.xlsx', sheet_name='in')

In [None]:
#preview the first 20 rows of the raw sheet
crisis1.head(20)

In [None]:
#column dtypes and non-null counts
crisis1.info()

### cleaning up crisis1

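The first clean-up step drops every column and every row that contains only NaN values (see https://datatofish.com/drop-columns-with-nan/). The snippet below is shown for reference only: this is a markdown cell, so it does not run. The same two `dropna` calls are executed in the next code cell.

```python
crisis1 = crisis1.dropna(axis=1, how ='all')
crisis1 = crisis1.dropna(axis=0, how='all')
crisis1.head()
```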

In [None]:
#drop all-NaN columns and rows, then trim surrounding whitespace from every string cell
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha
crisis1 = crisis1.dropna(axis=1, how ='all')
crisis1 = crisis1.dropna(axis=0, how='all')

#the previous test `x.dtype == "str"` does not match object-dtype columns (how pandas < 3 stores text),
#so nothing was ever stripped. Strip cell by cell instead: a column-wide .str.strip() would turn the
#non-string cells of a mixed column (numbers, timestamps) into NaN.
crisis1 = crisis1.apply(
    lambda col: col.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    if pd.api.types.is_string_dtype(col.dtype)
    else col
)

In [None]:
#replace column names with row1 values
#https://www.adamsmith.haus/python/answers/how-to-convert-a-pandas-dataframe-row-to-column-headers-in-python
#header_row is a *position* (iloc), i.e. the second remaining row of the frame
#NOTE: this cell is not idempotent - running it twice promotes a different row's values to headers
header_row = 1
crisis1.columns = crisis1.iloc[header_row]
crisis1.head()

In [None]:
#remove the row that was promoted to column names, then renumber the rows from 0
#the header was picked by position (iloc) but drop() works on index labels; rows removed earlier by
#dropna can leave labels that no longer equal positions, so look the label up by position first
crisis1 = crisis1.drop(index=crisis1.index[header_row])
crisis1 = crisis1.reset_index(drop=True)
crisis1.head()

In [None]:
#check the last rows of the cleaned frame
crisis1.tail()

### reading in crisis2

In [None]:
#open the second crisis workbook as an ExcelFile so its sheet names can be inspected
#NOTE: the name `crisis2` is rebound to a DataFrame in a later cell
crisis2 = pd.ExcelFile('../data/NSSCrisisv.2.xlsx')

In [None]:
#list the worksheet names in the workbook so the right sheet can be chosen
crisis2.sheet_names

In [None]:
#load the 'in' worksheet as a DataFrame
#reading straight from the path (rather than calling .parse() on whatever `crisis2` currently holds)
#keeps this cell re-runnable: once `crisis2` is a DataFrame it no longer has a .parse() method
crisis2 = pd.read_excel('../data/NSSCrisisv.2.xlsx', sheet_name='in')

In [None]:
#preview the first 20 rows of the raw sheet
crisis2.head(20)

In [None]:
#preview the last rows of the raw sheet
crisis2.tail()

In [None]:
#column dtypes and non-null counts
crisis2.info()

### cleaning up crisis2

In [None]:
#drop every column, then every row, that contains only NaN values
#https://datatofish.com/drop-columns-with-nan/
crisis2 = (
    crisis2
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)
crisis2.head()

In [None]:
#trim surrounding whitespace from every string cell
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha
#the previous test `x.dtype == "str"` does not match object-dtype columns (how pandas < 3 stores text),
#so nothing was ever stripped. Strip cell by cell instead: a column-wide .str.strip() would turn the
#non-string cells of a mixed column (numbers, timestamps) into NaN.
crisis2 = crisis2.apply(
    lambda col: col.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    if pd.api.types.is_string_dtype(col.dtype)
    else col
)

In [None]:
#promote the row at position `header_row` to column names, then remove it from the data
#https://www.adamsmith.haus/python/answers/how-to-convert-a-pandas-dataframe-row-to-column-headers-in-python
#`header_row` is the constant defined during the crisis1 clean-up
crisis2.columns = crisis2.iloc[header_row]
#iloc is positional but drop() takes index labels; rows removed earlier by dropna can leave labels
#that no longer equal positions, so look the label up by position to remove the same row
crisis2 = crisis2.drop(index=crisis2.index[header_row])
crisis2 = crisis2.reset_index(drop=True)
crisis2.head()

### reading in call_volume

In [None]:
#https://stackoverflow.com/questions/26521266/using-pandas-to-pd-read-excel-for-multiple-worksheets-of-the-same-workbook
#open the call-volume workbook once; the cells below parse one worksheet each from it
call_volume = pd.ExcelFile('../data/Contact_center_call_volume_2020-2022.xlsx')
#list the worksheet names
call_volume.sheet_names

In [None]:
#load the '2020' worksheet from the already-open workbook
call_volume2020 = pd.read_excel(call_volume, sheet_name='2020')

In [None]:
#load the '2021' worksheet from the already-open workbook
call_volume2021 = pd.read_excel(call_volume, sheet_name='2021')

In [None]:
#load the '2022' worksheet from the already-open workbook
call_volume2022 = pd.read_excel(call_volume, sheet_name='2022')

In [None]:
#preview the 2020 call-volume sheet
call_volume2020.head()

In [None]:
#preview the first 20 rows of the 2021 call-volume sheet
call_volume2021.head(20)

### combining crisis1 and crisis2

In [None]:
#stack the two cleaned frames row-wise (crisis2 rows first, then crisis1 rows)
#each frame keeps its own 0..n index here, so index labels repeat until the index is reset
crisis = pd.concat([crisis2, crisis1])

In [None]:
#column dtypes and non-null counts of the combined frame
crisis.info()

In [None]:
#renumber the rows 0..n-1 so index labels are unique after the concat
crisis = crisis.reset_index(drop=True)
crisis.head(20)

In [None]:
#check the last 20 rows of the combined frame
crisis.tail(20)

In [None]:
#(rows, columns) of the combined frame
crisis.shape

## 1. Examine call volume and identify surge times, seasons, or events.

### Ideas: 2021 vs 2022, avg call volume per month, avg call volume per day of the month

In [None]:
#check how the call start timestamp column is stored
crisis['CallDateAndTimeStart'].dtype

In [None]:
#type of the column object itself (not of its values)
print(type(crisis['CallDateAndTimeStart']))

In [None]:
#this really won't work since the time is included - let's get a column w/ just the date
#(counts here are per exact start timestamp, not per day)
crisis['CallDateAndTimeStart'].value_counts()

In [None]:
#this gave a weird result: crisis['CallDateAndTimeStart'] = str(crisis['CallDateAndTimeStart'])
#(str() applied to a whole Series produces one string for the Series, not one string per value)
#maybe due to python versions? solution pieced together from here: https://stackoverflow.com/questions/22231592/pandas-change-data-type-of-series-to-string
#astype('str') converts value by value; the date/year/month columns below are sliced from this text
#NOTE(review): on older pandas, missing timestamps presumably become literal text here and would then
#show up as their own "date" downstream - confirm whether the column has missing values
crisis['CallDateAndTimeStart'] = crisis['CallDateAndTimeStart'].astype('str')

In [None]:
#first 10 characters of the timestamp text
#assumes the text starts with YYYY-MM-DD - TODO confirm
crisis['call_date'] = crisis['CallDateAndTimeStart'].str.slice(0, 10)

In [None]:
#call year column: first 4 characters of the timestamp text
crisis['call_year'] = crisis['CallDateAndTimeStart'].str.slice(0, 4)

In [None]:
#first 7 characters of the timestamp text
#presumably YYYY-MM, i.e. year and month together - TODO confirm
crisis['call_month'] = crisis['CallDateAndTimeStart'].str.slice(0, 7)

In [None]:
#check the new call_date / call_year / call_month columns
crisis.head(30)

In [None]:
#trying again with just the date (the call_date column created above)
#build a two-column frame for analysis: one row per date with the number of calls on that date
call_dates = (
    crisis['call_date']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='number_calls')
)
call_dates.head(50)

In [None]:
#summary statistics of calls per date
call_dates.describe()

In [None]:
#going to order call_dates by date so that when we plot it, we can see trends over time
#'date' holds text, so this is a string sort
call_dates = call_dates.sort_values(by='date')
call_dates.head()

In [None]:
#plot the number of calls per day over time
#'date' holds text; parse it so the x axis is a real time axis (evenly spaced in time, readable
#date ticks) instead of one categorical position per string. Values that do not parse as dates
#are left out of the plot, and the rows are sorted here so the line always runs chronologically.
daily_calls = (
    call_dates
    .assign(date=pd.to_datetime(call_dates['date'], errors='coerce'))
    .dropna(subset=['date'])
    .sort_values('date')
)
ax = daily_calls.plot(x='date', y='number_calls', kind='line', figsize=(14, 6))
ax.set_title('Crisis calls per day')
ax.set_xlabel('Call date')
ax.set_ylabel('Number of calls')
plt.show()

In [None]:
#make subsets of data by year. first, double check which years data comes from
#(number of calls per call_year value)
crisis['call_year'].value_counts()

In [None]:
#boolean mask: True for calls whose call_year is '2020'
filter2020 = crisis['call_year'] == '2020'

#subset of crisis restricted to those calls
calls2020 = crisis.loc[filter2020]
calls2020.head()

In [None]:
#boolean mask: True for calls whose call_year is '2021'
filter2021 = crisis['call_year'] == '2021'

#subset of crisis restricted to those calls
calls2021 = crisis.loc[filter2021]
calls2021.head()

In [None]:
#boolean mask: True for calls whose call_year is '2022'
filter2022 = crisis['call_year'] == '2022'

#subset of crisis restricted to those calls
calls2022 = crisis.loc[filter2022]
calls2022.head()

In [None]:
#clearly that's way too many data points, so let's look at the top 50 days
#TODO: not implemented yet - the statement below was left unfinished
#call_dates_top50 = 

## 2. Analyze trends in call content by looking for which issues are most frequent and what is the average count of issues per call.

In [None]:
#first, let's try to isolate columns with "CRISIS Issues" in the title
#(based on README, that's what we're looking for)
#https://towardsdatascience.com/interesting-ways-to-select-pandas-dataframe-columns-b29b82bbfb33
#(at that link, see section on selecting columns based on substring)

#filter(like=...) keeps the columns whose label contains the substring. Unlike `'text' in label`,
#it does not raise TypeError when a label is not a string (the headers were taken from a data row,
#so a blank header cell ends up as a NaN label).
crisis_issues = crisis.filter(like='CRISIS Issues')
crisis_issues.head()

In [None]:
#how many non-NaN values per issue column? build a frame with one row per issue column,
#named crisis_issue / call_volume, ordered from most to least frequently filled in
crisis_notes = (
    crisis_issues
    .count()
    .rename_axis('crisis_issue')
    .reset_index(name='call_volume')
    .sort_values(by='call_volume', ascending=False)
)

crisis_notes.head(20)

## 3. Assess regional and demographic trends in call content and call volume.

In [None]:
#refresher on the available columns before looking at regions and demographics
crisis.head(10)

In [None]:
 crisis['CRISIS Call Information - Phone Line'].value_counts()

In [None]:
#let's start with regional information
#number of calls per StateProvince value
crisis['StateProvince'].value_counts()

In [None]:
#if we could find json for these counties, something geospatial could be cool
#number of calls per CountyName value
crisis['CountyName'].value_counts()

In [None]:
#if we could find json for these zipcodes, something geospatial could be cool
#number of calls per PostalCode value
crisis['PostalCode'].value_counts()

In [None]:
#looking into demographics now: keep the columns whose label contains 'Demographics'
#filter(like=...) tolerates non-string column labels, where `'text' in label` raises TypeError
#.copy() makes this an independent frame, so adding columns to it later neither warns
#(SettingWithCopyWarning) nor depends on how pandas shares data with `crisis`
demographics = crisis.filter(like='Demographics').copy()
demographics.head(20)

In [None]:
#let's add in some date, regional and issue information as well
#mapping: new column name in demographics -> source column in crisis
extra_columns = {
    'call_year': 'call_year',
    'call_month': 'call_month',
    'call_source': 'CRISIS Call Information - Phone Line',
    'zipcode': 'PostalCode',
    'issue_suicide': 'CRISIS Issues - Suicide',
    'issue_financial': 'CRISIS Issues - Financial/Basic Needs',
    'issue_emotional_state': 'CRISIS Issues - Emotional State',
    'issue_relationship': 'CRISIS Issues - Relationships',
    'issue_mental_health': 'CRISIS Issues - Mental Health',
    'issue_physical_health': 'CRISIS Issues - Health/Physical',
    'issue_substances': 'CRISIS Issues - Substances',
}

#assign() builds a new frame instead of writing column by column into a frame that was selected
#out of `crisis` (which can raise SettingWithCopyWarning); rows are matched on the index as before
demographics = demographics.assign(
    **{new_name: crisis[source] for new_name, source in extra_columns.items()}
)
demographics.head()

In [None]:
#was getting an error about datetime when running categorical associations below
#I think it's because one of the age categories is being read as datetime (7-12)
#converting the values to str avoids that
#only the values that are present are converted: a plain astype('str') (on pandas < 3) also turns
#missing ages into the literal text 'nan', which then counts as a real answer in count(),
#value_counts() and the dropna(thresh=...) step
age_col = 'CRISIS Demographics - Age'
age = demographics[age_col]
demographics = demographics.assign(**{age_col: age.astype('str').where(age.notna())})

In [None]:
#to make correlation table more manageable, let's drop columns w/ less than 1000 values
#(thresh=1000 keeps a column only if it has at least 1000 non-NaN entries)
demographics1000 = demographics.dropna(thresh=1000, axis=1)
demographics1000.head()

In [None]:
#as expected, all demographic variables are categorical
#https://blog.knoldus.com/how-to-find-correlation-value-of-categorical-variables/
#pairwise association matrix over the retained columns, plotted as an annotated heatmap (plot=True, annot=True)
#nominal columns are auto-detected; nominal-nominal pairs use the 'cramer' measure
#NOTE(review): the result is not stored and the same computation is repeated in the next cell
associations(demographics1000, nominal_columns='auto', numerical_columns=None, mark_columns=False, nom_nom_assoc='cramer', num_num_assoc='pearson', ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver', cbar=True, vmax=1.0, vmin=None, plot=True, compute_only=False, clustering=False, title=None, filename=None)

In [None]:
#https://blog.knoldus.com/how-to-find-correlation-value-of-categorical-variables/
#same association matrix on a larger 30x30 figure; the result is kept for the styled table below
#presumably `filename` also writes the plot to complete_correlation.png - confirm against dython docs
complete_correlation= associations(demographics1000, filename= 'complete_correlation.png', figsize=(30,30))

In [None]:
#https://blog.knoldus.com/how-to-find-correlation-value-of-categorical-variables/
#association matrix as a colour-graded table with 2 decimals
df_complete_corr = complete_correlation['corr']
(
    df_complete_corr
    .dropna(axis=1, how='all')   #drop columns with no association values at all
    .dropna(axis=0, how='all')   #...and the matching empty rows
    .style
    .background_gradient(cmap='coolwarm', axis=None)   #axis=None: one colour scale for the whole table
    #Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; format() works on old and new versions
    .format('{:.2f}')
)

In [None]:
#which demographic factors have the most data collected? (analysis will be best w/ more data)
#non-NaN count per column
demographics.count()

In [None]:
#would be interesting to compare whether callers were suicidal to see if there's a relationship
#number of calls per marital status
demographics['CRISIS Demographics - Marital Status'].value_counts()

In [None]:
#share of each marital status within each call year (normalize=True gives proportions per year)
crisis.groupby('call_year')['CRISIS Demographics - Marital Status'].value_counts(normalize=True)

In [None]:
#exploring correlation between marital status and homelessness
#https://stackoverflow.com/questions/42563209/how-to-count-subgroups-of-categorical-data-in-a-pandas-dataframe
#contingency table: rows = marital status, columns = homeless answer
#left as the cell's last expression so it renders as a table rather than plain print() text
pd.crosstab(demographics['CRISIS Demographics - Marital Status'], demographics['CRISIS Demographics - Homeless?'])

In [None]:
#exploring correlation between mental health and suicidality
#https://stackoverflow.com/questions/42563209/how-to-count-subgroups-of-categorical-data-in-a-pandas-dataframe
#contingency table: one row per suicide-issue value, one column per mental-health-issue value
suicide_mh = pd.crosstab(
    index=demographics['issue_suicide'],
    columns=demographics['issue_mental_health'],
)

In [None]:
#preview the contingency table
suicide_mh.head()

In [None]:
#keep only the rows and the columns that contain at least one non-zero count
#https://stackoverflow.com/questions/22649693/drop-rows-with-all-zeros-in-pandas-data-frame
#(dropping the all-zero rows first cannot change which columns are all zero, so both masks
#can be computed from the same table and applied in one selection)
has_count = suicide_mh.ne(0)
suicide_mh_dropzeros = suicide_mh.loc[has_count.any(axis=1), has_count.any(axis=0)]

suicide_mh_dropzeros.head(20)

In [None]:
#largest single count in each mental-health column, sorted from highest to lowest
#https://www.geeksforgeeks.org/find-maximum-values-position-in-columns-and-rows-of-a-dataframe-in-pandas/
maxValues = (
    suicide_mh_dropzeros
    .max()
    .rename_axis('issue_mental_health')
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)
maxValues.head(50)

In [None]:
#for each suicide-issue row, the mental-health column holding that row's largest count
maxValueIndex = suicide_mh_dropzeros.idxmax(axis = 1)
print(maxValueIndex)

In [None]:
#interesting trends... would need to see how many are for repeat callers
#number of calls per occupation
demographics['CRISIS Demographics - Occupation'].value_counts()

In [None]:
#number of calls per military-status answer
demographics['CRISIS Demographics - Military'].value_counts()

In [None]:
#number of calls per gender identity
demographics['CRISIS Demographics - Gender Identity'].value_counts()

In [None]:
#share of each gender identity within each call year, as a one-column frame
#author's observation: female to male ratio slowly shrinks with time
gender_year = (
    crisis
    .groupby('call_year')['CRISIS Demographics - Gender Identity']
    .value_counts(normalize=True)
    .to_frame()
)
gender_year.head(30)

In [None]:
#number of calls per age category
demographics['CRISIS Demographics - Age'].value_counts()

In [None]:
#number of 2020 calls per age category
calls2020['CRISIS Demographics - Age'].value_counts()

In [None]:
#wanted to see if there was a way to get value counts by percentage and there IS!
#https://stackoverflow.com/questions/14281871/given-a-pandas-series-that-represents-frequencies-of-a-value-how-can-i-turn-tho
#normalize=True returns each category's share of the 2020 calls (fractions summing to 1) instead of raw counts
calls2020['CRISIS Demographics - Age'].value_counts(normalize=True)

In [None]:
#caitlyn b also told me there is a way to "group by" with values counts! exciting.. let's try
#number of calls per age category within each call year
crisis.groupby('call_year')['CRISIS Demographics - Age'].value_counts()

In [None]:
#number of calls per gender identity within each age category
crisis.groupby('CRISIS Demographics - Age')['CRISIS Demographics - Gender Identity'].value_counts()

In [None]:
#number of calls per race/ethnicity value
demographics['CRISIS Demographics - Race/Ethnicity'].value_counts()

In [None]:
#number of calls per disability answer
demographics['CRISIS Demographics - Has a disability?'].value_counts()

In [None]:
#number of calls per homelessness answer
demographics['CRISIS Demographics - Homeless?'].value_counts()

In [None]:
#exploring correlation between disability and homelessness
#https://stackoverflow.com/questions/42563209/how-to-count-subgroups-of-categorical-data-in-a-pandas-dataframe
#contingency table: rows = disability answer, columns = homeless answer
#left as the cell's last expression so it renders as a table rather than plain print() text
pd.crosstab(demographics['CRISIS Demographics - Has a disability?'], demographics['CRISIS Demographics - Homeless?'])

In [None]:
#share of each disability answer within each call year
crisis.groupby('call_year')['CRISIS Demographics - Has a disability?'].value_counts(normalize=True)

## 4. Determine the count and success rate of imminent risk calls.

In [None]:
#find imminent risk column(s): columns whose label contains 'Imminent'
#filter(like=...) tolerates non-string column labels, where `'text' in label` raises TypeError
imminent = crisis.filter(like='Imminent')
imminent.head()

In [None]:
#find success column(s): columns whose label contains 'Success'
#filter(like=...) tolerates non-string column labels, where `'text' in label` raises TypeError
success = crisis.filter(like='Success')
success.head()