### Four Types of Analytics
![chart](../images/4-types-of-data-analytics-01.png)

In [None]:
#from shapely.geometry import Point
import pandas as pd
import numpy as np
import math
import geopandas as gpd
#geopandas reads files with the `read_file()` method
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
%matplotlib inline
!pip install pysal
import pysal.viz.mapclassify as mc
#!pip install seaborn
#!pip install plotly
#import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

#### Let's get some data and take a look

In [None]:
NSSCrisis1_df = pd.read_csv('../Data/NSSCrisisv.1.csv')

In [None]:
NSSCrisis2_df = pd.read_csv('../Data/NSSCrisisv.2.csv')

In [None]:
CallCenter_2020 = pd.read_csv('../data/Contact_center_call_volume_2020.csv')

In [None]:
CallCenter_2021 = pd.read_csv('../data/Contact_center_call_volume_2021.csv')

In [None]:
CallCenter_2022 = pd.read_csv('../data/Contact center call volume_2022.csv')

In [None]:
NSSCrisis1_df.shape

In [None]:
NSSCrisis2_df.shape

In [None]:
CallCenter_2020.shape

In [None]:
CallCenter_2021.shape

In [None]:
CallCenter_2022.shape

In [None]:
NSSCrisis1_df.head()

In [None]:
header_row = 1

In [None]:
NSSCrisis1_df.columns = NSSCrisis1_df.iloc[header_row]

In [None]:
NSSCrisis1_df = NSSCrisis1_df.drop(header_row)

In [None]:
NSSCrisis1_df = NSSCrisis1_df.reset_index(drop=True)
NSSCrisis1_df.head()

In [None]:
NSSCrisis2_df.head()

In [None]:
NSSCrisis2_df.columns = NSSCrisis2_df.iloc[header_row]

In [None]:
NSSCrisis2_df = NSSCrisis2_df.drop(header_row)

In [None]:
NSSCrisis2_df = NSSCrisis2_df.reset_index(drop=True)
NSSCrisis2_df.head()

In [None]:
NSSCrisis_df = pd.concat([NSSCrisis1_df, NSSCrisis2_df], axis=0, ignore_index=True)

In [None]:
NSSCrisis_df.shape

In [None]:
NSSCrisis_df.head()

In [None]:
CallCenter_df_temp = pd.concat([CallCenter_2020, CallCenter_2021], axis=0, ignore_index=True)

In [None]:
CallCenter_df_temp.shape

In [None]:
CallCenter_df = pd.concat([CallCenter_df_temp, CallCenter_2022], axis=0, ignore_index=True)

In [None]:
CallCenter_df.shape

In [None]:
NSSCrisis_df.head()

In [None]:
#header_row = 1

In [None]:
#NSSCrisis_df.columns = NSSCrisis_df.iloc[header_row]

In [None]:
#NSSCrisis_df = NSSCrisis_df.drop(header_row)

In [None]:
#NSSCrisis_df = NSSCrisis_df.reset_index(drop=True)
#NSSCrisis_df.head()

In [None]:
# Remove the row labelled 0 from the combined table.
# NOTE(review): header_row is 1 and only that row was dropped from each source
# file, so row 0 of NSSCrisis2_df presumably also survives the concat at a later
# index -- confirm whether it needs removing too.
NSSCrisis_df = NSSCrisis_df.drop(0)

In [None]:
NSSCrisis_df = NSSCrisis_df.reset_index(drop=True)

In [None]:
NSSCrisis_df.head()

In [None]:
print(list(NSSCrisis_df.columns))

In [None]:
# Per-column completeness report: number of non-missing values in each column
# and the share of all rows they represent. The row index of the report is the
# column position -- can be used to exclude columns with nan data.
# The row total is taken from the frame itself so the report stays correct if
# the input files (and therefore the number of rows) change.
total_rows = len(NSSCrisis_df)
idx_dic = {col: total_rows - NSSCrisis_df[col].isna().sum() for col in NSSCrisis_df.columns}
index_col_nan = pd.DataFrame(list(idx_dic.items()), columns=['col_name', 'num_good_data'])
index_col_nan['%'] = (index_col_nan['num_good_data'] / total_rows) * 100
# Lift the display limits so the whole report is shown without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
index_col_nan
#pd.reset_option('all')

In [None]:
good_data_df = index_col_nan.sort_values(['%'], ascending=[False])
good_data_df

In [None]:
NSSCrisis_df['PostalCode'].value_counts()
#df2=df[df['Fee']==22000]['Courses'].values[0]

In [None]:
NSSCrisis_df['CRISIS Demographics - Military'].value_counts(normalize=True)

In [None]:
NSSCrisis_df['CRISIS Demographics - Gender Identity'].value_counts()

### 1. Examine call volume and identify surge times, seasons, or events.

In [None]:
#Read first row
NSSCrisis_df.iloc[0]

In [None]:
# Capture the four-digit group of the first dd/dd/dddd match in the call-start text.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated by Python as an (invalid) string escape.
NSSCrisis_df['Year'] = NSSCrisis_df['CallDateAndTimeStart'].str.extract(r'\d{2}/\d{2}/(\d{4})', expand=True)
NSSCrisis_df['Year'].head()

In [None]:
# Capture the whole dd/dd/dddd date portion of the call-start text.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated by Python as an (invalid) string escape.
NSSCrisis_df['Date'] = NSSCrisis_df['CallDateAndTimeStart'].str.extract(r'(\d{2}/\d{2}/\d{4})', expand=True)
NSSCrisis_df['Date'].head()

In [None]:
# Capture the second two-digit group of the dd/dd/dddd match as the day.
# NOTE(review): this treats the text as month/day/year -- confirm the source format.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated by Python as an (invalid) string escape.
NSSCrisis_df['Day'] = NSSCrisis_df['CallDateAndTimeStart'].str.extract(r'\d{2}/(\d{2})/\d{4}', expand=True)
NSSCrisis_df['Day'].head()

In [None]:
# Capture the first two-digit group of the dd/dd/dddd match as the month.
# NOTE(review): this treats the text as month/day/year -- confirm the source format.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated by Python as an (invalid) string escape.
NSSCrisis_df['Month'] = NSSCrisis_df['CallDateAndTimeStart'].str.extract(r'(\d{2})/\d{2}/\d{4}', expand=True)
NSSCrisis_df['Month'].head()

### 3. Assess regional and demographic trends in call content and call volume.

In [None]:
call_counts = pd.DataFrame(NSSCrisis_df['CallerNum'].value_counts().reset_index().values, columns=['call_num', 'Count'])
call_counts_index = call_counts.sort_index(axis=0, ascending=True)
call_counts_index.head()
#helper that maps a call count onto a labelled volume bucket
def assign_length(row):
    """Return the call-volume bucket label for a single call count.

    The count is compared against ascending upper bounds and the label of
    the first bound it does not exceed is returned. Every count up to and
    including 20 gets the "2 - 20 calls" label; a count that exceeds every
    bound (or compares False against all of them) gets
    "More than 1000 calls".
    """
    upper_bounds = (
        (20, "2 - 20 calls"),
        (50, "21 - 50 calls"),
        (100, "51 - 100 calls"),
        (250, "101 - 250 calls"),
        (500, "251 - 500 calls"),
        (750, "501 - 750 calls"),
        (1000, "751 - 1000 calls"),
    )
    for limit, label in upper_bounds:
        if row <= limit:
            return label
    return "More than 1000 calls"
call_counts_index['Call_Volume_Group'] = call_counts_index['Count'].apply(assign_length)
call_counts_index.head()

In [None]:
extracted_col = NSSCrisis_df["PostalCode"]

In [None]:
# Attach a postal code to each caller by matching on the caller number.
# A plain index join pairs row i of the per-caller table with row i of the call
# log, which is an unrelated call, so the code is looked up by caller instead.
# NOTE(review): this takes the first non-missing PostalCode recorded for each
# caller -- confirm that is the desired choice when a caller has several.
caller_postal = NSSCrisis_df.groupby('CallerNum')['PostalCode'].first()
call_counts_index['PostalCode'] = call_counts_index['call_num'].map(caller_postal)
call_counts_index.head()

In [None]:
#Looking at Military Data
NSSCrisis_df.groupby(['CRISIS Demographics - Military','Year'])['CallReportNum'].count()

In [None]:
# Display the CallerNum column. The trailing '.' of the unfinished attribute
# access has been removed because it made the cell a SyntaxError.
NSSCrisis_df.CallerNum

In [None]:
NSSCrisis_df.groupby(['PostalCode','CallerNum'])['CallerNum'].count().sort_values(ascending=False)

In [None]:
NSSCrisis_df.PostalCode.head(10)

In [None]:
NSSCrisis_df.PostalCode.dtypes

In [None]:
NSSCrisis_df['PostalCode'] = pd.to_numeric(NSSCrisis_df['PostalCode'], errors='coerce')

In [None]:
NSSCrisis_df = NSSCrisis_df.dropna(subset=['PostalCode'])

In [None]:
NSSCrisis_df.PostalCode.head()

In [None]:
NSSCrisis_df.PostalCode.dtype

In [None]:
#save this method ... astype int did not work
NSSCrisis_df = NSSCrisis_df.astype({'PostalCode':'int'})

In [None]:
NSSCrisis_df.PostalCode.dtype

In [None]:
zip_calls = NSSCrisis_df['PostalCode'].value_counts().sort_values(ascending=False).rename_axis('zip_code').reset_index(name='num_calls')
zip_calls.head()

In [None]:
#add zip-median income data
tn_zip_median_income = pd.read_csv('../Data/tn_median_income_zip_transpose.csv')
tn_zip_median_income.head()

In [None]:
# Drop, in place, the columns found at positions 2 through 630 (selected by label).
surplus_columns = tn_zip_median_income.columns[2:631]
tn_zip_median_income.drop(columns=surplus_columns, inplace=True)
tn_zip_median_income.head()

In [None]:
zip_calls.columns

In [None]:
tn_zip_median_income.columns

In [None]:
# Left-join the income table onto the per-zip call counts, so zip codes with
# no matching income row are kept.
zip_combined = pd.merge(zip_calls, tn_zip_median_income, how='left', left_on='zip_code', right_on='Zip_Code')
# Remove, in place, the column sitting at position 2 of the merged frame
# (presumably the duplicate Zip_Code key -- verify against the income file).
position_two = zip_combined.columns[2:3]
zip_combined.drop(columns=position_two, inplace=True)
zip_combined.head()

In [None]:
# Strip comma separators from Median_Income; every value is turned into
# its string form first, so the column holds plain strings afterwards.
income_text = zip_combined["Median_Income"].map(str)
zip_combined["Median_Income"] = income_text.str.replace(",", "", regex=False)
zip_combined.head()

In [None]:
zip_combined.dtypes

In [None]:
# astype returns a new DataFrame rather than converting in place, so the
# result has to be assigned back for num_calls to actually become float.
zip_combined = zip_combined.astype({'num_calls':'float'})
zip_combined.dtypes

In [None]:
zip_combined['log_calls'] = np.log10(zip_combined['num_calls'])

In [None]:
zip_combined.dtypes

In [None]:
#change Median_Income to int
zip_combined['Median_Income'] = pd.to_numeric(zip_combined['Median_Income'], errors = 'coerce')

In [None]:
zip_combined.dtypes

In [None]:
# Scatter plot of log10 call volume (y) against median income (x) for the rows of zip_combined.
# No cmap is passed: without a `c` argument matplotlib ignores it and emits a warning.
plt.scatter(zip_combined.Median_Income, zip_combined.log_calls, alpha=0.5)
plt.ylabel('log(Call Volume)', fontsize=10, color='black', rotation=90, labelpad=15)
plt.title('Call Volume vs. Median Income \n (by Zip Code)', fontsize=12, fontname="Times New Roman Bold", color='black', pad=20)
plt.tick_params(axis='x', colors='black')
plt.tick_params(axis='y', colors='black')
plt.xlabel('Call Zip Code Median Income \n (dollars)', fontsize=11, color='black', rotation=0, labelpad=15)
# Vertical reference line at 67521 dollars, labelled as the national household median income.
plt.axvline(x=67521, color='r', linestyle='-', label=' U.S. Census \n National Household \n Median Income')
plt.legend(loc = 'upper right')
# Save before show(), which finishes the figure.
plt.savefig('median_income_v_calls_zip.png', dpi=200, transparent=False, bbox_inches='tight')
plt.show()

In [None]:
# Scatter plot of median income (y) against raw call volume (x) for the rows of zip_combined.
# No cmap is passed: without a `c` argument matplotlib ignores it and emits a warning.
plt.scatter(zip_combined.num_calls, zip_combined.Median_Income, alpha=0.1)
# Axis window for the plot. An earlier xlim/ylim pair that used to precede the
# scatter call was overwritten by these two lines and has been dropped.
plt.xlim(0,30)
plt.ylim(0,120000)
plt.xlabel('Call Volume', fontsize=11, color='black', rotation=0, labelpad=15)
plt.title('Call Volume vs. Median Income \n (by Zip Code)', fontsize=12, fontname="Times New Roman Bold", color='black', pad=20)
plt.tick_params(axis='x', colors='black')
plt.tick_params(axis='y', colors='black')
plt.ylabel('Call Zip Code Median Income \n (dollars)', fontsize=11, color='black', rotation=90, labelpad=15)
# Horizontal reference line at 67521 dollars, labelled as the national household median income.
plt.axhline(y=67521, color='r', linestyle='-', label='U.S. Census National Household Median Income')
plt.legend(loc = 'upper right',)
plt.show()

In [None]:
zip_combined.head(10)

In [None]:
# TODO: unfinished filter -- the column name 'call_' is incomplete and the
# closing bracket is missing, so the line is kept as a comment to stop it
# raising SyntaxError until the intended filter is written.
#NSSCrisis_df[NSSCrisis_df['call_']

In [None]:
#read in lat-long for zip codes
zip_lat_lng = pd.read_csv('../Data/US Zip Codes from 2013 Government Data.csv', dtype={"zip":"string"})
zip_lat_lng.head()

In [None]:
zip_combined.dtypes

In [None]:
zip_combined = zip_combined.astype({'zip_code':'str'})

In [None]:
zip_combined.dtypes

In [None]:
zip_lat_lng.dtypes

In [None]:
#join lat-lng to zip_combined
zip_combined_spat = zip_combined.merge(zip_lat_lng, left_on='zip_code', right_on='zip', how='left')
zip_combined_spat.head()

In [None]:
zip_combined_spat.drop(zip_combined_spat.iloc[:, 3:4], inplace = True, axis = 1)
zip_combined_spat.head()

In [None]:
#read in Jason's file to get calls per population normalization for each zip code
zip_pop = pd.read_csv('../data/zip_pop.csv', dtype={"zip":"string"})
zip_pop.head()

In [None]:
zip_pop.drop(zip_pop.columns[[1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,17]], inplace = True, axis = 1)
#zip_pop.drop(zip_pop.iloc[:, 1:7])
zip_pop.head()

In [None]:
zip_combined_spat.columns

In [None]:
zip_combined_spat_norm = zip_combined_spat.merge(zip_pop, left_on='zip_code', right_on='zip', how='left')
zip_combined_spat_norm.head()

In [None]:
zip_combined_spat_norm.drop(zip_combined_spat_norm.columns[[5]], inplace=True, axis=1)
zip_combined_spat_norm.head()

In [None]:
zip_combined_spat_norm['norm_calls'] = (zip_combined_spat_norm['num_calls'] / zip_combined_spat_norm['population'])
zip_combined_spat_norm.head()
#sort_values(by=['col1'], ascending = False)

In [None]:
zip_combined_spat_norm.sort_values(by=['norm_calls'], ascending=False, inplace=True)
zip_combined_spat_norm.head(50)

In [None]:
zip_combined_spat_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
zip_combined_spat_norm.head(75)

In [None]:
zip_combined_spat_norm.norm_calls.mean()

In [None]:
zip_combined_spat_norm.dtypes

In [None]:
#Veterans grouped by Zipcode
NSSCrisis_df.groupby(['CRISIS Demographics - Military','Year', 'PostalCode'])['CallReportNum'].count()

In [None]:
#All calls grouped by Zipcode (no military filter is applied here)
NSSCrisis_df.groupby(['PostalCode'])['CallReportNum'].count()

In [None]:
# Yearly call counts for postal code 38012.
# PostalCode is converted to an integer column earlier in the notebook, so the
# comparison uses an int; the string '38012' never matches and silently
# produces an empty result.
NSSCrisis_df[NSSCrisis_df['PostalCode'] == 38012].groupby(['Year'])['CallReportNum'].count()

In [None]:
#looking at number of calls by zipcode, largest first
#(normalize=False returns raw counts, not percentages)
zip_count = NSSCrisis_df['PostalCode'].value_counts(normalize=False).sort_values(ascending=False)
zip_count

In [None]:
#Value counts after group by example
#NSSCrisis_df.groupby('CRISIS Demographics - Education Level')['CRISIS Demographics - Occupation'].value_counts()


In [None]:
#Looking at content of military calls
NSSCrisis_df[NSSCrisis_df['CRISIS Demographics - Military'] == 'Veteran'].groupby('CRISIS Demographics - Military')['CRISIS Issues - Emotional State'].value_counts().sort_values(ascending=False)

In [None]:
#looking at homeless data
NSSCrisis_df['CRISIS Demographics - Homeless?'].value_counts()

In [None]:
#homeless military relationship
NSSCrisis_df[NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes'].groupby('CRISIS Demographics - Military')['CRISIS Demographics - Military'].value_counts().sort_values(ascending=False)

In [None]:
#homeless content
NSSCrisis_df[(NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes') & (NSSCrisis_df['CRISIS Issues - Emotional State'].str.contains('Anxious'))].groupby('CRISIS Issues - Emotional State')['CRISIS Issues - Emotional State'].count()

In [None]:
#homeless content count for Anxious
# .sum() adds up the True matches. .count() only counted the non-missing rows,
# which gives the same number whatever keyword is searched for. na=False makes
# rows with no recorded emotional state count as non-matches.
NSSCrisis_df[(NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes')]['CRISIS Issues - Emotional State'].str.contains('Anxious', na=False).sum()

In [None]:
#homeless content count for Depressed
# .sum() adds up the True matches. .count() only counted the non-missing rows,
# which gives the same number whatever keyword is searched for. na=False makes
# rows with no recorded emotional state count as non-matches.
NSSCrisis_df[(NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes')]['CRISIS Issues - Emotional State'].str.contains('Depressed', na=False).sum()

In [None]:
#homeless calls by month
NSSCrisis_df[NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes'].groupby(['Year','Month'])['CallReportNum'].count()

In [None]:
#homeless calls by date (one count per distinct Date value)
NSSCrisis_df[NSSCrisis_df['CRISIS Demographics - Homeless?'] == 'Yes'].groupby(['Date'])['CallReportNum'].count()

### Choropleth map creation

In [None]:
statemap = 'tl_2010_47_zcta510.shp'
#'../Data/tn_median_income_zip_transpose.csv'

In [None]:
zips = gpd.read_file(statemap)

In [None]:
#zips.sample(5)

In [None]:
zips.plot(figsize=(40,20))
plt.show()

In [None]:
zips.head()

In [None]:
zips.columns

In [None]:
zips.drop(['STATEFP10', 'GEOID10', 'CLASSFP10', 'MTFCC10',
       'FUNCSTAT10', 'ALAND10', 'AWATER10', 'INTPTLAT10', 'INTPTLON10',
       'PARTFLG10'], axis=1, inplace = True)

In [None]:
zip_combined_spat_norm.head()

In [None]:
#zip_combined_spat_norm.to_csv('zip_combined_spat_norm')

In [None]:
zips.columns

In [None]:
zip_combined_spat_norm.columns

In [None]:
zip_combined_spat_norm_tn = pd.read_csv('zip_combined_spat_norm_tn.csv',dtype={"zip_code":"string"})

In [None]:
zip_combined_spat_norm_tn.tail(20)

In [None]:
zip_combined_spat_norm.dtypes

In [None]:
zip_combined_spat_norm_tn.dtypes

In [None]:
merged = zips.merge(zip_combined_spat_norm_tn,
 left_on=['ZCTA5CE10'], 
 right_on=['zip_code'],
 how='left')

In [None]:
merged = merged.reset_index()

In [None]:
merged.head()

In [None]:
merged.plot(figsize=(40,20))
plt.show()

In [None]:
merged.columns

In [None]:
merged.drop(['index', 'zip_code'], axis=1, inplace=True)

In [None]:
merged.head()

In [None]:
merged[merged['norm_calls'] > .10]

In [None]:
#merged = merged.fillna(0.0)

In [None]:
merged.head()

In [None]:
zips.shape[0]

In [None]:
zip_combined_spat_norm_tn.shape[0]

In [None]:
merged.shape[0]

In [None]:
merged.norm_calls.min()

In [None]:
merged.norm_calls.mean()

In [None]:
merged.norm_calls.max()

In [None]:
#visualize with matplotlib: choropleth of the norm_calls column, saved to tn_normalized_calls.png
# The data is plotted as-is. An earlier line that overwrote 40 randomly chosen
# norm_calls values with NaN has been removed, because it altered `merged`
# itself and made every run produce a different map.

fig, ax = plt.subplots(1, figsize=(10,2))
ax.axis('off')
# rcParams only affects Axes created after this point, not `ax` above.
plt.rcParams['axes.facecolor'] = 'grey'
color = 'Blues'
# Summary statistics of the plotted column; the colour scale below uses fixed limits instead.
vmin, vcenter, vmax = merged.norm_calls.min(), merged.norm_calls.mean(), merged.norm_calls.max()
norm=plt.Normalize(vmin=.001, vmax=.014)
# Stand-alone mappable so the colour bar shares the map's colour map and limits.
sm = plt.cm.ScalarMappable(cmap=color, norm=norm)
sm.set_array([])  # public equivalent of assigning the private _A attribute
# The mappable is not attached to any Axes, so the colour bar is told which Axes to take space from.
cbar = fig.colorbar(sm, ax=ax)
cbar.ax.tick_params(labelsize=10)
# figsize is not passed here: geopandas ignores it when an existing Axes is supplied.
merged.plot('norm_calls', norm=norm, cmap=color, linewidth=0.8, ax=ax, edgecolor='grey', categorical=False, legend=False, missing_kwds={'color': 'lightgrey'})
plt.savefig('tn_normalized_calls.png', dpi=200, bbox_inches='tight', transparent=False)

In [None]:
fig, ax = plt.subplots(figsize=(6, 1))
fig.subplots_adjust(bottom=0.5)

cmap = mpl.cm.viridis
bounds = [-1, 2, 5, 7, 12, 15]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend='both')

fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
             cax=ax, orientation='horizontal',
             label="Discrete intervals with extend='both' keyword")

In [None]:
# Write the merged table one directory up. A forward slash is used, as in the
# other paths in this notebook: '..\m' is an invalid string escape and only
# works as a path separator on Windows.
merged.to_csv('../merged_zip_calls.csv')

In [None]:
bounds = [-1, 2, 5, 7, 12, 15]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend='both')

In [None]:
# Choropleth of the norm_calls column using the colour map named by `color`.
# NOTE(review): `ax` is not created in this cell; it is whatever Axes an earlier
# cell left bound (the last visible assignment is the colour-bar demo's subplot)
# -- confirm this is the intended drawing target.
# NOTE(review): figsize is passed together with ax; presumably it has no effect
# in that case -- verify against the geopandas plot documentation.
merged.plot('norm_calls',cmap=color,linewidth=0.8, ax=ax, edgecolor='0.8',figsize=(40,20))

In [None]:
# Two line plots on one 10x8 figure: run_before vs run_after in red and
# walk_before vs walk_after in blue, both with circle markers.
# NOTE(review): none of these four names is assigned anywhere in the visible
# part of this file, so this cell presumably fails with NameError as written --
# confirm where they are meant to come from, or remove the cell.
plt.figure(figsize=((10,8)))
plt.plot(run_before, run_after, 'ro-')
plt.plot(walk_before, walk_after, 'bo-')
plt.show()

In [None]:
#fig, ax = plt.subplots(1, figsize=(40, 20))
ax.axis('off')
ax.set_title('Heat Map of Call Volume by Zip Code', fontdict={'fontsize': '40', 'fontweight' : '3'})

In [None]:
# Colour bar for the 'BuPu' colour map over the fixed range 0.04 - 0.25.
color = 'BuPu'
vmin, vmax = .04, .25
sm = plt.cm.ScalarMappable(cmap=color, norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # public equivalent of assigning the private _A attribute
# The mappable is not attached to any Axes, so the colour bar is told which Axes to take space from.
# NOTE(review): `fig` and `ax` are the ones left bound by an earlier cell -- confirm they are the intended figure.
cbar = fig.colorbar(sm, ax=ax)
cbar.ax.tick_params(labelsize=40)

In [None]:
# Choropleth of the num_calls column with a colour bar as its legend.
fig, ax = plt.subplots(1, figsize=(40, 20))
# For a non-categorical column legend=True draws a colour bar, so its caption is
# passed through legend_kwds. ax.legend() had no labelled artists to list here.
# figsize is not passed: geopandas ignores it when an existing Axes is supplied.
merged.plot('num_calls', cmap=color, linewidth=0.8, ax=ax, edgecolor='0.8',
            categorical=False, legend=True, legend_kwds={'label': 'LEGEND'})
# Enlarge the colour-bar text to the size the old legend call asked for.
# Assumes geopandas appended the colour-bar Axes to the figure last -- TODO confirm.
cbar_ax = fig.axes[-1]
cbar_ax.tick_params(labelsize=15)
cbar_ax.yaxis.label.set_size(15)

In [None]:
# Choropleth of the norm_calls column with a colour bar as its legend.
fig, ax = plt.subplots(1, figsize=(40, 10))
# `merged` is the frame built above; `mergedn` is not defined anywhere in the visible file.
# For a non-categorical column legend=True draws a colour bar, so its caption is
# passed through legend_kwds. ax.legend() had no labelled artists to list here,
# and its fontsize and prop arguments contradicted each other.
# figsize is not passed: geopandas ignores it when an existing Axes is supplied.
merged.plot('norm_calls', cmap=color, linewidth=0.8, ax=ax, edgecolor='0.8',
            categorical=False, legend=True, legend_kwds={'label': 'LEGEND'})
# Enlarge the colour-bar text to the fontsize the old legend call asked for.
# Assumes geopandas appended the colour-bar Axes to the figure last -- TODO confirm.
cbar_ax = fig.axes[-1]
cbar_ax.tick_params(labelsize=15)
cbar_ax.yaxis.label.set_size(15)