In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re

In [2]:
# Show every column when a DataFrame is displayed (copy this line into any cell that needs it).
# The fully qualified key is used on purpose: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

# Explore Survey Data

In [3]:
# Location of the raw survey export.
data_dir = "~/Documents/CompCulture/Collabera/Data"
fn = os.path.join(data_dir, "Collabera_Survey_Responses_all.csv")

# Only these columns are read from the export; 'ExternalReference' becomes the index.
survey_columns = [
    'ExternalReference', 'Progress', 'Duration (in seconds)', 'Finished', 'ResponseId',
    'LocationLatitude', 'LocationLongitude',
    'Q35_1', 'Q35_2', 'Q35_3', 'Q35_4', 'Q35_5', 'Q35_6',
    'Q40', 'Q41',
    'Q34_1', 'Q34_2', 'Q34_3', 'Q34_4', 'Q34_5', 'Q34_6',
    'Q36', 'Q37', 'Q38',
]

# Question ids -> descriptive column names used in the rest of the notebook.
question_labels = {
    "Q35_1": "mael_1", "Q35_2": "mael_2", "Q35_3": "mael_3",
    "Q35_4": "mael_4", "Q35_5": "mael_5", "Q35_6": "mael_6",
    "Q40": "bergami_org", "Q41": "bergami_dept",
    "Q34_1": "disengagement_1", "Q34_2": "exhaustion_1", "Q34_3": "exhaustion_2",
    "Q34_4": "exhaustion_3", "Q34_5": "disengagement_2", "Q34_6": "disengagement_3",
    "Q36": "pros", "Q37": "cons", "Q38": "story",
}

# The first two rows after the header are skipped (presumably extra descriptive
# header rows of the export - TODO confirm), then columns are renamed and
# 'Progress' is cast to an integer.
survey_df = (
    pd.read_csv(fn, index_col=['ExternalReference'], header=0, usecols=survey_columns)[2:]
    .rename(columns=question_labels)
    .astype({'Progress': 'int32'})
)

# Keep only responses that carry an ExternalReference id.
survey_df = survey_df.loc[survey_df.index.notna()]

In [4]:
def numerify(df, col_name, new_col=None, likert=5):
    """Convert Likert-scale text answers in ``df[col_name]`` to integer scores.

    The frame is modified in place and also returned, so both
    ``numerify(df, ...)`` and ``df = numerify(df, ...)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text answers.
    col_name : str
        Column containing the answer labels.
    new_col : str, optional
        Column receiving the scores. When omitted (or falsy) the scores
        overwrite ``col_name``.
    likert : int, default 5
        Number of points on the scale; 4 and 5 are supported.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``likert`` is not a supported scale. Previously such a call did
        nothing, leaving the text answers unscored without any warning.

    Notes
    -----
    Only rows whose answer exactly equals one of the scale's labels are
    assigned a score; every other row of the target column is left as it was
    (NaN when the target column is newly created).
    """
    scales = {
        5: {
            'Strongly agree': 5,
            'Somewhat agree': 4,
            'Neither agree nor disagree': 3,
            'Somewhat disagree': 2,
            'Strongly disagree': 1,
        },
        4: {
            'Strongly agree': 4,
            'Agree': 3,
            'Disagree': 2,
            'Strongly disagree': 1,
        },
    }
    if likert not in scales:
        raise ValueError(
            "Unsupported likert scale %r; expected one of %s" % (likert, sorted(scales))
        )
    if not new_col:
        new_col = col_name
    for label, score in scales[likert].items():
        df.loc[df[col_name] == label, new_col] = score
    return df

def letter_to_number(df, col_name, new_col=None):
    """Translate the letters 'A'..'H' in ``df[col_name]`` to the numbers 1..8.

    The scores are written to ``new_col`` (or back into ``col_name`` when
    ``new_col`` is omitted). Rows holding anything other than one of the eight
    letters are not assigned. ``df`` is modified in place and returned.
    """
    target = new_col if new_col else col_name
    for rank, letter in enumerate('ABCDEFGH', start=1):
        df.loc[df[col_name] == letter, target] = rank
    return df

In [5]:
# mael_1 .. mael_6 are scored on the five-point scale.
for item in range(1, 7):
    survey_df = numerify(survey_df, f'mael_{item}', likert=5)

# disengagement_1..3 and exhaustion_1..3 are scored on the four-point scale.
for item in range(1, 4):
    for construct in ('disengagement', 'exhaustion'):
        survey_df = numerify(survey_df, f'{construct}_{item}', likert=4)

In [6]:
# Keep the original letter answers and add a numeric companion column for each.
for bergami_col in ('bergami_org', 'bergami_dept'):
    survey_df = letter_to_number(survey_df, bergami_col, bergami_col + '_num')

In [7]:
# Mean of the six mael items per respondent, computed column-wise instead of with a
# Python-level row-by-row apply. skipna=False keeps the original semantics: a
# respondent with any unanswered mael item gets NaN rather than a partial average.
mael_items = ['mael_' + str(item) for item in range(1, 7)]
survey_df['mael_avg'] = survey_df[mael_items].astype(float).mean(axis=1, skipna=False)

# Explore HR Data

In [8]:
# HR / performance export lives next to the survey export; rows are indexed by UID.
hr = os.path.join(data_dir, "Collabera_HR_Perf.csv")
hr_df = pd.read_csv(filepath_or_buffer=hr, index_col=['UID'])

In [9]:
from collections import Counter

# Ids in U1..U1750 that have no row in the HR data.
{'U' + str(i) for i in range(1, 1751)}.difference(hr_df.index.to_list())

# Figuring out why six observations are missing if we drop NA
# (the number of rows changes from 1727 to 1721 if we run hr_df.dropna()).
temp_hr = hr_df.drop(columns=['2019 Performance', '2020 Performance'])
temp_hr[temp_hr.isnull().any(axis=1)]  # EEO Code missing for six observations

# Used the line below to repeatedly investigate unique values of each column,
# most frequent value first.
Counter(hr_df['Department'].to_list()).most_common()

Counter(hr_df['2020 Performance'].to_list())

Counter({'Achieved': 846, nan: 733, 'Not Achieved': 146, 'Not Applicable': 2})

## Fixing HR Data Issues

In [10]:
# Expand state abbreviations discovered while exploring each column in the cell above
state_names = {'CO': 'Colorado', 'WI': 'Wisconsin', 'NC': 'North Carolina'}
for abbreviation, full_name in state_names.items():
    hr_df.loc[hr_df['Work State'] == abbreviation, 'Work State'] = full_name

# Create extra dummy for Collabera emps, excluding those working for Cognixia and Webxl.
# str.contains(..., na=False) scores a missing entity name as 0; the previous
# `'Collabera' in s` lambda raised TypeError on a NaN entry.
hr_df['Collabera'] = (
    hr_df['Legal Entity Name'].str.contains('Collabera', regex=False, na=False).astype(int)
)

# Fix race data: change nan entries to missing, collapse all Hispanic or Latino into Hispanic or Latino
hr_df['Race'] = hr_df['EEO Code'].fillna('Missing')
hr_df.loc[hr_df['Race'].str.contains('Hispanic or Latino', na=False), 'Race'] = 'Hispanic or Latino'
hr_df.loc[hr_df['Race'] == 'Race missing or unknown', 'Race'] = 'Missing'
# Division, department, and function all look fine - no typos or mistakes

# Label missing performance ratings as 'Not Applicable', then derive numeric dummies:
# 0 = 'Not Achieved', 1 = 'Achieved', NaN for everything else (including
# 'Not Applicable'), so rows without a usable rating can later be dropped with
# dropna on the dummy column.
perf_codes = {'Not Achieved': 0, 'Achieved': 1}
for year in ('2019', '2020'):
    rating_col = year + ' Performance'
    hr_df[rating_col] = hr_df[rating_col].fillna('Not Applicable')
    # astype(float) keeps the dummy a float column even when no rating is missing.
    hr_df[year + '_perf_dummy'] = hr_df[rating_col].map(perf_codes).astype(float)

In [11]:
# Inner join on the index: only ids present in both the survey data and the HR data survive.
survey_hr_df = survey_df.join(other=hr_df, how='inner')

In [12]:
# Respondents with a usable 2020 rating: first those who completed the whole survey,
# then everyone. The mask is built from the filtered frame itself, so the selection
# does not rely on pandas re-aligning a mask taken from the larger, unfiltered frame.
rated_2020 = survey_hr_df.dropna(subset=['2020_perf_dummy'])
print(rated_2020.loc[rated_2020['Progress'] == 100].shape)
print(rated_2020.shape)

(713, 45)
(852, 45)


In [14]:
# Persist the joined survey + HR frame; the index is written under the column name 'uid'.
preprocessed_path = '~/Documents/CompCulture/spacespace/Coco/analyses_data/preprocessed_survey_hr.csv'
survey_hr_df.to_csv(preprocessed_path, index_label='uid')

# Exploring and Coding Survey Responses

In [None]:
pd.set_option('display.max_colwidth', 0)
answers = []
# Fixed seed so that Restart & Run All draws the same responses every time.
sample_seed = 0

# Keep only respondents who answered all three open-ended questions, as strings.
survey_text_df = survey_hr_df.dropna(subset=['pros', 'cons', 'story']).astype({'pros':'str',
                             'cons':'str',
                             'story':'str'})

# Never ask for more rows than exist: DataFrame.sample raises ValueError otherwise.
num_responses = min(50, len(survey_text_df))

# .copy() gives an independent frame, so adding a notes column later neither
# touches survey_text_df nor triggers a chained-assignment warning.
sample_df = survey_text_df[['Gender', 'Race', 'mael_avg', 'bergami_org', 'bergami_dept', 'pros','cons','story']].sample(
    num_responses, random_state=sample_seed).copy()

In [None]:
# Start from an empty list every time this cell runs: re-running it (for example after
# an interrupted session) would otherwise append a second round of notes and leave
# `answers` longer than `sample_df`.
answers = []
for i in range(len(sample_df)):
    # Show one sampled response as a one-row frame, then record the coder's note for it.
    display(sample_df.iloc[[i]])
    inp = input("Notes:\t")
    answers.append(inp)


In [None]:
# Attach the notes collected above (one per sampled row, in order) and save the result.
notes_path = "~/Documents/CompCulture/spacespace/Coco/analyses_data/sampled_responses_notes_09302020.csv"
sample_df = sample_df.assign(**{'Coding Notes': answers})
sample_df.to_csv(notes_path)

In [None]:
pd.set_option('display.max_colwidth', 0)
# Text responses ordered by bergami_org, highest letter first. sort_values yields the
# same row order as reindexing on the sorted index, but also works when the index
# contains duplicate labels (reindex raises in that case).
survey_text_df.sort_values('bergami_org', ascending=False)[['Gender', 'Race','mael_avg', 'bergami_org', 'pros','cons','story']]

Quick thoughts from random coding of 50 responses:
Potentially identify topics, such as growth, family, that predict identification? Not finding linguistic signature of identification but what sort of topics predict identification?

Quick thoughts from coding top 30 and bottom 30 in bergami org identification:
Mark individuals who have repeated responses
A lot of references to family
Lack of personal life - contributes to identification?
Memories
ownership: Collabera has given me the best platform to grow and analyze my skills to the top level. Also it has helped me become a better person. For me Collabera is the my very own company and I see ways and out how to make myself most useful in the growth of the company. Collabera's culture and environment has given me the vibes, that I am a part of it and has recognized me to the top level.
Is similarity between survey response and official company language a measure of identification? Some people have really absorbed the 'we are the best' sentiment.
There doesn't seem to be enough signal in survey responses, especially when they are short. Some people make it clear that they identify: e.g., "We at Collabera follow our culture aggressively. Work Hard Play Hard and Insanely competitive are the best which defines us to the core. Nothing is negative here - People are positive so do our Company :)". But a lack of we-language doesn't necessarily mean low identification.
Similarity between response and average response predicts identification? the idea that the more you embody the language of everyone, the more you identify?
Growth seems to be a pro both for people who identify and for people who don't.


# Miscellaneous

Some of the analyses I ran here using log_we_they and log_we_i are marginally significant (log_we_they/mael and log_we_they/bergami_org) when using survey_df instead of survey_hr_df. These columns have been removed so that downstream analyses can use them more freely. The results also depend on which rows are dropped and which are kept: dropping only rows that are missing text data versus dropping rows that are missing any data. As these correlations are preliminary and do not account for any control variables, the analyses should be examined more carefully in R later.

In [None]:
# Great example: the three free-text answers of a single respondent, shown untruncated.
pd.set_option('display.max_colwidth', 0)

example_uid = 'U169'
free_text_cols = ['pros', 'cons', 'story']
survey_text_df.loc[example_uid, free_text_cols]