In [5]:
import pandas as pd

survey_df = pd.read_csv('survey.csv')

In [9]:
survey_df.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64

In [11]:
survey_df.drop(columns=['comments'], inplace=True)


In [13]:
survey_df['state'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df['state'].fillna('Unknown', inplace=True)


In [15]:
us_states = ['California', 'New York', 'Texas', 'Florida', 'Illinois', 'Ohio', 'Washington', 'Georgia', 'Pennsylvania', 'Michigan']


In [17]:
survey_df['Country'] = survey_df['Country'].replace(us_states, 'United States')


In [19]:
survey_df.drop(columns=['state'], inplace=True)


In [21]:
survey_df['Country'].value_counts().head(10)


Country
United States     752
United Kingdom    185
Canada             72
Germany            45
Ireland            27
Netherlands        27
Australia          21
France             13
India              10
New Zealand         8
Name: count, dtype: int64

In [23]:
survey_df['work_interfere'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  survey_df['work_interfere'].fillna('Unknown', inplace=True)


In [25]:
survey_df['work_interfere'] = survey_df['work_interfere'].fillna('Unknown')


In [27]:
interfere_map = {
    'Never': 0,
    'Rarely': 1,
    'Sometimes': 2,
    'Often': 3,
    'Unknown': -1  # sau np.nan dacă vrei să excludem din calcule
}

survey_df['work_interfere_score'] = survey_df['work_interfere'].map(interfere_map)


In [29]:
survey_df[['work_interfere', 'work_interfere_score']].head(10)
survey_df['work_interfere_score'].value_counts().sort_index()


work_interfere_score
-1    264
 0    213
 1    173
 2    465
 3    144
Name: count, dtype: int64

In [31]:
survey_df['diagnosed'] = survey_df['diagnosed'].str.lower().map({'yes': 1, 'no': 0})


KeyError: 'diagnosed'

In [33]:
print(survey_df.columns.tolist())



['Timestamp', 'Age', 'Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence', 'work_interfere_score']


In [35]:
survey_df['treatment'] = survey_df['treatment'].str.lower().map({'yes': 1, 'no': 0})


In [7]:
survey_df['treatment'] = survey_df['treatment'].str.strip().str.lower().map({'yes': 1, 'no': 0})


In [9]:
survey_df['self_employed'] = survey_df['self_employed'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0
})
survey_df['self_employed'] = survey_df['self_employed'].fillna(-1)


In [11]:
survey_df['remote_work'] = survey_df['remote_work'].str.strip().str.lower().map({'yes': 1, 'no': 0})


In [13]:
survey_df['tech_company'] = survey_df['tech_company'].str.strip().str.lower().map({'yes': 1, 'no': 0})


In [15]:
survey_df['family_history'] = survey_df['family_history'].str.strip().str.lower().map({'yes': 1, 'no': 0})


In [17]:
survey_df['benefits'] = survey_df['benefits'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0,
    'don’t know': -1,
    "don't know": -1  # în caz că apar ambele versiuni
})


In [19]:
survey_df['care_options'] = survey_df['care_options'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0,
    'not sure': -1
})


In [21]:
survey_df['seek_help'] = survey_df['seek_help'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0,
    'don’t know': -1,
    "don't know": -1
})


In [23]:
survey_df['anonymity'] = survey_df['anonymity'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0,
    'don’t know': -1,
    "don't know": -1
})


In [25]:
survey_df['mental_vs_physical'] = survey_df['mental_vs_physical'].str.strip().str.lower().map({
    'yes': 1,
    'no': 0,
    'don’t know': -1,
    "don't know": -1
})


In [27]:
def standardize_gender(g):
    g = str(g).strip().lower()
    if g in ['male', 'm', 'man', 'cis male', 'male-ish']:
        return 'Male'
    elif g in ['female', 'f', 'woman', 'cis female', 'femail']:
        return 'Female'
    else:
        return 'Other'

survey_df['gender_simplified'] = survey_df['Gender'].apply(standardize_gender)


In [29]:
leave_map = {
    'Very easy': 3,
    'Somewhat easy': 2,
    'Somewhat difficult': 1,
    'Very difficult': 0,
    "Don't know": -1,
    "Don’t know": -1
}

survey_df['leave_score'] = survey_df['leave'].map(leave_map)


In [31]:
survey_df.to_csv('survey_prelucrat.csv', index=False)
