## This script takes the MILQ REDCap metadata set for Denmark individuals and translates any Danish words into English. 

### print version of python

In [1]:
!python --version #3.9.7

Python 3.9.7


### install and import the googletrans library

In [14]:
!pip install googletrans
import googletrans
from googletrans import Translator



In [114]:
print(googletrans.LANGUAGES)

{'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'lat

In [115]:
import pandas as pd

### load in data as pandas data frame

In [149]:
df_danish = pd.read_csv('../../MILQMAINSTUDYDENMARK_DATA_2022-10-11_2253.csv', keep_default_na=False)
df_english = pd.read_csv('../../MILQMAINSTUDYDENMARK_DATA_2022-10-11_2253.csv', keep_default_na=False)

### look at the first rows of the dataframe

In [117]:
df_danish.head()

Unnamed: 0,mid,redcap_event_name,f101_cmid_q1,f101_cinfid_q1_1,f101_intid_q2,f101_doi_q3,f101_dob_q5,f101_knwtob_q5_1,f101_tob_q5_2,f101_preterm_q5_3,...,f5_time_q7___8,f5_reason_q7_1___1,f5_reason_q7_1___2,f5_reason_q7_1___3,f5_reason_q7_1___4,f5_reason_q7_1___5,f5_reason_q7_1___6,f5_other_q7_2,f5_comments_q22,f50_withdrawal_form_complete
0,MD001C,2__3_days_postpart_arm_1,MD001C,CD001C,schristensen,2018-01-09,2018-01-09,,,0.0,...,0,0,0,0,0,0,0,,,2.0
1,MD002V,2__3_days_postpart_arm_1,MD002V,CD002V,schristensen,2018-02-27,2018-02-01,1.0,13:02,0.0,...,0,0,0,0,0,0,0,,,
2,MD002V,10__349_months_arm_1,,,,,,,,,...,0,0,0,0,0,0,0,,,
3,MD002V,35__599_months_arm_1,,,,,,,,,...,0,0,0,0,0,0,0,,,
4,MD002V,60__849_months_arm_1,,,,,,,,,...,0,0,0,0,0,0,0,,,


### Read in the codebook to select only free text columns to translate

In [118]:
cbook = pd.read_excel('../../../REDCap_downloads/Denmark/MILQMAINSTUDYDENMARK_codebook.xlsx', header=0)

In [119]:
cbook.columns

Index(['#', 'Unnamed: 1', 'Unnamed: 2', 'Variable / Field Name', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Field Label\nField Note',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Field Attributes (Field Type, Validation, Choices, Calculations, etc.)',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33',
       'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37',
       'Unnamed: 38'],
      dtype='object')

In [120]:
cbook.head

<bound method NDFrame.head of                                                        # Unnamed: 1  \
0      Instrument:  F1.01 Newborn Screening Informati...        NaN   
1                                                      1        NaN   
2                                                      2        NaN   
3                                                    NaN        NaN   
4                                                    NaN        NaN   
...                                                  ...        ...   
40275                                               2313        NaN   
40276                                               2314        NaN   
40277                                                NaN        NaN   
40278                                                NaN        NaN   
40279                                                NaN        NaN   

      Unnamed: 2              Variable / Field Name  Unnamed: 4 Unnamed: 5  \
0            NaN                       

In [121]:
cbook["Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"].dtype

dtype('O')

In [122]:
cbook["Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"] = cbook["Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"].astype(str)

In [123]:
text_fields = cbook[cbook["Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"].str.contains('text')]

In [124]:
text_fields.head

<bound method NDFrame.head of           # Unnamed: 1 Unnamed: 2  \
1         1        NaN        NaN   
987       4        NaN        NaN   
988       5        NaN        NaN   
989       6        NaN        NaN   
991       8        NaN        NaN   
...     ...        ...        ...   
39202  2293        NaN        NaN   
39204  2295        NaN        NaN   
40215  2300        NaN        NaN   
40216  2301        NaN        NaN   
40274  2312        NaN        NaN   

                                   Variable / Field Name  Unnamed: 4  \
1                                                [ mid ]         NaN   
987                                    [ f101_intid_q2 ]         NaN   
988                                      [ f101_doi_q3 ]         NaN   
989                                      [ f101_dob_q5 ]         NaN   
991    [ f101_tob_q5_2 ]\nShow the !eld ONLY if: [f10...         NaN   
...                                                  ...         ...   
39202  [ f415_other_q

In [125]:
text_fields["Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"].unique

<bound method Series.unique of 1                                           text, Required
987      text, Required\nField Annotation: @APPUSERNAME...
988                              text (date_dmy), Required
989      text (date_dmy, Min: 2018-01-01, Max: 2020-01-...
991                                  text (time), Required
                               ...                        
39202                                       text, Required
39204                                text (time), Required
40215    text\nField Annotation: @USERNAME @READONLY @A...
40216                            text (date_dmy), Required
40274                                       text, Required
Name: Field Attributes (Field Type, Validation, Choices, Calculations, etc.), Length: 1039, dtype: object>

In [126]:
cbook_text = cbook[(cbook['Field Attributes (Field Type, Validation, Choices, Calculations, etc.)']=='text, Required')|(cbook['Field Attributes (Field Type, Validation, Choices, Calculations, etc.)']=="text")]

In [127]:
cbook_text.head

<bound method NDFrame.head of           # Unnamed: 1 Unnamed: 2  \
1         1        NaN        NaN   
1030     13        NaN        NaN   
1033     16        NaN        NaN   
1035     18        NaN        NaN   
1036     19        NaN        NaN   
...     ...        ...        ...   
39190  2287        NaN        NaN   
39200  2291        NaN        NaN   
39201  2292        NaN        NaN   
39202  2293        NaN        NaN   
40274  2312        NaN        NaN   

                                   Variable / Field Name  Unnamed: 4  \
1                                                [ mid ]         NaN   
1030   [ f101_other_q14_1a ]\nShow the !eld ONLY if: ...         NaN   
1033   [ f101_infform_q15_1_1 ]\nShow the !eld ONLY i...         NaN   
1035   [ f101_other_q15_2_1 ]\nShow the !eld ONLY if:...         NaN   
1036   [ f101_other_q15_2_2 ]\nShow the !eld ONLY if:...         NaN   
...                                                  ...         ...   
39190  [ f415_other_q

In [128]:
cbook_text_v2 = cbook_text[["Variable / Field Name", "Field Attributes (Field Type, Validation, Choices, Calculations, etc.)"]]

In [129]:
cbook_text_v2.head

<bound method NDFrame.head of                                    Variable / Field Name  \
1                                                [ mid ]   
1030   [ f101_other_q14_1a ]\nShow the !eld ONLY if: ...   
1033   [ f101_infform_q15_1_1 ]\nShow the !eld ONLY i...   
1035   [ f101_other_q15_2_1 ]\nShow the !eld ONLY if:...   
1036   [ f101_other_q15_2_2 ]\nShow the !eld ONLY if:...   
...                                                  ...   
39190  [ f415_other_q4_2 ]\nShow the !eld ONLY if: [f...   
39200  [ f415_clinic_q7_1 ]\nShow the !eld ONLY if: [...   
39201  [ f415_hosp_q7_2 ]\nShow the !eld ONLY if: [f4...   
39202  [ f415_other_q7_3 ]\nShow the !eld ONLY if: [f...   
40274  [ f5_other_q7_2 ]\nShow the !eld ONLY if: [f5_...   

      Field Attributes (Field Type, Validation, Choices, Calculations, etc.)  
1                                         text, Required                      
1030                                                text                      
1033        

In [130]:
cbook_text_v2["Variable / Field Name"] = cbook_text_v2["Variable / Field Name"].astype(str)
cbook_text_v2["Variable / Field Name"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbook_text_v2["Variable / Field Name"] = cbook_text_v2["Variable / Field Name"].astype(str)


1                                                  [ mid ]
1030     [ f101_other_q14_1a ]\nShow the !eld ONLY if: ...
1033     [ f101_infform_q15_1_1 ]\nShow the !eld ONLY i...
1035     [ f101_other_q15_2_1 ]\nShow the !eld ONLY if:...
1036     [ f101_other_q15_2_2 ]\nShow the !eld ONLY if:...
                               ...                        
39190    [ f415_other_q4_2 ]\nShow the !eld ONLY if: [f...
39200    [ f415_clinic_q7_1 ]\nShow the !eld ONLY if: [...
39201    [ f415_hosp_q7_2 ]\nShow the !eld ONLY if: [f4...
39202    [ f415_other_q7_3 ]\nShow the !eld ONLY if: [f...
40274    [ f5_other_q7_2 ]\nShow the !eld ONLY if: [f5_...
Name: Variable / Field Name, Length: 438, dtype: object

In [131]:
cbook_text_v3 = cbook_text_v2

cbook_text_v3["extracted.variable.names"] = cbook_text_v3["Variable / Field Name"].str.extract(r'(^\[.*\])', expand=True)
cbook_text_v3["extracted.variable.names"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbook_text_v3["extracted.variable.names"] = cbook_text_v3["Variable / Field Name"].str.extract(r'(^\[.*\])', expand=True)


1                         [ mid ]
1030        [ f101_other_q14_1a ]
1033     [ f101_infform_q15_1_1 ]
1035       [ f101_other_q15_2_1 ]
1036       [ f101_other_q15_2_2 ]
                   ...           
39190         [ f415_other_q4_2 ]
39200        [ f415_clinic_q7_1 ]
39201          [ f415_hosp_q7_2 ]
39202         [ f415_other_q7_3 ]
40274           [ f5_other_q7_2 ]
Name: extracted.variable.names, Length: 438, dtype: object

In [132]:
cbook_text_v3["extracted.variable.names.v2"]=cbook_text_v3["extracted.variable.names"].str.extract(r'(f.*[0-9][a-z]*)')
cbook_text_v3["extracted.variable.names.v2"] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbook_text_v3["extracted.variable.names.v2"]=cbook_text_v3["extracted.variable.names"].str.extract(r'(f.*[0-9][a-z]*)')


1                         NaN
1030        f101_other_q14_1a
1033     f101_infform_q15_1_1
1035       f101_other_q15_2_1
1036       f101_other_q15_2_2
                 ...         
39190         f415_other_q4_2
39200        f415_clinic_q7_1
39201          f415_hosp_q7_2
39202         f415_other_q7_3
40274           f5_other_q7_2
Name: extracted.variable.names.v2, Length: 438, dtype: object

In [133]:
variables_of_interest=cbook_text_v3["extracted.variable.names.v2"].tolist()
print(variables_of_interest)

[nan, 'f101_other_q14_1a', 'f101_infform_q15_1_1', 'f101_other_q15_2_1', 'f101_other_q15_2_2', 'f101_other_q15_2_3', 'f101_other_q15_2_4', 'f101_other_q15_2_5', 'f101_other_q15_2_6', 'f101_other_q16_2', 'f101_other_q23_1', 'f101_anthropersn_q23_2', 'f101_other_q26_2', 'f102_other_q4_2', 'f102_clinic_q7_1', 'f102_hosp_q7_2', 'f102_other_q7_3', 'f003_birthmonth_q6_6', 'f003_other_q15_1', 'f003_comment_q17_1', 'f003_comment_q18_1', 'f201_other_q5_2', 'f201_specify_q6_1_1', 'f201_other_q6_2_1', 'f201_other_q6_2_2', 'f201_other_q6_2_3', 'f201_other_q6_2_4', 'f201_other_q6_2_5', 'f201_other_q7_2', 'f201_other_q8_2', 'f201_other_q12_1', 'f201_other_q14_2', 'f202_expln_q9_1', 'f202_expln_q15_1', 'f202_other_q18_1', 'f202_anthropersn_q18_2', 'f203_explain_q4_1', 'f203_explain_q5_1', 'f204_sptreatmt_q5_1_4', 'f204_sptreatmt_q5_2_4', 'f204_sptreatmt_q5_3_4', 'f204_sptreatmt_q5_4_4', 'f204_sptreatmt_q5_5_4', 'f204_sptreatmt_q5_6_4', 'f204_sptreatmt_q5_7_4', 'f204_specify_q5_8a', 'f204_sptreatmt_q5

### remove "nan" that's at the start from the list of strings

In [134]:
variables_of_interest = variables_of_interest[1:]
print(variables_of_interest)

['f101_other_q14_1a', 'f101_infform_q15_1_1', 'f101_other_q15_2_1', 'f101_other_q15_2_2', 'f101_other_q15_2_3', 'f101_other_q15_2_4', 'f101_other_q15_2_5', 'f101_other_q15_2_6', 'f101_other_q16_2', 'f101_other_q23_1', 'f101_anthropersn_q23_2', 'f101_other_q26_2', 'f102_other_q4_2', 'f102_clinic_q7_1', 'f102_hosp_q7_2', 'f102_other_q7_3', 'f003_birthmonth_q6_6', 'f003_other_q15_1', 'f003_comment_q17_1', 'f003_comment_q18_1', 'f201_other_q5_2', 'f201_specify_q6_1_1', 'f201_other_q6_2_1', 'f201_other_q6_2_2', 'f201_other_q6_2_3', 'f201_other_q6_2_4', 'f201_other_q6_2_5', 'f201_other_q7_2', 'f201_other_q8_2', 'f201_other_q12_1', 'f201_other_q14_2', 'f202_expln_q9_1', 'f202_expln_q15_1', 'f202_other_q18_1', 'f202_anthropersn_q18_2', 'f203_explain_q4_1', 'f203_explain_q5_1', 'f204_sptreatmt_q5_1_4', 'f204_sptreatmt_q5_2_4', 'f204_sptreatmt_q5_3_4', 'f204_sptreatmt_q5_4_4', 'f204_sptreatmt_q5_5_4', 'f204_sptreatmt_q5_6_4', 'f204_sptreatmt_q5_7_4', 'f204_specify_q5_8a', 'f204_sptreatmt_q5_8_4'

### Running a loop function that performs google translate on each column of the dataframe that contains "free" text responses.

In [136]:
df_english["f101_infform_q15_1_1"].unique()

array(['', '30 ml right after birth', 'NA', 'første 2 dage', 'NNN', 'MME',
       'Formula', 'Amount: as a medicin glas',
       '20 ml + 40 ml on two seperate day within the first week', '15 ml',
       '6 hours after birth', '1. og 2. dagen', 'first 4 days',
       'Første par dage', 'Gulsot', 'First 3 days', '10 ml',
       '<10 ml day 3 after birth', 'First days at hospital', '999',
       'infant went to neonatal unit due to amniotic fluid in the lungs. received breast milk right after birth, but had formula the first hours afterwards on neonatal unit',
       'very few ml after birth'], dtype=object)

In [151]:
for variable in variables_of_interest:
    from googletrans import Translator
    translator = Translator()
    if len(df_english[variable].unique().tolist())<2:
        print("skipped:", variable)
        continue
    print("translated:", variable)
    df_english[variable] = df_english[variable].apply(translator.translate, src='auto', dest='en').apply(getattr, args=('text',)) 
    print(df_english[variable].unique())

skipped: f101_other_q14_1a
translated: f101_infform_q15_1_1
['' '30 ml right after birth' 'THAT' 'first 2 days' 'NNN' 'MME' 'Formula'
 'Amount: as a medicine glass'
 '20 ml + 40 ml on two separate day within the first week' '15 ml'
 '6 hours after birth' '1st and 2nd day' 'first 4 days' 'First few days'
 'Jaundice' 'First 3 days' '10 ml' '<10 ml day 3 after birth'
 'First days at hospital' '999'
 'infant went to neonatal unit due to amniotic fluid in the lungs. received breast milk right after birth, but had formula the first hours afterwards on neonatal unit'
 'very few ml after birth']
translated: f101_other_q15_2_1
['' 'sugar water' '3 drops sugar water' 'Donor milk' 'water' 'donormælk']
skipped: f101_other_q15_2_2
skipped: f101_other_q15_2_3
skipped: f101_other_q15_2_4
skipped: f101_other_q15_2_5
skipped: f101_other_q15_2_6
translated: f101_other_q16_2
['' 'Anterior tongue tie' 'arises tight tongue tie - sucking technique'
 'tongue tie (cut on 4th of july)'
 "At first lethargic, di

ProtocolError: Invalid input ConnectionInputs.RECV_PING in state ConnectionState.CLOSED

### we hit a character translation limit at "f203_explain_q4_1" so picking up from there. 

In [182]:
variables_of_interest.index("f203_explain_q4_1")
variables_of_interest_part2 = variables_of_interest[35:]
print(variables_of_interest_part2)

['f203_explain_q4_1', 'f203_explain_q5_1', 'f204_sptreatmt_q5_1_4', 'f204_sptreatmt_q5_2_4', 'f204_sptreatmt_q5_3_4', 'f204_sptreatmt_q5_4_4', 'f204_sptreatmt_q5_5_4', 'f204_sptreatmt_q5_6_4', 'f204_sptreatmt_q5_7_4', 'f204_specify_q5_8a', 'f204_sptreatmt_q5_8_4', 'f204_sptreatmt_q6_1_1d', 'f204_sptreatmt_q6_1_2d', 'f204_sptreatmt_q6_1_3d', 'f204_sptreatmt_q6_1_4d', 'f204_sptreatmt_q6_2_1d', 'f204_sptreatmt_q6_2_2d', 'f204_sptreatmt_q6_2_3d', 'f204_sptreatmt_q6_2_4d', 'f204_sptreatmt_q6_3_1d', 'f204_sptreatmt_q6_3_2d', 'f204_sptreatmt_q6_3_3d', 'f204_sptreatmt_q6_3_4d', 'f204_sptreatmt_q6_4_1d', 'f204_sptreatmt_q6_4_2d', 'f204_sptreatmt_q6_4_3d', 'f204_sptreatmt_q6_4_4d', 'f204_sptreatmt_q6_5_1d', 'f204_sptreatmt_q6_5_2d', 'f204_sptreatmt_q6_5_3d', 'f204_sptreatmt_q6_5_4d', 'f204_sptreatmt_q6_6_1d', 'f204_sptreatmt_q6_6_2d', 'f204_sptreatmt_q6_6_3d', 'f204_sptreatmt_q6_6_4d', 'f204_sptreatmt_q6_7_1d', 'f204_sptreatmt_q6_7_2d', 'f204_sptreatmt_q6_7_3d', 'f204_sptreatmt_q6_7_4d', 'f204_o

In [183]:
for variable in variables_of_interest_part2:
    from googletrans import Translator
    translator = Translator()
    if len(df_english[variable].unique().tolist())<2:
        print("skipped:", variable)
        continue
    print("translated:", variable)
    df_english[variable] = df_english[variable].apply(translator.translate, src='auto', dest='en').apply(getattr, args=('text',)) 
    print(df_english[variable].unique())

translated: f203_explain_q4_1
['' 'Missing data' 'no data']
translated: f203_explain_q5_1
['' 'Missing data' 'No data' 'no data' 'missing data']
skipped: f204_sptreatmt_q5_1_4
skipped: f204_sptreatmt_q5_2_4
translated: f204_sptreatmt_q5_3_4
['' 'panodil']
skipped: f204_sptreatmt_q5_4_4
skipped: f204_sptreatmt_q5_5_4
translated: f204_sptreatmt_q5_6_4
['' 'panodil']
skipped: f204_sptreatmt_q5_7_4
translated: f204_specify_q5_8a
['' 'fungus in diaper area' 'sponge' 'Swollen testicle, right side'
 'Stuffy nose' 'thresh' 'A little stomach ache' 'eye infection perhaps'
 'Gas' 'eye infection' 'Stomach ache' 'eye inflammation'
 'Eye inflammation']
translated: f204_sptreatmt_q5_8_4
['' 'mycostatin (sweet liquid 4x daily' 'mycostatin']
skipped: f204_sptreatmt_q6_1_1d
skipped: f204_sptreatmt_q6_1_2d
skipped: f204_sptreatmt_q6_1_3d
skipped: f204_sptreatmt_q6_1_4d
skipped: f204_sptreatmt_q6_2_1d
skipped: f204_sptreatmt_q6_2_2d
skipped: f204_sptreatmt_q6_2_3d
skipped: f204_sptreatmt_q6_2_4d
skipped: 

['' 'Participates in another study with blood sampling'
 'The minipuberty cohort' 'Child had vaccine same day' 'THAT']
skipped: f209_clinic_q7_1
skipped: f209_hosp_q7_2
skipped: f209_other_q7_3
translated: f209_other_q9_2


ReadTimeout: The read operation timed out

### we hit a character translation limit at "f209_other_q9_2" so picking up from there. 

In [184]:
variables_of_interest.index("f209_other_q9_2")

135

In [185]:
variables_of_interest_part3 = variables_of_interest[135:]
print(variables_of_interest_part3)

['f209_other_q9_2', 'f211_other_q4_2', 'f211_clinic_q7_1', 'f211_hosp_q7_2', 'f211_other_q7_3', 'f214_other_q4_2', 'f214_clinic_q7_1', 'f214_hosp_q7_2', 'f214_other_q7_3', 'f215_other_q4_2', 'f215_clinic_q7_1', 'f215_hosp_q7_2', 'f215_other_q7_3', 'f301_other_q5_2', 'f301_other_q6_2', 'f301_other_q7_2', 'f302_expln_q6_1', 'f302_expln_q12_1', 'f302_other_q15_1', 'f302_anthropersn_q15_2', 'f303_explain_q5_1', 'f303_explain_q6_1', 'f303_explain_q7_1', 'f303_explain_q8_1', 'f304_sptreatmt_q5_1_4', 'f304_sptreatmt_q5_2_4', 'f304_sptreatmt_q5_3_4', 'f304_sptreatmt_q5_4_4', 'f304_sptreatmt_q5_5_4', 'f304_sptreatmt_q5_6_4', 'f304_sptreatmt_q5_7_4', 'f304_specify_q5_8a', 'f304_sptreatmt_q5_8_4', 'f304_sptreatmt_q6_1_1d', 'f304_sptreatmt_q6_1_2d', 'f304_sptreatmt_q6_1_3d', 'f304_sptreatmt_q6_1_4d', 'f304_sptreatmt_q6_2_1d', 'f304_sptreatmt_q6_2_2d', 'f304_sptreatmt_q6_2_3d', 'f304_sptreatmt_q6_2_4d', 'f304_sptreatmt_q6_3_1d', 'f304_sptreatmt_q6_3_2d', 'f304_sptreatmt_q6_3_3d', 'f304_sptreatmt_q6

In [187]:
for variable in variables_of_interest_part3:
    from googletrans import Translator
    translator = Translator()
    if len(df_english[variable].unique().tolist())<2:
        print("skipped:", variable)
        continue
    print("translated:", variable)
    df_english[variable] = df_english[variable].apply(translator.translate, src='auto', dest='en').apply(getattr, args=('text',)) 
    print(df_english[variable].unique())

translated: f209_other_q9_2
['' "Mother's milk"]
translated: f211_other_q4_2
['' 'not enough feces']
skipped: f211_clinic_q7_1
skipped: f211_hosp_q7_2
skipped: f211_other_q7_3
translated: f214_other_q4_2
['' 'Fecal contamination of sample' 'No reason given' 'feces on sample'
 'contamination with feces' 'contaminated with feces'
 'Faeces on cotton balls']
skipped: f214_clinic_q7_1
skipped: f214_hosp_q7_2
skipped: f214_other_q7_3
skipped: f215_other_q4_2
skipped: f215_clinic_q7_1
skipped: f215_hosp_q7_2
skipped: f215_other_q7_3
translated: f301_other_q5_2
['' 'Child was sick for 1 month and then not so interested']
translated: f301_other_q6_2
['' 'Low production of milk between 6 pm and 8 pm'
 'Child deny breastfeeding several times a day'
 'The mother had doubts about whether there was enough. The child has become very upset during breastfeeding. Breastfeeds are very short. But the problems are abating.'
 "Baby doesn't always want the breast."
 'Willing, not complaining. Every day a lit

['' '6 drops/daily' '5 drops' '7 drops' '5' '6 drops' '25' '0.25 g']
skipped: f304_other_q10_6a
skipped: f304_brndty_q10_6_1a
skipped: f304_dosesz_q10_6_2a
translated: f304_spother_q11_9a
['' 'Hepatitis B'
 'rotarix (per oral, 2 vaccines; d. 02-01-2019 and 06-02-2019)'
 'Bacille-Calmette-Guérin, BCG (tuberculosis)'
 'Rotorix Wed 9 Wed 17-06-2019+14 Wed 15-7-2019']
translated: f304_treatment1_q13_3
['' 'no treatment' 'No treatment, was in contact with a doctor' 'no'
 'pain reliever' 'Dicillin (2 tablets x 3/day' 'No treatment']
translated: f304_treatment2_q13_6
['' '12-07-2018']
skipped: f304_treatment3_q13_9
skipped: f304_treatment4_q13_12
skipped: f304_sptreatmt_q15_1_4
skipped: f304_sptreatmt_q15_2_4
skipped: f304_sptreatmt_q15_3_4
skipped: f304_sptreatmt_q15_4_4
skipped: f304_sptreatmt_q15_5_4
skipped: f304_sptreatmt_q15_6_4
translated: f304_other_q15_7a
['' 'allergy' 'Influenza' 'sore throat' 'Headache']
translated: f304_sptreatmt_q15_7_4
['' 'nasal spray']
translated: f304_lstcont

['' 'Lactulose' 'Creme' 'Brentan' 'Penicillin' 'creme']
skipped: f404_sptreatmt_q6_1_1d
skipped: f404_sptreatmt_q6_1_2d
skipped: f404_sptreatmt_q6_1_3d
skipped: f404_sptreatmt_q6_1_4d
translated: f404_sptreatmt_q6_2_1d
['' 'cliche']
skipped: f404_sptreatmt_q6_2_2d
skipped: f404_sptreatmt_q6_2_3d
skipped: f404_sptreatmt_q6_2_4d
translated: f404_sptreatmt_q6_3_1d
['' 'Antibiotic, cough mixture']
skipped: f404_sptreatmt_q6_3_2d
skipped: f404_sptreatmt_q6_3_3d
skipped: f404_sptreatmt_q6_3_4d
skipped: f404_sptreatmt_q6_4_1d
skipped: f404_sptreatmt_q6_4_2d
skipped: f404_sptreatmt_q6_4_3d
skipped: f404_sptreatmt_q6_4_4d
translated: f404_sptreatmt_q6_5_1d
['' 'Panodil' 'panodil']
skipped: f404_sptreatmt_q6_5_2d
skipped: f404_sptreatmt_q6_5_3d
skipped: f404_sptreatmt_q6_5_4d
translated: f404_sptreatmt_q6_6_1d
['' 'panodil']
skipped: f404_sptreatmt_q6_6_2d
skipped: f404_sptreatmt_q6_6_3d
skipped: f404_sptreatmt_q6_6_4d
translated: f404_sptreatmt_q6_7_1d
['' 'panodil']
skipped: f404_sptreatmt_q6_

ConnectError: [Errno 54] Connection reset by peer

### we hit a character translation limit at "f404_other_q9_6a" so picking up from there.

In [188]:
variables_of_interest.index("f404_other_q9_6a")

358

In [189]:
variables_of_interest_part4 = variables_of_interest[358:]
print(variables_of_interest_part4)

['f404_other_q9_6a', 'f404_spname_q9_6_6', 'f404_spdose_q9_6_8', 'f404_other_q9_8a', 'f404_spname_q9_8_6', 'f404_spdose_q9_8_8', 'f404_brndty_q10_2_1a', 'f404_dosesz_q10_2_2a', 'f404_brndty_q10_3_1a', 'f404_dosesz_q10_3_2a', 'f404_brndty_q10_4_1a', 'f404_dosesz_q10_4_2a', 'f404_brndty_q10_5_1a', 'f404_dosesz_q10_5_2a', 'f404_other_q10_6a', 'f404_brndty_q10_6_1a', 'f404_dosesz_q10_6_2a', 'f404_spother_q11_8a', 'f404_treatment1_q13_3', 'f404_treatment2_q13_6', 'f404_treatment3_q13_9', 'f404_treatment4_q13_12', 'f404_sptreatmt_q15_1_4', 'f404_sptreatmt_q15_2_4', 'f404_sptreatmt_q15_3_4', 'f404_sptreatmt_q15_4_4', 'f404_sptreatmt_q15_5_4', 'f404_sptreatmt_q15_6_4', 'f404_other_q15_7a', 'f404_sptreatmt_q15_7_4', 'f404_lstcontra_q16_1', 'f404_other_q17_2', 'f406_bflgth_q7_1', 'f406_agemnths_q11_2_1a', 'f406_agemnths_q11_5_1a', 'f406_other_q11_7_1', 'f406_agemnths_q11_7_2a', 'f406_agemnths_q12_1_1a', 'f406_agemnths_q12_1_2a', 'f406_agemnths_q12_1_3a', 'f406_agemnths_q12_1_4a', 'f406_agemnths_

In [210]:
for variable in variables_of_interest_part4:
    from googletrans import Translator
    translator = Translator()
    if len(df_english[variable].unique().tolist())<2:
        print("skipped:", variable)
        continue
    print("translated:", variable)
    df_english[variable] = df_english[variable].apply(translator.translate, src='auto', dest='en').apply(getattr, args=('text',)) 
    print(df_english[variable].unique())

translated: f404_other_q9_6a
['' 'Fucidin creme']
translated: f404_spname_q9_6_6
['' 'Fucidin']
translated: f404_spdose_q9_6_8
['' '5 times a week 1/4 pea']
skipped: f404_other_q9_8a
skipped: f404_spname_q9_8_6
skipped: f404_spdose_q9_8_8
skipped: f404_brndty_q10_2_1a
skipped: f404_dosesz_q10_2_2a
skipped: f404_brndty_q10_3_1a
skipped: f404_dosesz_q10_3_2a
translated: f404_brndty_q10_4_1a
['' 'He supported' 'he supported' 'Producer: scan-medic Skanderborg'
 'Lactocare' 'Multiform' 'Livol' 'Approved'
 'D-Vitamin Dråber S.A. (produced by scan-medic Skanderborg)' 'Biogaia'
 'Vigantolettes' 'biogaya' 'duolac' 'livol'
 'Semper D drops with lactic acid' 'lactocare' 'Apovit + Livol' 'Duolack'
 'Durolac']
translated: f404_dosesz_q10_4_2a
['' '5 drops' '5 drops/day' '6 drops' '5 drops daily' '5' '500 I.E.'
 '5-6 drops' '5.5' '5 drops a day' '5 drop']
translated: f404_brndty_q10_5_1a
['' 'Biogaia' 'biogaya' 'duolac' 'Semper D drops' 'lactocare' 'Lactocare'
 'Duolack' 'Durolac']
translated: f404_

ReadTimeout: The read operation timed out

### confirm some of the translations

In [221]:
print(df_danish["f101_infform_q15_1_1"].unique())
print(df_english["f101_infform_q15_1_1"].unique())

print(df_danish["f204_other_q6_8a"].unique())
print(df_english["f204_other_q6_8a"].unique())

print(df_danish["f301_other_q6_2"].unique())
print(df_english["f301_other_q6_2"].unique())

print(df_danish["f409_other_4_2"].unique())
print(df_english["f409_other_4_2"].unique())

['' '30 ml right after birth' 'NA' 'første 2 dage' 'NNN' 'MME' 'Formula'
 'Amount: as a medicin glas'
 '20 ml + 40 ml on two seperate day within the first week' '15 ml'
 '6 hours after birth' '1. og 2. dagen' 'first 4 days' 'Første par dage'
 'Gulsot' 'First 3 days' '10 ml' '<10 ml day 3 after birth'
 'First days at hospital' '999'
 'infant went to neonatal unit due to amniotic fluid in the lungs. received breast milk right after birth, but had formula the first hours afterwards on neonatal unit'
 'very few ml after birth']
['' '30 ml right after birth' 'THAT' 'first 2 days' 'NNN' 'MME' 'Formula'
 'Amount: as a medicine glass'
 '20 ml + 40 ml on two separate day within the first week' '15 ml'
 '6 hours after birth' '1st and 2nd day' 'first 4 days' 'First few days'
 'Jaundice' 'First 3 days' '10 ml' '<10 ml day 3 after birth'
 'First days at hospital' '999'
 'infant went to neonatal unit due to amniotic fluid in the lungs. received breast milk right after birth, but had formula the firs

### write the translated dataframe to file

In [222]:
df_english.to_csv(path_or_buf="translated_Denmark_REDCap_metadata.csv", sep=",", header=True)