In [1]:
import sklearn
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import json
from itertools import combinations

In [51]:
df = pd.read_json('data/dk_asylum_data.json', encoding='utf-8')
df.head()

Unnamed: 0,id,asylum_motive,country,text,hasText
0,0,[Bortfald],Irak,Nævnet omgjorde i marts 2022 Udlænding...,True
1,1,"[Konsekvensstatus, Sur Place, Politiske forh...",Tyrkiet,Nævnet meddelte i marts 2022 opholdsti...,True
2,2,"[Sur Place, Inddragelse/forlængelse, Generel...",Syrien,Nævnet omgjorde i marts 2022 Udlænding...,True
3,3,"[Udsendelse (§ 49 a m.v.), Øvrige modsætnings...",Tchad,Nævnet stadfæstede i marts 2022 Udlænd...,True
4,4,"[Sur Place, Politiske forhold]",Iran,Nævnet meddelte i marts 2022 opholdsti...,True


The following list of asylum motives is extracted from the search page of the data set at https://fln.dk/da/Praksis. There might be additional motives that were added at a later point.

In [52]:
motives = ["Agents of Persecution",
"Anden kønsrelateret forfølgelse",
"Asylsagsproceduren",
"Chikane",
"Etniske forhold",
"Familier med børn",
"Familierelationer og ægteskabslignende forhold",
"Generelle forhold",
"Helbredsmæssige forhold",
"Kriminelle forhold",
"Kønsrelateret forfølgelse",
"LGBT",
"Militære forhold",
"Modtageforhold",
"Nationalitet",
"Overgreb",
"Politiske forhold",
"Privatretlig forhold",
"Privatretlig konflikt",
"Religiøse forhold",
"Seksuelle forhold",
"Tilbageholdelse",
"Tilknyt bistand oprørsgruppe",
"Tortur",
"Udrejseforhold",
"Uforholdsmæssig straf",
"Ægteskabelige forhold",
"Ændrede forhold",
"Øvrige modsætningsforhold til myndighederne"
]

gender_motives = ["Anden kønsrelateret forfølgelse",
                  "Kønsrelateret forfølgelse",
                  "LGBT",
                  "Seksuelle forhold",
                  "Ægteskabelige forhold"
                    ]

Extracts the asylum motives from the column "asylum_motives" and turns the list into columns, one column for each asylum motive

In [53]:
df_motives = pd.DataFrame(columns=gender_motives)

def mark_motive(text, str_id):
    new_list = [item.strip() for item in text]
    for item in new_list:
        if item in gender_motives:
            df_motives.loc[str_id, item] = 1.0
    df_motives.loc[str_id, 'text'] = df.loc[str_id, 'text']


for index, row in df.iterrows():
    mark_motive(row['asylum_motive'], index)


print("Shape of the df with motives as columns", df_motives.shape) 

Shape of the df with motives as columns (9301, 6)


In [54]:
def get_number_of_cases_with2_gender_asyl_motives():
    two_motives = df_motives[gender_motives].copy()
    two_motives['count'] = two_motives.sum(axis=1)    
    two_motives =two_motives[two_motives['count'] > 1.0]
    return len(two_motives)

print("Number of cases with 2 gender asylum motives:", get_number_of_cases_with2_gender_asyl_motives())

Number of cases with 2 gender asylum motives: 164


# Topics Analysis

In [55]:
def get_num_word(df, word):
    df[word] = np.where(df.text.str.contains(word), 1, 0)
    return df[word].sum()

In [56]:
col_rename = {"Kønsrelateret forfølgelse": "Gender-related persecution",
             "LGBT": "LGBT",
             "Seksuelle forhold": "Sexual conditions",
             "Ægteskabelige forhold": "Marital conditions",
             "Anden kønsrelateret forfølgelse": "Other gender-related persecution",
                   }
words = [
    'tvangsgift',
    'homoseks',
    'lesbisk',
    'prostitution',
    'voldt',
    'enlig',
    'transkøn',
    'abort',
    'sterilis',
    'biseks',
    'transvest',
    'menneskehandel',
    'traffick',
    'omskåret',
    'omskæring'
]

grouping = {
    'Homosexuality': ['homoseks', 'lesbisk'],
    'Human trafficking': ['menneskehandel', 'traffick'],
    'Female genital mutilation': ['omskåret', 'omskæring'],
    'Forced marriage': ['tvangsgift'],
    'Forced prostitution': ['prostitution'],
    'Rape': ['voldt'],
    'Transgenderism': ['transkøn'],
    'Forced abortion': ['abort'],
    'Forced sterilisation': ['sterilis'],
    'Bisexuality': ['biseks'],
    'Transvestism': ['transvest']
}


In [59]:
def get_count_per_word(word, cat):
    df_motives[word] = np.where(df_motives.text.str.contains(word), 1, 0)

def group_topic(cat, word_list):
    df_count = df_motives[df_motives[cat] == 1.0][word_list]
    return df_count.any(axis=1).sum()

def make_analysis_table():
    index_topics = grouping.keys()
    df_res = pd.DataFrame(columns=gender_motives, index=index_topics)
    for cat in gender_motives:
        for word in words:
            get_count_per_word(word, cat)
        for topic, word_list in grouping.items():
            df_res.at[topic, cat] = group_topic(cat, word_list)
    df_res.rename(columns=col_rename, inplace=True)
    return df_res

df_topic_count = make_analysis_table()
df_topic_count

Unnamed: 0,Other gender-related persecution,Gender-related persecution,LGBT,Sexual conditions,Marital conditions
Homosexuality,2,29,192,46,6
Human trafficking,21,17,3,7,2
Female genital mutilation,116,23,0,12,31
Forced marriage,26,22,1,26,75
Forced prostitution,8,14,4,8,2
Rape,35,51,26,99,54
Transgenderism,0,0,5,0,1
Forced abortion,3,3,2,15,19
Forced sterilisation,2,3,0,1,0
Bisexuality,0,1,34,6,0


In [66]:
def make_topic_modelling_table():
    cols = df_topic_count.columns
    df_res_relative = pd.DataFrame(columns=cols, index=grouping.keys())
    df_res_relative[cols] = df_topic_count[cols].div(df_topic_count[cols].sum(axis=0), axis=1)
    df_res_relative['Total'] = df_topic_count[cols].sum(axis = 1)
    df_res_relative.loc['Total'] = df_topic_count.sum()
    return df_res_relative

df_topic_model = make_topic_modelling_table()

print("Topic analysis table with percentages:")
df_topic_model

Topic analysis table with percentages:


Unnamed: 0,Other gender-related persecution,Gender-related persecution,LGBT,Sexual conditions,Marital conditions,Total
Homosexuality,0.00939,0.177914,0.719101,0.209091,0.031579,275.0
Human trafficking,0.098592,0.104294,0.011236,0.031818,0.010526,50.0
Female genital mutilation,0.544601,0.141104,0.0,0.054545,0.163158,182.0
Forced marriage,0.122066,0.134969,0.003745,0.118182,0.394737,150.0
Forced prostitution,0.037559,0.08589,0.014981,0.036364,0.010526,36.0
Rape,0.164319,0.312883,0.097378,0.45,0.284211,265.0
Transgenderism,0.0,0.0,0.018727,0.0,0.005263,6.0
Forced abortion,0.014085,0.018405,0.007491,0.068182,0.1,42.0
Forced sterilisation,0.00939,0.018405,0.0,0.004545,0.0,6.0
Bisexuality,0.0,0.006135,0.127341,0.027273,0.0,41.0


In [68]:
def save_topics_table():
    #Round to two digits to print nicely
    df_save = df_topic_model.copy()
    df_save = df_save.astype(float)
    df_save.to_csv('data/topic_modelling.csv')

save_topics_table()

# Co-occurrence Matrix

Create dataframe to make co-occurrence matrix from:

In [80]:
df_motives_all = pd.DataFrame(columns=motives)

def mark_motive(text, str_id):
    new_list = [item.strip() for item in text]
    for item in new_list:
        if item in motives:
            df_motives_all.loc[str_id, item] = 1.0
        else:
            print("Found new asylum motive, not in the list provided by this notebook:", item)


for index, row in df.iterrows():
    mark_motive(row['asylum_motive'], index)
    
print("Shape of df with all asylum motives:", df_motives_all.shape) 

Shape of df with all asylum motives: (8967, 29)


In [81]:
def get_co_occ_matrix():
    df_motives_all.T.dot(df_motives_all)

co_matrix = get_co_occ_matrix()
co_matrix.to_csv('data/co_matrix.csv')

Unnamed: 0,Agents of Persecution,Anden kønsrelateret forfølgelse,Asylsagsproceduren,Chikane,Etniske forhold,Familier med børn,Familierelationer og ægteskabslignende forhold,Generelle forhold,Helbredsmæssige forhold,Kriminelle forhold,...,Religiøse forhold,Seksuelle forhold,Tilbageholdelse,Tilknyt bistand oprørsgruppe,Tortur,Udrejseforhold,Uforholdsmæssig straf,Ægteskabelige forhold,Ændrede forhold,Øvrige modsætningsforhold til myndighederne
Agents of Persecution,,,,,,,,,,,...,,,,,,,,,,
Anden kønsrelateret forfølgelse,,,,,,,,,,,...,,,,,,,,,,
Asylsagsproceduren,,,,,,,,,,,...,,,,,,,,,,
Chikane,,,,,,,,,,,...,,,,,,,,,,
Etniske forhold,,,,,,,,,,,...,,,,,,,,,,
Familier med børn,,,,,,,,,,,...,,,,,,,,,,
Familierelationer og ægteskabslignende forhold,,,,,,,,,,,...,,,,,,,,,,
Generelle forhold,,,,,,,,,,,...,,,,,,,,,,
Helbredsmæssige forhold,,,,,,,,,,,...,,,,,,,,,,
Kriminelle forhold,,,,,,,,,,,...,,,,,,,,,,
