<a href="https://colab.research.google.com/github/Hubert26/suicides_IPPAN/blob/main/data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Liblaries and settings

In [71]:
import pandas as pd
import numpy as np

#Ustawienie braku maksymalnej ilości wyświetlanych kolumn
pd.options.display.max_columns = None

import os
import shutil
from google.colab import files
from google.colab import drive
#drive.mount('/content/drive')

np.random.seed(42)

#Function definitions

## display_info

In [72]:
def display_info(df, selected_columns):
    print(selected_columns)
    print(len(selected_columns))
    for col_name in selected_columns:

        # Mode
        print("\nMode:")
        print(df[col_name].mode())

        # NaN count
        nan_count = df[col_name].isna().sum()
        print("\nNaN count:", nan_count)

        # Unique values len
        print("\nUnique values len:")
        print(len(df[col_name].unique()))

        # Unique values
        print("\nUnique values:")
        print(df[col_name].unique())

        # Value counts
        print("\nValue counts:")
        print(df[col_name].value_counts())

        print("\n" + "=" * 40)  # Separator between columns


## create_bar_plot

In [73]:
def create_bar_plot(x_values, y_values, x_label, y_label, title, x_rotation=0):

    # Tworzenie wykresu słupkowego
    plt.bar(x_values, y_values)

    # Dodanie wartości nad słupkami
    for i in range(len(x_values)):
        plt.text(x_values[i], y_values[i], str(y_values[i]), ha='center', va='bottom')

    # Dodanie tytułu i etykiet osi
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Wyświetlenie wykresu
    plt.xticks(rotation=x_rotation, ha='right')
    plt.tight_layout()
    plt.show()


##create_double_bar_plot

In [74]:
def create_double_bar_plot(x_values, y_values1, y_values2, x_label, y_label, title, x_rotation=0):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))  # Tworzenie dwóch subplotów obok siebie

    # Utworzenie słupków dla obu serii danych
    bars1 = ax1.bar(x_values, y_values1, label='Series 1')
    bars2 = ax2.bar(x_values, y_values2, label='Series 2')

    # Dodanie wartości nad słupkami dla serii 1
    for bar in bars1:
        height = bar.get_height()
        ax1.annotate('{}'.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

    # Dodanie wartości nad słupkami dla serii 2
    for bar in bars2:
        height = bar.get_height()
        ax2.annotate('{}'.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

    # Dodanie tytułów i etykiet osi
    ax1.set_title(title)
    ax1.set_xlabel(x_label)
    ax1.set_ylabel(y_label)
    ax1.set_xticklabels(x_values, rotation=x_rotation, ha='right')

    ax2.set_xlabel(x_label)
    ax2.set_ylabel(y_label)
    ax2.set_xticklabels(x_values, rotation=x_rotation, ha='right')

    # Wyświetlenie wykresu
    plt.tight_layout()
    plt.show()

##get_non_selected_columns_for_age_group

In [75]:
def get_non_selected_columns_for_age_group(age_group, non_selected_columns_list):
    # Iteracja przez listę tupli non_selected_columns_list
    for tup in non_selected_columns_list:
        # Sprawdzenie, czy pierwszy element krotki jest równy age_group
        if tup[0] == age_group:
            # Jeśli tak, zwróć drugi element krotki (czyli listę kolumn)
            return tup[1]

    # Jeśli nie znaleziono pasującej krotki, zwróć pustą listę
    return []

#Data exploration

In [76]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/Hubert26/suicides_IPPAN/main/data/out_preped_suicides.csv', delimiter=',', low_memory=False, index_col=False, dtype={'DateY': str, 'DateM': str,})



In [77]:
df_raw.head()

Unnamed: 0,Income,Age1,Fatal,Method,DateM,Gender,Education,AbuseInfo,DateY,ID_samobójcy,WorkInfo,Substance,Age2,Date,Place,CountContext,Marital,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss
0,,,,,1,M,,,2013,11458155200,,,,1.2013,,0,,0,0,0,0,0,0,0,0,0,0
1,,13-18,1.0,Vehicle,1,M,,,2013,11458428400,Student,,00-18,1.2013,Road,0,,0,0,0,0,0,0,0,0,0,0
2,Dependent,13-18,1.0,Jumping,1,F,Primary,,2013,11458429300,Student,,00-18,1.2013,Other,0,Single,0,0,0,0,0,0,0,0,0,0
3,,19-24,1.0,Other,1,M,,,2013,11458431200,,,19-34,1.2013,,1,Single,0,0,0,0,1,0,0,0,0,0
4,,40-44,1.0,Hanging,1,M,,,2013,11458439900,Unemployed,Alco,35-64,1.2013,UtilitySpaces,0,Single,0,0,0,0,0,0,0,0,0,0


In [78]:
context_columns = [col for col in df_raw.columns if col.startswith('Context_')]

In [79]:
df_reasons_of_attack = df_raw[context_columns]

In [80]:
df_raw.drop(columns=context_columns, inplace=True)

In [81]:
df_reasons_of_attack.head()

Unnamed: 0,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


In [82]:
df_reasons_of_attack.shape

(128332, 10)

In [83]:
df_data = df_raw.copy()

In [84]:
df_data['CountContext'] = df_data['CountContext'].replace(0, np.nan)

##NaN exploration in variables

In [85]:
df_data.shape

(128332, 17)

In [86]:
df_data['Fatal'].sum()

59770.0

In [87]:
df_data[df_data['ID_samobójcy'].duplicated()].ID_samobójcy #Sprawdzenie duplikatów

118796    NaN
Name: ID_samobójcy, dtype: object

In [88]:
##Brakujące dane
total = df_data.isnull().sum().sort_values(ascending=False)
percent = 100*(df_data.isnull().sum()/df_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [89]:
missing_data

Unnamed: 0,Total,Percent
AbuseInfo,101644,79.20394
Education,87707,68.343827
Substance,65004,50.652994
WorkInfo,62970,49.068042
Income,55697,43.400711
CountContext,45667,35.585045
Marital,16237,12.652339
Age1,1296,1.009881
Age2,1296,1.009881
Place,107,0.083377


##NaN exploration in rows

In [90]:
nan_counts = df_data.isna().sum(axis=1).value_counts()

In [91]:
full_index = list(range(0, len(df_data.columns.tolist()) + 1))

# Zsumowanie istniejących wartości z pełną listą i ustawienie brakujących wartości na 0
nan_counts = nan_counts.reindex(full_index, fill_value=0)

# Sortowanie wyników rosnąco
nan_counts = nan_counts.sort_index()

In [92]:
nan_counts_proc = (nan_counts / len(df_data)) * 100

In [93]:
missing_data_rows = pd.concat([nan_counts, nan_counts_proc], axis=1, keys=['Total', 'Percent'])
missing_data_rows

Unnamed: 0,Total,Percent
0,4357,3.3951
1,18007,14.031574
2,21767,16.961475
3,22285,17.365115
4,24178,18.840196
5,19789,15.42016
6,12988,10.120625
7,4438,3.458218
8,180,0.140261
9,265,0.206496


In [94]:
# Usunięcie wierszy są NaN
selected_index = df_data[df_data['Gender'].isna() | df_data['Age1'].isna()].index
selected_index

Index([     0,     15,     19,     25,     26,     29,     35,     37,     42,
           57,
       ...
       112789, 112929, 117635, 122238, 123012, 123529, 125135, 125375, 128320,
       128331],
      dtype='int64', length=1296)

In [95]:
df_data[df_data['Age1'].isna()]

Unnamed: 0,Income,Age1,Fatal,Method,DateM,Gender,Education,AbuseInfo,DateY,ID_samobójcy,WorkInfo,Substance,Age2,Date,Place,CountContext,Marital
0,,,,,01,M,,,2013,11458155200,,,,1.2013,,,
15,,,,,01,M,,,2013,11458585800,,,,1.2013,,,
19,NoSteady,,1.0,Hanging,01,M,,,2013,11458623400,Employed,Sober,,1.2013,House,1.0,Cohabitant
25,Steady,,1.0,Hanging,01,M,Secondary,,2013,11458645800,Employed,,,1.2013,,1.0,Single
26,,,1.0,Vehicle,01,F,,,2013,11458649700,,,,1.2013,Railway,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123529,,,1.0,Jumping,08,M,,,2023,139313240,,,,8.2023,WaterRes,,
125135,,,1.0,Vehicle,10,M,,,2023,139345889,,,,10.2023,Railway,,
125375,,,1.0,Hanging,10,M,,,2023,139350851,,,,10.2023,Forest,,
128320,,,1.0,Hanging,12,M,,,2023,139400519,,,,12.2023,Forest,,


In [96]:
df_data[df_data['Gender'].isna()]

Unnamed: 0,Income,Age1,Fatal,Method,DateM,Gender,Education,AbuseInfo,DateY,ID_samobójcy,WorkInfo,Substance,Age2,Date,Place,CountContext,Marital
6393,,,1.0,Hanging,10.0,,,,2013.0,11492252400,,,,10.2013,Forest,,
48570,,,1.0,Hanging,11.0,,,,2017.0,12381541300,,,,11.2017,Other,,
69214,,,1.0,Hanging,8.0,,,,2019.0,12615124500,,,,8.2019,Forest,,
69694,,,1.0,Hanging,9.0,,,,2019.0,12619944600,,,,9.2019,Forest,,
78431,,,1.0,Hanging,6.0,,,,2020.0,12790279200,,,,6.202,Other,,
98145,,,1.0,Hanging,12.0,,,,2021.0,13373734400,,,,12.2021,Other,,
128331,,,,,,,,,,15133 - Liczba osób w zamachach samobójczych o...,,,,,,,


In [97]:
# Usunięcie wierszy są NaN
selected_index = df_data[df_data['Gender'].isna() | df_data['Age1'].isna()].index

df_data = df_data.drop(index=selected_index)
df_reasons_of_attack = df_reasons_of_attack.drop(index=selected_index)

In [98]:
df_data = df_data.dropna(subset=['Date'])

#Fill NaN in age groups

In [99]:
df_data["AbuseInfo"].fillna("Not", inplace=True)

In [100]:
age_groups = sorted(list(set(df_data['Age1'])))
columns = df_data.columns.to_list()

In [101]:
age_groups

['07-12',
 '13-18',
 '19-24',
 '25-29',
 '30-34',
 '35-39',
 '40-44',
 '45-49',
 '50-54',
 '55-59',
 '60-64',
 '65-69',
 '70-74',
 '75-79',
 '80-84',
 '85+']

In [102]:
accept_probability = (0,75)
cleaned_suicides_data = pd.DataFrame()

for age_group in age_groups:

  data_age_group_selected_index = df_data.loc[df_data['Age1'] == age_group].index
  data_age_group = df_data.loc[data_age_group_selected_index]
  data_age_group_female_index = data_age_group.loc[data_age_group['Gender'] == 'F'].index
  data_age_group_male_index = data_age_group.loc[data_age_group['Gender'] == 'M'].index

  earlier_age_group = age_groups[age_groups.index(age_group) - 1] if age_group != age_groups[0] else None
  later_age_group = age_groups[age_groups.index(age_group) + 1] if age_group != age_groups[-1] else None

  earlier_later_selected_index = df_data.loc[df_data['Age1'].isin([age_group] + [earlier_age_group] + [later_age_group])].index
  earlier_later_df = df_data.loc[earlier_later_selected_index]
  earlier_later_female_index = earlier_later_df.loc[earlier_later_df['Gender'] == 'F'].index
  earlier_later_male_index = earlier_later_df.loc[earlier_later_df['Gender'] == 'M'].index

  total = data_age_group.isnull().sum().sort_values(ascending=False)
  percent = 100*(data_age_group.isnull().sum()/data_age_group.isnull().count()).sort_values(ascending=False)
  missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
  selected_columns = list(set(columns).intersection(set(missing_data[(missing_data['Percent'] < accept_probability[1]) & (missing_data['Percent'] > accept_probability[0])].index)))
 # non_selected_columns = list(set(columns).intersection(set(missing_data[(missing_data['Percent'] >= accept_probability[1])].index)))

  for variable in df_data.columns.tolist():
    if variable in selected_columns:
      data_to_probability_selected_index = data_age_group_selected_index
      data_to_probability = data_age_group
      data_to_probability_female_index = data_age_group_female_index
      data_to_probability_male_index = data_age_group_male_index
      print("variable IS IN selected_columns")
    else:
      data_to_probability_selected_index = earlier_later_selected_index
      data_to_probability = earlier_later_df
      data_to_probability_female_index = earlier_later_female_index
      data_to_probability_male_index = earlier_later_male_index
      print("variable IS NOT IN selected_columns")
    if not data_to_probability_female_index.empty:
      value_counts_result = data_to_probability.loc[data_to_probability_female_index, variable].dropna().value_counts()
      if value_counts_result.empty:
        print("FEMALE")
        print("value_counts_result:", value_counts_result)
        print("variable:", variable)
        print("age_group:", age_group)
        print("earlier_age_group:", earlier_age_group)
        print("later_age_group:", later_age_group)
        #print("data_to_probability.loc[data_to_probability_female_index, variable]:\n", data_to_probability.loc[data_to_probability_female_index, variable])
      else:
        data_to_probability_female_serie_values = value_counts_result.values.tolist()
        data_to_probability_female_serie = value_counts_result.index.tolist()
        female_probabilities = [x / sum(data_to_probability_female_serie_values) for x in data_to_probability_female_serie_values]
        data_age_group_female_null_index = data_age_group_female_index[data_age_group.loc[data_age_group_female_index, variable].isnull()]

        if not data_age_group_female_null_index.empty:
          data_age_group.loc[data_age_group_female_null_index, variable] = np.random.choice(data_to_probability_female_serie, size=len(data_age_group_female_null_index.tolist()), p=female_probabilities)

    if not data_to_probability_male_index.empty:
      value_counts_result = data_to_probability.loc[data_to_probability_male_index, variable].dropna().value_counts()
      if value_counts_result.empty:
        print("MALE")
        print("value_counts_result:", value_counts_result)
        print("variable:", variable)
        print("age_group:", age_group)
        print("earlier_age_group:", earlier_age_group)
        print("later_age_group:", later_age_group)
      else:
        data_to_probability_male_serie_values = value_counts_result.values.tolist()
        data_to_probability_male_serie = value_counts_result.index.tolist()
        male_probabilities = [x / sum(data_to_probability_male_serie_values) for x in data_to_probability_male_serie_values]

        data_age_group_male_null_index = data_age_group_male_index[data_age_group.loc[data_age_group_male_index, variable].isnull()]
        if not data_age_group_male_null_index.empty:
          data_age_group.loc[data_age_group_male_null_index, variable] = np.random.choice(data_to_probability_male_serie, size=len(data_age_group_male_null_index.tolist()), p=male_probabilities)

    # data_age_group_reasons_of_attack
    if variable == 'CountContext':
      data_age_group_reasons_of_attack = df_reasons_of_attack.loc[data_age_group_selected_index]
      data_to_probability_reasons_of_attack = df_reasons_of_attack.loc[data_to_probability_selected_index]

      serie_values_reasons_of_attack = data_age_group_reasons_of_attack.columns.tolist()

      # FEMALE
      data_to_probability_female_values = data_to_probability_reasons_of_attack.loc[data_to_probability_female_index, serie_values_reasons_of_attack].sum().tolist()
      female_probabilities = [x / sum(data_to_probability_female_values) for x in data_to_probability_female_values]
      data_age_group_female_null_index = data_age_group_female_index[data_age_group.loc[data_age_group_female_index, variable].isnull()]

      # Lista do przechowywania wylosowanych wartości dla każdej wartości w zmiennej
      selected_values = []

      # Iteracja przez wartości w zmiennej
      for value in data_age_group.loc[data_age_group_female_null_index, variable]:
          # Losowanie wartości zgodnie z podanymi prawdopodobieństwami
          selected_value = np.random.choice(serie_values_reasons_of_attack, size=int(value), p=female_probabilities)
          selected_values.append(selected_value)
      # Iteracja przez indeksy i wartości w kolumnie reason_of_attack_count
      for idx, selected_value in zip(data_age_group_female_null_index, selected_values):
          # Iteracja przez wylosowane wartości
          for column_name in selected_value:
              # Ustawienie wartości 1 w odpowiednich kolumnach
              data_age_group_reasons_of_attack.loc[idx, column_name] = 1

      #MALE
      data_to_probability_male_values = data_to_probability_reasons_of_attack.loc[data_to_probability_male_index, serie_values_reasons_of_attack].sum().tolist()
      male_probabilities = [x / sum(data_to_probability_male_values) for x in data_to_probability_male_values]
      data_age_group_male_null_index = data_age_group_male_index[data_age_group.loc[data_age_group_male_index, variable].isnull()]

      # Lista do przechowywania wylosowanych wartości dla każdej wartości w zmiennej
      selected_values = []

      # Iteracja przez wartości w zmiennej
      for value in data_age_group.loc[data_age_group_male_null_index, variable]:
          # Losowanie wartości zgodnie z podanymi prawdopodobieństwami
          selected_value = np.random.choice(serie_values_reasons_of_attack, size=int(value), p=male_probabilities)
          selected_values.append(selected_value)
      # Iteracja przez indeksy i wartości w kolumnie reason_of_attack_count
      for idx, selected_value in zip(data_age_group_male_null_index, selected_values):
          # Iteracja przez wylosowane wartości
          for column_name in selected_value:
              # Ustawienie wartości 1 w odpowiednich kolumnach
              data_age_group_reasons_of_attack.loc[idx, column_name] = 1

  combined_df = pd.concat([data_age_group, data_age_group_reasons_of_attack], axis=1)
  cleaned_suicides_data = pd.concat([cleaned_suicides_data, combined_df], axis=0)
  # file_name = 'data_age_group_' + age_group + '.csv'
  # combined_df.to_csv(file_name, index=False)
  # files.download(file_name)

variable IS IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS IN selected_columns
variable IS IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS IN selected_columns
variable IS IN selected_columns
variable IS IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS NOT IN selected_columns
variable IS IN selected_columns
variable IS IN selected_colu

In [103]:
cleaned_suicides_data

Unnamed: 0,Income,Age1,Fatal,Method,DateM,Gender,Education,AbuseInfo,DateY,ID_samobójcy,WorkInfo,Substance,Age2,Date,Place,CountContext,Marital,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss
2707,Dependent,07-12,0.0,Self-harm,05,F,Primary,Not,2013,11477120400,Student,Alco,00-18,5.2013,Forest,1.0,Single,0,0,0,0,0,0,0,0,1,0
3918,Dependent,07-12,0.0,Other,06,F,Pre-primary,Not,2013,11481530100,Student,Sober,00-18,6.2013,Forest,1.0,Single,0,0,1,0,0,0,0,0,0,0
6711,Dependent,07-12,1.0,Hanging,10,F,Pre-primary,Not,2013,11493674400,Student,Sober,00-18,10.2013,House,1.0,Single,0,0,0,0,0,0,0,0,0,0
6871,Dependent,07-12,0.0,Other,10,M,Primary,Not,2013,11494487000,Student,Sober,00-18,10.2013,Other,1.0,Single,0,0,1,0,0,0,0,0,0,0
6949,Dependent,07-12,1.0,Hanging,10,F,Pre-primary,Not,2013,11494823000,Student,Sober,00-18,10.2013,House,1.0,Single,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127946,Benefits,85+,0.0,Drugs,12,M,Secondary,Not,2023,139395339,Unemployed,OtherSub,65+,12.2023,House,1.0,Widowed,0,0,1,0,0,0,0,0,0,0
127971,Benefits,85+,1.0,Hanging,12,M,Primary,Not,2023,139395490,Unemployed,Sober,65+,12.2023,Institution,1.0,Widowed,0,0,0,0,0,0,0,0,0,0
128056,Benefits,85+,1.0,Hanging,12,M,Vocational,Not,2023,139396228,Unemployed,Sober,65+,12.2023,House,1.0,Married,0,0,0,0,0,0,0,1,0,0
128131,Benefits,85+,1.0,Hanging,12,M,Vocational,Not,2023,139396639,Agriculturalist,Sober,65+,12.2023,House,1.0,Married,0,0,0,0,0,0,0,0,0,0


###Checking

['07-12',
 '13-18',
 '19-24',
 '25-29',
 '30-34',
 '35-39',
 '40-44',
 '45-49',
 '50-54',
 '55-59',
 '60-64',
 '65-69',
 '70-74',
 '75-79',
 '80-84',
 '85+']

sorted(list(set(df_data['Wiek'])))

['K', 'M']

[2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

sorted(list(set(df_data['Data_rok'])))

In [104]:
age_group = sorted(list(set(df_data['Age1'])))
gender = ['F', 'M']
year = sorted(list(set(df_data['DateY'])))

In [105]:
df_data_check_1 = df_data[df_data['Age1'].isin(age_group) & df_data['Gender'].isin(gender) & df_data['DateY'].isin(year)]

In [106]:
df_data_check_2 = cleaned_suicides_data[cleaned_suicides_data['Age1'].isin(age_group) & df_data['Gender'].isin(gender) & df_data['DateY'].isin(year)]

  df_data_check_2 = cleaned_suicides_data[cleaned_suicides_data['Age1'].isin(age_group) & df_data['Gender'].isin(gender) & df_data['DateY'].isin(year)]


In [107]:
df_data_check_1.shape

(127034, 17)

In [108]:
df_data_check_2.shape

(127034, 27)

In [109]:
df_data_check_1["Education"].value_counts()

Education
Secondary      12553
Primary        12347
Vocational     10467
Higher          3071
Pre-primary     1961
Name: count, dtype: int64

In [110]:
df_data_check_2["Education"].value_counts()

Education
Secondary      39456
Vocational     38603
Primary        33942
Higher         11664
Pre-primary     3369
Name: count, dtype: int64

In [111]:
##Brakujące dane
total = df_data_check_1.isnull().sum().sort_values(ascending=False)
percent = 100*(df_data_check_1.isnull().sum()/df_data_check_1.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [112]:
missing_data.head(5)

Unnamed: 0,Total,Percent
Education,86635,68.198278
Substance,64019,50.39517
WorkInfo,62091,48.877466
Income,54815,43.149865
CountContext,44990,35.415715


In [113]:
##Brakujące dane
total = df_data_check_2.isnull().sum().sort_values(ascending=False)
percent = 100*(df_data_check_2.isnull().sum()/df_data_check_2.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [114]:
missing_data.head(5)

Unnamed: 0,Total,Percent
Income,0,0.0
Place,0,0.0
Context_Other,0,0.0
Context_MentalHealth,0,0.0
Context_SchoolWork,0,0.0


#Zapis

In [115]:
  file_name = 'out_exploration_suicides.csv'
  cleaned_suicides_data.to_csv(file_name, index=False)
  files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>