* [Chapter 0 - Libraries and Imports](#chapter0)
* [Chapter 0.5 - Time-to-retraction with RWD](#chapter0.5)
* [Chapter 1 - Basic Time-to-Retraction Statistics](#chapter1)
* [Chapter 2 - Temporal trends](#chapter2)
* [Chapter 3 - Reasons for Retraction](#chapter3)
* [Chapter 4 - Geographical Differences](#chapter4)
* [Chapter 5 - Analysis per Journal](#chapter5)
* [Chapter 6 - Analysis per Scientific area](#chapter6) 

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 0 - Libraries and Imports <a class="anchor" id="chapter0"></a>

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the two input tables from ./retractions_data: the control set (parquet)
# and the processed retractions table (Excel).
control_set_import = pd.read_parquet('./retractions_data/control_set.parquet')
retractions_import = pd.read_excel('./retractions_data/processed_data_retractions.xlsx')

In [None]:
# Work on copies of the imported frames; presumably so the analysis can be restarted
# from here without re-reading the files.
control_set = control_set_import.copy()
retractions = retractions_import.copy()

In [None]:
# Column names, dtypes and non-null counts of the control set.
control_set.info()

In [None]:
# Column names, dtypes and non-null counts of the retractions table.
retractions.info()

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 0.5 - Time-to-retraction with RWD <a class="anchor" id="chapter0.5"></a>

In [None]:
# Load the Retraction Watch database export and preview its first rows.
rw = pd.read_excel('./retractions_data/retraction_watch_database.xlsx')
rw.head()

In [None]:
# Column names, dtypes and non-null counts of the Retraction Watch table.
rw.info()

In [None]:
# Convert both date columns to datetimes so they can be subtracted from each other.
# Note the asymmetry: unparseable RetractionDate values become NaT (errors='coerce'),
# whereas an unparseable OriginalPaperDate raises.
rw['RetractionDate'] = pd.to_datetime(rw['RetractionDate'], errors='coerce') #, infer_datetime_format=True
rw['OriginalPaperDate'] = pd.to_datetime(rw['OriginalPaperDate'])

In [None]:
# Distinct raw ArticleType strings (each may hold several ';'-separated types, see the split below).
rw['ArticleType'].unique()

In [None]:
# One entry per (record, article type): split the ';'-separated ArticleType field,
# put every type on its own row, trim whitespace and discard empty / ';' leftovers.
article_type = (
    rw[['Record ID', 'ArticleType']]
    .copy()['ArticleType']
    .str.split(';')
    .explode()
    .str.strip()
)
article_type = article_type[(article_type != "") & (article_type != ";")]
article_type

In [None]:
# Distinct individual article types after splitting.
article_type.unique()

In [None]:
# Article types to keep. The names are joined with '|' and used as a regular
# expression, so a record is kept when its ArticleType mentions at least one of them.
select_article_type = ['Research Article', 'Clinical Study', 'Commentary/Editorial',
       'Review Article', 'Meta-Analysis', 'Case Report',
       'Article in Press', 'Preprint',
       'Retraction Notice', 'Retracted Article',
       'Correction/Erratum/Corrigendum', 'Government Publication',
       'Supplementary Materials']
# na=False: records without an ArticleType are left out instead of yielding a NaN
#           mask, which pandas refuses to use for boolean indexing.
# .copy():  columns are assigned on filtered_rw in later cells; an independent frame
#           avoids writing to a slice of rw.
filtered_rw = rw[rw['ArticleType'].str.contains('|'.join(select_article_type), na=False)].copy()

In [None]:
# Inspect the records that passed the article-type filter.
filtered_rw

In [None]:
# The 50 journals with the most filtered records.
filtered_rw['Journal'].value_counts()[:50]

In [None]:
# Number of distinct journals; unique() counts a missing journal (NaN) as one value.
len(filtered_rw['Journal'].unique())

In [None]:
# One row per (record, subject): split the ';'-separated Subject field and put every
# subject on its own row.
# .copy() makes `subject` independent of filtered_rw, so the assignment below does not
# write to a slice of it.
subject = filtered_rw[['Record ID', 'Subject']].copy()
subject['Subject'] = subject['Subject'].str.split(';')
subject = subject.explode('Subject')
# Trim whitespace so the subjects can be matched exactly later on, then drop the
# empty pieces left by trailing ';' and the records with no subject at all.
subject['Subject'] = subject['Subject'].str.strip()
subject = subject[subject['Subject'].notna() & (subject['Subject'] != "")]
subject

In [None]:
# Flag every subject row as biomedical (1) or not (0).
# A subject counts as biomedical when it carries the '(HSC)' tag or is one of the
# listed '(BLS)' fields.
biomed_fields = ['(BLS) Genetics', '(BLS) Biology - Molecular', '(BLS) Biochemistry', '(BLS) Biology - Cancer',
                '(BLS) Microbiology', '(BLS) Toxicology', '(BLS) Neuroscience', '(BLS) Anatomy/Physiology']

# regex=False: match the literal text '(HSC)'; read as a regular expression the
#              parentheses would form a capture group instead of being matched.
# na=False:    a missing subject is "not biomedical"; without it the NaN result is
#              truthy inside np.where and the row would be flagged 1.
subject['Biomedicine'] = np.where(
    subject['Subject'].str.contains('(HSC)', regex=False, na=False)
    | subject['Subject'].str.strip().isin(biomed_fields),
    1,
    0,
)

In [None]:
# How many records have 0, 1, 2, ... biomedical subjects.
subject.groupby('Record ID')['Biomedicine'].sum().value_counts()

In [None]:
# Attach the number of biomedical subjects of each record to filtered_rw (left join on 'Record ID').
filtered_rw = pd.merge(
    filtered_rw,
    subject.groupby('Record ID')['Biomedicine'].sum(),
    on='Record ID',
    how='left',
)

In [None]:
# Collapse the per-record count into a 0/1 flag: 1 when at least one subject is biomedical.
# Records without any matched subject (NaN after the left join) become 0.
filtered_rw['Biomedicine'] = np.where(filtered_rw['Biomedicine'].ge(1), 1, 0)

In [None]:
# Inspect filtered_rw with the new Biomedicine flag.
filtered_rw

In [None]:
# Mean time to retraction in months over all filtered records
# (one month is taken as 30.4375 days = 365.25 / 12).
avg_ttr_total = (
    (filtered_rw['RetractionDate'] - filtered_rw['OriginalPaperDate'])
    / pd.Timedelta(days=30.4375)
).mean()
avg_ttr_total

In [None]:
# Mean time to retraction in months, biomedical records only.
avg_ttr_biomed = (
    (
        filtered_rw.loc[filtered_rw['Biomedicine'] == 1, 'RetractionDate']
        - filtered_rw.loc[filtered_rw['Biomedicine'] == 1, 'OriginalPaperDate']
    )
    / pd.Timedelta(days=30.4375)
).mean()
avg_ttr_biomed

In [None]:
# Mean time to retraction in months, non-biomedical records only.
avg_ttr_non_biomed = (
    (
        filtered_rw.loc[filtered_rw['Biomedicine'] == 0, 'RetractionDate']
        - filtered_rw.loc[filtered_rw['Biomedicine'] == 0, 'OriginalPaperDate']
    )
    / pd.Timedelta(days=30.4375)
).mean()
avg_ttr_non_biomed

In [None]:
# Row for the summary table: label followed by the total / biomed / non-biomed means.
avg_ttr = ["average time-to-retraction", avg_ttr_total, avg_ttr_biomed, avg_ttr_non_biomed]

In [None]:
# Median number of months until retraction (one month = 30.4375 days).
# Series.median() skips records with a missing date, as the mean above does;
# np.median() on the same Series returns NaN as soon as a single date is missing.
median_ttr_total = (
    (filtered_rw['RetractionDate'] - filtered_rw['OriginalPaperDate'])
    / pd.Timedelta(days=30.4375)
).median()
median_ttr_total

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the time to retraction in months.
((filtered_rw['RetractionDate']-filtered_rw['OriginalPaperDate']) / pd.Timedelta(days=30.4375)).describe()

In [None]:
# Median time to retraction in months, biomedical records only.
# Series.median() skips records with a missing date; np.median() would return NaN
# as soon as a single date is missing.
median_ttr_biomed = (
    (
        filtered_rw.loc[filtered_rw['Biomedicine'] == 1, 'RetractionDate']
        - filtered_rw.loc[filtered_rw['Biomedicine'] == 1, 'OriginalPaperDate']
    )
    / pd.Timedelta(days=30.4375)
).median()
median_ttr_biomed

In [None]:
# Median time to retraction in months, non-biomedical records only.
# Series.median() skips records with a missing date; np.median() would return NaN
# as soon as a single date is missing.
median_ttr_non_biomed = (
    (
        filtered_rw.loc[filtered_rw['Biomedicine'] == 0, 'RetractionDate']
        - filtered_rw.loc[filtered_rw['Biomedicine'] == 0, 'OriginalPaperDate']
    )
    / pd.Timedelta(days=30.4375)
).median()
median_ttr_non_biomed

In [None]:
# Row for the summary table: label followed by the total / biomed / non-biomed medians.
median_ttr = ["median time-to-retraction", median_ttr_total, median_ttr_biomed, median_ttr_non_biomed]

In [None]:
# Summary table: one row per statistic (mean, median), one column per group; values are in months.
ttr_stats = pd.DataFrame([avg_ttr, median_ttr], columns = ["statistic", "total", "biomed", "non-biomed"])
ttr_stats

In [None]:
# Take the first name of the ';'-separated Author field.
# NOTE(review): this treats the first listed author as the corresponding author — confirm.
filtered_rw['corresponding_author'] = filtered_rw['Author'].str.split(';').str[0]

In [None]:
# Time to retraction in months: date difference divided by an average month of 30.4375 days (365.25 / 12).
filtered_rw['time_to_retraction'] = ((filtered_rw['RetractionDate']-filtered_rw['OriginalPaperDate']) / pd.Timedelta(days=30.4375))

In [None]:
# Keep, for every first-listed author, the complete row of their earliest retraction.
# groupby().first() is not used here because it returns the first NON-NULL value of
# each column separately, so a missing field in the earliest row would silently be
# filled with the value of a later retraction.
retractions_only_first_time = (
    filtered_rw
    .sort_values(by='RetractionDate')                    # earliest first, NaT last
    .dropna(subset=['corresponding_author'])             # no author -> cannot be attributed
    .drop_duplicates(subset='corresponding_author', keep='first')
    .sort_values(by='corresponding_author', kind='stable')
    .reset_index(drop=True)
)
retractions_only_first_time.head()

In [None]:
# describe() of the time to retraction (months) for first retractions only:
# all records, biomedical records, non-biomedical records -- shown side by side.
ttr_total_stats = (
    retractions_only_first_time['time_to_retraction']
    .describe()
    .rename('ttr_total_stats')
)
ttr_biomed_stats = (
    retractions_only_first_time
    .loc[retractions_only_first_time['Biomedicine'] == 1, 'time_to_retraction']
    .describe()
    .rename('ttr_biomed_stats')
)
ttr_non_biomed_stats = (
    retractions_only_first_time
    .loc[retractions_only_first_time['Biomedicine'] == 0, 'time_to_retraction']
    .describe()
    .rename('ttr_non_biomed_stats')
)
ttr_stats = pd.concat(
    [ttr_total_stats, ttr_biomed_stats, ttr_non_biomed_stats],
    axis=1,
)
ttr_stats.round(1)

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 1 - Basic Time-to-Retraction Statistics <a class="anchor" id="chapter1"></a>

In [None]:
# Time to retraction in months: date difference divided by an average month of 30.4375 days (365.25 / 12).
# Assumes both date columns were already read as datetimes from the Excel file — TODO confirm.
retractions['time_to_retraction'] = ((retractions['RetractionDate']-retractions['OriginalPaperDate']) / pd.Timedelta(days=30.4375))

In [None]:
# Lookup table of research areas and their classification; create_biomed_variable reads its
# 'Área científica' and 'Classificação Isabel' columns.
classif_areas = pd.read_excel('./research_areas_wos_classified.xlsx')
classif_areas

In [None]:
def create_biomed_variable(df, id, classification=None):
    """Return a copy of ``df`` with a 0/1 ``Biomedicine`` column.

    A row is flagged 1 when at least one of its ';'-separated ``research_areas``
    is listed in the classification table with 'Classificação Isabel' >= 1.
    Rows with no research area, or with only unclassified areas, get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and 'research_areas'. It is not modified.
    id : str
        Name of the column identifying a paper (e.g. 'doi' or 'Record ID').
    classification : pandas.DataFrame, optional
        Lookup table with the columns 'Área científica' (area name) and
        'Classificação Isabel' (numeric class). Defaults to the notebook-level
        ``classif_areas`` table.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``Biomedicine`` column. An existing ``Biomedicine`` column
        is replaced, so the function can be re-run on its own output.
    """
    if classification is None:
        classification = classif_areas

    # Independent copy: the assignments below must not write to a slice of df.
    areas = df[[id, 'research_areas']].copy()
    areas['research_areas'] = areas['research_areas'].str.split(';')
    areas = areas.explode('research_areas')
    areas['research_areas'] = areas['research_areas'].str.strip()
    # Drop missing areas and the empty pieces left by trailing ';'.
    areas = areas[areas['research_areas'].notna() & (areas['research_areas'] != "")]

    lookup = classification[['Área científica', 'Classificação Isabel']]
    areas = areas.merge(lookup, how='left', left_on='research_areas', right_on='Área científica')
    # Unclassified areas have NaN here; NaN >= 1 is False, so they count as 0.
    areas['Biomedicine'] = np.where(areas['Classificação Isabel'] >= 1, 1, 0)

    biomed_hits = areas.groupby(id)['Biomedicine'].sum()

    # Remove a previous flag so the merge does not produce Biomedicine_x / Biomedicine_y.
    df = df.drop(columns='Biomedicine', errors='ignore')
    df = df.merge(biomed_hits, how='left', on=id)
    df['Biomedicine'] = np.where(df['Biomedicine'] >= 1, 1, 0)

    return df

In [None]:
# Add the 0/1 Biomedicine flag to the control set, using 'doi' as the paper identifier.
control_set = create_biomed_variable(control_set, 'doi')

In [None]:
# Add the 0/1 Biomedicine flag to the retractions, using 'Record ID' as the paper identifier.
retractions = create_biomed_variable(retractions, 'Record ID')

In [None]:
# Check the retractions table after adding the Biomedicine column.
retractions.info()

In [None]:
# Take the first name of the ';'-separated Author field.
# NOTE(review): this treats the first listed author as the corresponding author — confirm.
retractions['corresponding_author'] = retractions['Author'].str.split(';').str[0]

In [None]:
# nr_of_offenses: number of retraction records whose Author field mentions the
# row's first-listed author.
#  - regex=False: the name is matched literally; as a regular expression, characters
#    such as '.', '(' or '+' in a name would change the match or raise.
#  - na=False: records without authors never match.
#  - The count is computed once per distinct author and then mapped back, instead of
#    rescanning the whole Author column for every row.
# Rows without a first author get NaN.
# NOTE(review): this is still a substring match, so a short name can also match inside
# a longer one — confirm that is acceptable.
offenses_by_author = {
    name: retractions['Author'].str.contains(name, regex=False, na=False).sum()
    for name in retractions['corresponding_author'].dropna().unique()
}
retractions['nr_of_offenses'] = retractions['corresponding_author'].map(offenses_by_author)

In [None]:
def map_to_category(nr_of_offenses):
    """Bin a retraction count into the labels "1", "2-10" or ">10".

    Parameters
    ----------
    nr_of_offenses : int or float
        Number of retractions attributed to an author.

    Returns
    -------
    str or None
        "1" for exactly one, "2-10" for two to ten, ">10" for more than ten.
        ``None`` when the value fits none of the bins (NaN, zero, negative), so
        such rows are not silently reported as ">10".
    """
    if nr_of_offenses == 1:
        return "1"
    if 2 <= nr_of_offenses <= 10:
        return "2-10"
    if nr_of_offenses > 10:
        return ">10"
    # Every comparison above is False for NaN, so missing counts end up here too.
    return None

In [None]:
# Label every retraction with the offense bin of its first-listed author.
retractions['groups_authors'] = retractions['nr_of_offenses'].apply(map_to_category)

In [None]:
# Keep, for every first-listed author, the complete row of their earliest retraction.
# groupby().first() is not used here because it returns the first NON-NULL value of
# each column separately, so a missing field in the earliest row would silently be
# filled with the value of a later retraction.
retractions_only_first_time = (
    retractions
    .sort_values(by='RetractionDate')                    # earliest first, missing dates last
    .dropna(subset=['corresponding_author'])             # no author -> cannot be attributed
    .drop_duplicates(subset='corresponding_author', keep='first')
    .sort_values(by='corresponding_author', kind='stable')
    .reset_index(drop=True)
)
retractions_only_first_time.head()

In [None]:
# describe() of the time to retraction (months) over every retraction:
# all records, biomedical records, non-biomedical records -- shown side by side.
ttr_total_stats = (
    retractions['time_to_retraction']
    .describe()
    .rename('ttr_total_stats')
)
ttr_biomed_stats = (
    retractions
    .loc[retractions['Biomedicine'] == 1, 'time_to_retraction']
    .describe()
    .rename('ttr_biomed_stats')
)
ttr_non_biomed_stats = (
    retractions
    .loc[retractions['Biomedicine'] == 0, 'time_to_retraction']
    .describe()
    .rename('ttr_non_biomed_stats')
)
ttr_stats = pd.concat(
    [ttr_total_stats, ttr_biomed_stats, ttr_non_biomed_stats],
    axis=1,
)
ttr_stats.round(1)

In [None]:
# Density of the time to retraction (months): all retractions, biomedical, non-biomedical.
# The KDE is fitted on the per-paper values themselves. The *_stats objects are describe()
# outputs (count, mean, std, min, quartiles, max); a density of those eight summary
# numbers says nothing about the distribution.
sns.kdeplot(
    retractions['time_to_retraction'].dropna(),
    label='TTR Total Stats', linestyle='-',
)
sns.kdeplot(
    retractions.loc[retractions['Biomedicine'] == 1, 'time_to_retraction'].dropna(),
    label='TTR Biomed Stats', linestyle='--',
)
sns.kdeplot(
    retractions.loc[retractions['Biomedicine'] == 0, 'time_to_retraction'].dropna(),
    label='TTR Non-Biomed Stats', linestyle=':',
)

# Axis labels, title and legend
plt.xlabel('Number of months to retraction')
plt.ylabel('Density')
plt.title('Density Distribution of TTR Stats')
plt.legend()

# Show plot
plt.show()

In [None]:
# Same summary as above, restricted to each author's first retraction:
# all records, biomedical records, non-biomedical records -- shown side by side.
ttr_total_stats = (
    retractions_only_first_time['time_to_retraction']
    .describe()
    .rename('ttr_total_stats')
)
ttr_biomed_stats = (
    retractions_only_first_time
    .loc[retractions_only_first_time['Biomedicine'] == 1, 'time_to_retraction']
    .describe()
    .rename('ttr_biomed_stats')
)
ttr_non_biomed_stats = (
    retractions_only_first_time
    .loc[retractions_only_first_time['Biomedicine'] == 0, 'time_to_retraction']
    .describe()
    .rename('ttr_non_biomed_stats')
)
ttr_stats = pd.concat(
    [ttr_total_stats, ttr_biomed_stats, ttr_non_biomed_stats],
    axis=1,
)
ttr_stats.round(1)

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 2 - Temporal trends <a class="anchor" id="chapter2"></a>

In [None]:
# Columns available for the temporal analysis.
retractions.columns

In [None]:
# Mean time to retraction (months) per publication year, leaving out papers published in 2023.
retractions_grouped = (
    retractions[retractions['year_published'] != 2023]
    .groupby('year_published')['time_to_retraction']
    .mean()
    .reset_index(name='Mean TTR of Retractions')
)

# Start the per-year table from a copy of the overall means
year_counts = retractions_grouped.copy()

In [None]:
# Per-year mean time to retraction (months), split by the Biomedicine flag; 2023 is left out.
ttr_of_biomed_retracted = (
    retractions[(retractions['year_published'] != 2023) & (retractions['Biomedicine'] == 1)]
    .groupby('year_published')['time_to_retraction']
    .mean()
    .reset_index(name='Mean TTR of Retractions in Biomedicine')
)
ttr_of_non_biomed_retracted = (
    retractions[(retractions['year_published'] != 2023) & (retractions['Biomedicine'] == 0)]
    .groupby('year_published')['time_to_retraction']
    .mean()
    .reset_index(name='Mean TTR of Retractions not in Biomedicine')
)

In [None]:
# Add the biomedical and non-biomedical yearly means as extra columns (left joins on the year).
year_counts = (
    year_counts
    .merge(ttr_of_biomed_retracted, how='left', on='year_published')
    .merge(ttr_of_non_biomed_retracted, how='left', on='year_published')
)

In [None]:
# Per-year table: overall, biomedical and non-biomedical mean time to retraction.
year_counts

In [1]:
# Mean time to retraction per publication year, biomedical vs non-biomedical papers.
plt.plot(year_counts['year_published'], year_counts['Mean TTR of Retractions in Biomedicine'], label = 'Mean TTR of Retractions in Biomedicine', color="b")
plt.plot(year_counts['year_published'], year_counts['Mean TTR of Retractions not in Biomedicine'], label = 'Mean TTR of Retractions not in Biomedicine', color="g")

plt.xlabel('Year')
# The curves are mean times to retraction in months, not retraction counts.
plt.ylabel('Mean time to retraction (months)')
plt.title('Time to retraction of Articles')
plt.legend()
plt.show()

NameError: name 'plt' is not defined

: 

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 3 - Reasons for Retraction <a class="anchor" id="chapter3"></a>

In [None]:
# One row per (record, retraction reason): split the ';'-separated Reason field and
# put every reason on its own row.
# .copy() makes `motives` independent of `retractions`, so the assignments below do
# not write to a slice of it.
motives = retractions[['Record ID', 'Reason', 'Biomedicine']].copy()
motives['Reason'] = motives['Reason'].str.split(';')
motives = motives.explode('Reason')
# regex=False: remove the literal '+' marker; the default of `regex` differs between
# pandas versions, and '+' is not a valid regular expression on its own.
# strip(): identical reasons must not be counted separately because of stray spaces.
motives['Reason'] = motives['Reason'].str.replace('+', '', regex=False).str.strip()
motives = motives[motives['Reason'].notna() & (motives['Reason'] != "")]
motives


In [None]:
# Ten most frequent retraction reasons among biomedical records (Biomedicine == 1).
# Without the filter this table counted every record while being labelled "Biomed".
top10_reasons_biomed = motives[motives['Biomedicine']==1]['Reason'].value_counts()[:10].reset_index(name = "Biomed")
top10_reasons_biomed

In [None]:
# Ten most frequent retraction reasons among non-biomedical records (Biomedicine == 0).
# The filter previously selected Biomedicine == 1, i.e. the biomedical records.
top10_reasons_non_biomed = motives[motives['Biomedicine']==0]['Reason'].value_counts()[:10].reset_index(name = "Non Biomed")
top10_reasons_non_biomed

In [None]:
# Put the two top-10 tables side by side. Rows are aligned by rank position (both frames
# have a fresh 0..9 index), not by reason, so a row can show two different reasons.
top10_reasons = pd.concat([top10_reasons_biomed, top10_reasons_non_biomed], axis=1)
top10_reasons

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 4 - Geographical Differences <a class="anchor" id="chapter4"></a>

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 5 - Analysis per Journal <a class="anchor" id="chapter5"></a>

<div class="alert alert-block alert-info" style = "background:#d0de6f; color:#000000; border:0;">

# Chapter 6 - Analysis per Scientific area <a class="anchor" id="chapter6"></a>