# Perform correlation analysis between long and short disorder predictions

## 1. Scatter plot of long-disorder vs. short-disorder MIDS per instance predictions, colored by ELM type, in eukaryotes (interactive plot)

In [None]:
import plotly.express as px
import pandas as pd
# Interactive scatter of short (x) vs. long (y) MIDS per instance for the
# eukaryotes table, with one color and one marker symbol per ELM type.
df = pd.read_csv('eukaryotes/eukaryotes_TP_full_predictions.csv')

# The three lists below are paired positionally: elm_types[i] is drawn with
# elm_type_palette[i] and elm_type_symbols[i]. Binding style to type explicitly
# keeps a type's color/marker independent of the row order of the CSV.
# NOTE(review): the pairing assumes the palette was written for the ELM types
# in alphabetical order - confirm.
elm_types = ['CLV', 'DEG', 'DOC', 'LIG', 'MOD', 'TRG']
elm_type_palette = ['#006BA4', '#FF800E', '#ABABAB', '#595959', '#5F9ED1', '#C85200']
# NOTE(review): 'circle' is used for both the first and the last type; the two
# are then told apart by color only - confirm this is intended.
elm_type_symbols = ['circle', 'diamond', 'square', 'x', 'cross', 'circle']

fig = px.scatter(
    df,
    x="Short_MIDS_per_instance",
    y="Long_MIDS_per_instance",
    color='ELMType',
    symbol='ELMType',
    title='Eukaryotes Long vs Short IUPRED2A MIDS',
    hover_data=['ELMIdentifier', 'ELMType'],
    # Fixed legend order and explicit type -> style bindings.
    category_orders={'ELMType': elm_types},
    color_discrete_map=dict(zip(elm_types, elm_type_palette)),
    symbol_map=dict(zip(elm_types, elm_type_symbols)),
    # The sequences remain as the fallback for any ELMType value not listed above.
    color_discrete_sequence=elm_type_palette,
    symbol_sequence=elm_type_symbols,
)

# Transparent background, both axes fixed to [0, 1], centered title, Arial fonts.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis_range=[0, 1],
    xaxis_range=[0, 1],
    font_family='Arial',
    title_font_family='Arial',
    title_x=0.5,
    title_font_size=28,
    font_color='black',
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=dict(size=18)),
)

# Both axes share the same tick, line and title-font styling; only the title differs.
axis_style = dict(
    ticks='outside',
    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1],
    tickfont=dict(size=18),
    showline=True,
    linewidth=2,
    linecolor='black',
    title_font={"size": 20},
)
fig.update_xaxes(title_text="Short MIDS per instance", **axis_style)
fig.update_yaxes(title_text="Long MIDS per instance", **axis_style)
fig.show()

## Perform the correlation calculations by ELM type for eukaryotes

In [None]:
import pandas as pd
from scipy.stats import spearmanr

# Spearman rank correlation between the long and short IUPRED2A
# MIDS-per-instance columns of the eukaryotes table: once for each ELM type,
# then once over every row of the table.
# Each result is printed as "<label>,<coefficient>,<p-value>" followed by a
# blank line, with the p-value in exponent notation.
df = pd.read_csv('eukaryotes/eukaryotes_TP_full_predictions.csv')

eukaryotes_elm_types = ['CLV', 'DEG', 'DOC', 'LIG', 'MOD', 'TRG']

# (label, rows) pairs: one per ELM type ...
comparisons = [
    (
        'eukaryotes_long_{0} Vs eukaryotes_short_{0}'.format(elm_type),
        df[df['ELMType'] == elm_type],
    )
    for elm_type in eukaryotes_elm_types
]
# ... and the whole table last, so `coef` and `p` end up holding the overall result.
comparisons.append(('eukaryotes_long_disorder Vs eukaryotes_short_disorder', df))

for label, rows in comparisons:
    coef, p = spearmanr(rows['Long_MIDS_per_instance'], rows['Short_MIDS_per_instance'])
    print('{},{},{:e}\n'.format(label, coef, p))


## 2. Scatter plot of long-disorder vs. short-disorder MIDS per instance predictions, colored by ELM type, in bacteria (interactive plot)

In [None]:
import plotly.express as px
import pandas as pd
# Interactive scatter of short (x) vs. long (y) MIDS per instance for the
# bacteria table, with one color and one marker symbol per ELM type.
df = pd.read_csv('bacteria/bacteria_TP_full_predictions.csv')

# The three lists below are paired positionally: elm_types[i] is drawn with
# elm_type_palette[i] and elm_type_symbols[i]. Binding style to type explicitly
# keeps a type's color/marker independent of the row order of the CSV.
# NOTE(review): the pairing assumes the palette was written for these five ELM
# types in alphabetical order - confirm.
elm_types = ['CLV', 'DOC', 'LIG', 'MOD', 'TRG']
elm_type_palette = ['#006BA4', '#ABABAB', '#595959', '#5F9ED1', '#C85200']
# NOTE(review): 'circle' is used for both the first and the last type; the two
# are then told apart by color only - confirm this is intended.
elm_type_symbols = ['circle', 'square', 'x', 'cross', 'circle']

fig = px.scatter(
    df,
    x="Short_MIDS_per_instance",
    y="Long_MIDS_per_instance",
    color="ELMType",
    symbol='ELMType',
    title='Bacteria Long vs Short IUPRED2A MIDS',
    hover_data=['ELMIdentifier', 'ELMType'],
    # Fixed legend order and explicit type -> style bindings.
    category_orders={'ELMType': elm_types},
    color_discrete_map=dict(zip(elm_types, elm_type_palette)),
    symbol_map=dict(zip(elm_types, elm_type_symbols)),
    # The sequences remain as the fallback for any ELMType value not listed above.
    color_discrete_sequence=elm_type_palette,
    symbol_sequence=elm_type_symbols,
)

# Transparent background, both axes fixed to [0, 1], centered title, Arial fonts.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis_range=[0, 1],
    xaxis_range=[0, 1],
    font_family='Arial',
    title_font_family='Arial',
    title_x=0.5,
    title_font_size=28,
    font_color='black',
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=dict(size=18)),
)

# Both axes share the same tick, line and title-font styling; only the title differs.
axis_style = dict(
    ticks='outside',
    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1],
    tickfont=dict(size=18),
    showline=True,
    linewidth=2,
    linecolor='black',
    title_font={"size": 20},
)
fig.update_xaxes(title_text="Short MIDS per instance", **axis_style)
fig.update_yaxes(title_text="Long MIDS per instance", **axis_style)
fig.show()

## Perform the correlation calculations by ELM type for bacteria

In [None]:
import pandas as pd
from scipy.stats import spearmanr

# Spearman rank correlation between the long and short IUPRED2A
# MIDS-per-instance columns of the bacteria table: once for each ELM type,
# then once over every row of the table.
# Each result is printed as "<label>,<coefficient>,<p-value>" followed by a
# blank line, with the p-value in exponent notation.
df = pd.read_csv('bacteria/bacteria_TP_full_predictions.csv')

bacteria_elm_types = ['CLV', 'DOC', 'LIG', 'MOD', 'TRG']

# (label, rows) pairs: one per ELM type ...
comparisons = [
    (
        'bacteria_long_{0} Vs bacteria_short_{0}'.format(elm_type),
        df[df['ELMType'] == elm_type],
    )
    for elm_type in bacteria_elm_types
]
# ... and the whole table last, so `coef` and `p` end up holding the overall result.
comparisons.append(('bacteria_long_disorder Vs bacteria_short_disorder', df))

for label, rows in comparisons:
    coef, p = spearmanr(rows['Long_MIDS_per_instance'], rows['Short_MIDS_per_instance'])
    print('{},{},{:e}\n'.format(label, coef, p))


## 3. Scatter plot of long-disorder vs. short-disorder MIDS per instance predictions, colored by ELM type, in viruses (interactive plot)

In [None]:
import plotly.express as px
import pandas as pd
# Interactive scatter of short (x) vs. long (y) MIDS per instance for the
# viruses table, with one color and one marker symbol per ELM type.
df = pd.read_csv('viruses/viruses_TP_full_predictions.csv')

# The three lists below are paired positionally: elm_types[i] is drawn with
# elm_type_palette[i] and elm_type_symbols[i]. Binding style to type explicitly
# keeps a type's color/marker independent of the row order of the CSV.
# NOTE(review): the pairing assumes the palette was written for the ELM types
# in alphabetical order - confirm.
elm_types = ['CLV', 'DEG', 'DOC', 'LIG', 'MOD', 'TRG']
elm_type_palette = ['#006BA4', '#FF800E', '#ABABAB', '#595959', '#5F9ED1', '#C85200']
# NOTE(review): 'circle' is used for both the first and the last type; the two
# are then told apart by color only - confirm this is intended.
elm_type_symbols = ['circle', 'diamond', 'square', 'x', 'cross', 'circle']

fig = px.scatter(
    df,
    x="Short_MIDS_per_instance",
    y="Long_MIDS_per_instance",
    color='ELMType',
    symbol='ELMType',
    # Capitalized to match the titles of the eukaryotes and bacteria plots.
    title='Viruses Long vs Short IUPRED2A MIDS',
    hover_data=['ELMIdentifier', 'ELMType'],
    # Fixed legend order and explicit type -> style bindings.
    category_orders={'ELMType': elm_types},
    color_discrete_map=dict(zip(elm_types, elm_type_palette)),
    symbol_map=dict(zip(elm_types, elm_type_symbols)),
    # The sequences remain as the fallback for any ELMType value not listed above.
    color_discrete_sequence=elm_type_palette,
    symbol_sequence=elm_type_symbols,
)

# Transparent background, both axes fixed to [0, 1], centered title, Arial fonts.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis_range=[0, 1],
    xaxis_range=[0, 1],
    font_family='Arial',
    title_font_family='Arial',
    title_x=0.5,
    title_font_size=28,
    font_color='black',
    legend=dict(font=dict(size=20)),
    legend_title=dict(font=dict(size=18)),
)

# Both axes share the same tick, line and title-font styling; only the title differs.
axis_style = dict(
    ticks='outside',
    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1],
    tickfont=dict(size=18),
    showline=True,
    linewidth=2,
    linecolor='black',
    title_font={"size": 20},
)
fig.update_xaxes(title_text="Short MIDS per instance", **axis_style)
fig.update_yaxes(title_text="Long MIDS per instance", **axis_style)
fig.show()

## Perform the correlation calculations by ELM type for viruses

In [None]:
import pandas as pd
from scipy.stats import spearmanr

# Spearman rank correlation between the long and short IUPRED2A
# MIDS-per-instance columns of the viruses table: once for each ELM type,
# then once over every row of the table.
# Each result is printed as "<label>,<coefficient>,<p-value>" followed by a
# blank line, with the p-value in exponent notation.
df = pd.read_csv('viruses/viruses_TP_full_predictions.csv')

viruses_elm_types = ['CLV', 'DEG', 'DOC', 'LIG', 'MOD', 'TRG']

# (label, rows) pairs: one per ELM type ...
comparisons = [
    (
        'viruses_long_{0} Vs viruses_short_{0}'.format(elm_type),
        df[df['ELMType'] == elm_type],
    )
    for elm_type in viruses_elm_types
]
# ... and the whole viruses table last, so `coef` and `p` end up holding the
# overall result.
comparisons.append(('viruses_long_disorder Vs viruses_short_disorder', df))

for label, rows in comparisons:
    coef, p = spearmanr(rows['Long_MIDS_per_instance'], rows['Short_MIDS_per_instance'])
    print('{},{},{:e}\n'.format(label, coef, p))
