# Gender overview at universities

In [1]:
import pandas as pd
import numpy as np

import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
df = pd.read_csv('unis_data.csv', index_col = 0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13325 entries, 0 to 2344
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               13325 non-null  int64  
 1   rank_order                         13325 non-null  int64  
 2   rank                               13325 non-null  object 
 3   name                               13325 non-null  object 
 4   scores_overall                     12329 non-null  object 
 5   scores_overall_rank                13325 non-null  int64  
 6   scores_teaching                    12329 non-null  float64
 7   scores_teaching_rank               13325 non-null  int64  
 8   scores_international_outlook       12329 non-null  object 
 9   scores_international_outlook_rank  13325 non-null  int64  
 10  scores_industry_income             12329 non-null  object 
 11  scores_industry_income_rank        13325 non-null  int6

In [4]:
df = df.loc[:, ['year', 'name', 'location', 'stats_number_students', 'stats_female_male_ratio']]
df

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio
0,2011,Harvard University,United States,,
1,2011,California Institute of Technology,United States,,
2,2011,Massachusetts Institute of Technology,United States,,
3,2011,Stanford University,United States,,
4,2011,Princeton University,United States,,
...,...,...,...,...,...
2340,2023,York St John University,United Kingdom,6315,65 : 35
2341,2023,"Yusuf Maitama Sule University, Kano",Nigeria,1288,48 : 52
2342,2023,Zhytomyr Polytechnic State University,Ukraine,3869,34 : 66
2343,2023,Ziauddin University,Pakistan,4906,63 : 37


In [5]:
df.isnull().sum()

year                          0
name                          0
location                      0
stats_number_students      1803
stats_female_male_ratio    2330
dtype: int64

In [6]:
df_help = df.dropna()
df_help

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio
0,2016,California Institute of Technology,United States,2243,33 : 67
1,2016,University of Oxford,United Kingdom,1992,46 : 54
2,2016,Stanford University,United States,15596,42 : 58
3,2016,University of Cambridge,United Kingdom,1881,46 : 54
4,2016,Massachusetts Institute of Technology,United States,11074,37 : 63
...,...,...,...,...,...
2340,2023,York St John University,United Kingdom,6315,65 : 35
2341,2023,"Yusuf Maitama Sule University, Kano",Nigeria,1288,48 : 52
2342,2023,Zhytomyr Polytechnic State University,Ukraine,3869,34 : 66
2343,2023,Ziauddin University,Pakistan,4906,63 : 37


Data on ratio starts in 2016

In [7]:
df = df[df.year >= 2016]
df

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio
0,2016,California Institute of Technology,United States,2243,33 : 67
1,2016,University of Oxford,United Kingdom,1992,46 : 54
2,2016,Stanford University,United States,15596,42 : 58
3,2016,University of Cambridge,United Kingdom,1881,46 : 54
4,2016,Massachusetts Institute of Technology,United States,11074,37 : 63
...,...,...,...,...,...
2340,2023,York St John University,United Kingdom,6315,65 : 35
2341,2023,"Yusuf Maitama Sule University, Kano",Nigeria,1288,48 : 52
2342,2023,Zhytomyr Polytechnic State University,Ukraine,3869,34 : 66
2343,2023,Ziauddin University,Pakistan,4906,63 : 37


In [8]:
df.isnull().sum()

year                         0
name                         0
location                     0
stats_number_students        0
stats_female_male_ratio    527
dtype: int64

In [9]:
df.loc[:, 'female_percentage'] = df['stats_female_male_ratio'].str.slice(0, 2)
df

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio,female_percentage
0,2016,California Institute of Technology,United States,2243,33 : 67,33
1,2016,University of Oxford,United Kingdom,1992,46 : 54,46
2,2016,Stanford University,United States,15596,42 : 58,42
3,2016,University of Cambridge,United Kingdom,1881,46 : 54,46
4,2016,Massachusetts Institute of Technology,United States,11074,37 : 63,37
...,...,...,...,...,...,...
2340,2023,York St John University,United Kingdom,6315,65 : 35,65
2341,2023,"Yusuf Maitama Sule University, Kano",Nigeria,1288,48 : 52,48
2342,2023,Zhytomyr Polytechnic State University,Ukraine,3869,34 : 66,34
2343,2023,Ziauddin University,Pakistan,4906,63 : 37,63


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11522 entries, 0 to 2344
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   year                     11522 non-null  int64 
 1   name                     11522 non-null  object
 2   location                 11522 non-null  object
 3   stats_number_students    11522 non-null  object
 4   stats_female_male_ratio  10995 non-null  object
 5   female_percentage        10995 non-null  object
dtypes: int64(1), object(5)
memory usage: 630.1+ KB


In [11]:
#df['female_percentage'] = df['female_percentage'].astype(float)
df.loc[:, 'female_percentage'] = df['female_percentage'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11522 entries, 0 to 2344
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     11522 non-null  int64  
 1   name                     11522 non-null  object 
 2   location                 11522 non-null  object 
 3   stats_number_students    11522 non-null  object 
 4   stats_female_male_ratio  10995 non-null  object 
 5   female_percentage        10995 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 630.1+ KB


In [12]:
#This is not an important column for the analysis but here is a method to change "," to "." 
#In this case in some numbers there are two "," - second one is replaced with '.', the first one with an empty string 

df['stats_number_students'] = [x.replace(',', '.') for x in df['stats_number_students']]
df['stats_number_students'] = df['stats_number_students'].str.replace('^([^.]+)\.([^.]+\.)', r'\1\2', regex=True)
df['stats_number_students'] = df['stats_number_students'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11522 entries, 0 to 2344
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     11522 non-null  int64  
 1   name                     11522 non-null  object 
 2   location                 11522 non-null  object 
 3   stats_number_students    11522 non-null  float64
 4   stats_female_male_ratio  10995 non-null  object 
 5   female_percentage        10995 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 630.1+ KB


In [13]:
df = df.fillna(value = df['female_percentage'].mean())
df

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio,female_percentage
0,2016,California Institute of Technology,United States,2.243,33 : 67,33.0
1,2016,University of Oxford,United Kingdom,19.920,46 : 54,46.0
2,2016,Stanford University,United States,15.596,42 : 58,42.0
3,2016,University of Cambridge,United Kingdom,18.810,46 : 54,46.0
4,2016,Massachusetts Institute of Technology,United States,11.074,37 : 63,37.0
...,...,...,...,...,...,...
2340,2023,York St John University,United Kingdom,6.315,65 : 35,65.0
2341,2023,"Yusuf Maitama Sule University, Kano",Nigeria,12.880,48 : 52,48.0
2342,2023,Zhytomyr Polytechnic State University,Ukraine,3.869,34 : 66,34.0
2343,2023,Ziauddin University,Pakistan,4.906,63 : 37,63.0


In [14]:
df.isnull().sum().sum()

0

In [20]:
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go

warnings.filterwarnings("ignore")

In [31]:
data = df.groupby('year')['female_percentage'].mean().round(2)
df_perc = pd.DataFrame(data.values, columns=['perc_mean'], index=data.index)
df_perc.reset_index(inplace=True)
df_perc

Unnamed: 0,year,perc_mean
0,2016,49.14
1,2017,49.14
2,2018,49.0
3,2019,49.17
4,2020,49.68
5,2021,49.94
6,2022,50.07
7,2023,50.23


In [32]:
config = {
    'toImageButtonOptions':
    {
        'format': 'png',
        'filename': 'Plot',
        'width': 1080,
        'height': 600,
        'scale': 12
    }
}

In [33]:
plot = px.bar(df_perc, x='year', y='perc_mean', color_discrete_sequence=px.colors.diverging.Geyser,
             labels={
                 'year' : 'Year',
                 'perc_mean' : 'Percentage'},
             )
plot = go.Figure(plot,layout_yaxis_range = [45,52])
plot.update_coloraxes(showscale=False)
plot.update_layout(
    title=dict(
        text='<b>Average female percentage in universities</b>\
              <br><i><sup>form 2011 to 2023</sup></i>',
        x=0.085,
        y=0.95,
        font=dict(
            family='Helvetica',
            size=25,
            color='#272b4f'
        )))

plot.add_hline(y=df_perc['perc_mean'].mean())

# Create text string:
explanation = "<b>Conclusion</b> <br>\
               On average, female to male ratio<br>\
               is 50 : 50 and it has been rising throughout<br>\
               the entire period from 2011 to 2023"

# Add explanation for trends:
plot.add_annotation(x=.615,
                    y=0.02,
                    text=explanation,
                    textangle=0,
                    xanchor='left',
                    xref="paper",
                    yref="paper",
                    font_color='black',
                    bordercolor='black',
                    borderpad=5,
                    bgcolor='white',
                    showarrow=False
                    )
             

plot.update_traces(dict(marker_line_width=0))
plot.show(config=config)

In [41]:
df2022 = df[df.year == 2022]
df2022

Unnamed: 0,year,name,location,stats_number_students,stats_female_male_ratio,female_percentage
0,2022,University of Oxford,United Kingdom,20.835,47 : 53,47.0
1,2022,California Institute of Technology,United States,2.233,36 : 64,36.0
2,2022,Harvard University,United States,21.574,50 : 50,50.0
3,2022,Stanford University,United States,16.319,46 : 54,46.0
4,2022,University of Cambridge,United Kingdom,19.680,47 : 53,47.0
...,...,...,...,...,...,...
2107,2022,Yaşar University,Turkey,6.847,53 : 47,53.0
2108,2022,Yenepoya University,India,3.104,67 : 33,67.0
2109,2022,Yogyakarta State University,Indonesia,24.988,72 : 28,72.0
2110,2022,York St John University,United Kingdom,6.030,66 : 34,66.0


In [42]:
df2022 = df2022.groupby('location')['female_percentage'].mean().round(2).to_frame().reset_index()
df2022 = df2022.rename(columns={'location': 'Location', 'female_percentage':'Percentage'})
df2022

Unnamed: 0,Location,Percentage
0,Algeria,61.27
1,Argentina,56.10
2,Armenia,38.00
3,Australia,56.05
4,Austria,49.50
...,...,...
106,United States,52.17
107,Uruguay,48.00
108,Uzbekistan,40.20
109,Venezuela,60.00


In [70]:
plot = px.bar(df2022, x='Location', y='Percentage', color_discrete_sequence=px.colors.diverging.Geyser,)
plot.update_coloraxes(showscale=False)
plot.update_layout(
    title=dict(
        text='<b>Average female percentage in universities in 2022</b>\
              <br><i><sup>by location</sup></i>',
        x=0.085,
        y=0.95,
        font=dict(
            family='Helvetica',
            size=25,
            color='#272b4f'
        )))

perc_mean = round(float(df2022['Percentage'].mean()), 2)
plot.add_hline(y=perc_mean)

# Create text string:
explanation = "<b>Conclusion</b> <br>\
               In 2022 females accounted for {} %<br>\
               of the total number of students and workers at universitites<br>\
               however, in some countries females account for just about 20%.".format(perc_mean)

# Add explanation for trends:
plot.add_annotation(x=.495,
                    y=0.0,
                    text=explanation,
                    textangle=0,
                    xanchor='left',
                    xref="paper",
                    yref="paper",
                    font_color='black',
                    bordercolor='black',
                    borderpad=5,
                    bgcolor='white',
                    showarrow=False
                    )
             
plot.add_annotation(x=.05,
                    y=0.67,
                    text="Average percentage in 2022",
                    textangle=0,
                    xanchor='left',
                    xref="paper",
                    yref="paper",
                    font_color='black',
                    bordercolor='black',
                    borderpad=5,
                    showarrow=True,
                    arrowhead=2,
                    bgcolor='white',
                    arrowside='end'
                    )
plot.update_traces(dict(marker_line_width=0))
plot.show(config=config)

In [88]:
df_min = df2022.nsmallest(5, 'Percentage')
df_max = df2022.nlargest(5, 'Percentage')
df_min = df_min.rename(columns = {'Location': 'Location', 'Percentage': 'Minimum %'})
df_max = df_max.rename(columns = {'Location': 'Location', 'Percentage': 'Maximum %'})

In [117]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=("Minimum female %", "Maximum female %"))

fig.add_trace(
    go.Bar(x=df_min['Location'], y=df_min['Minimum %'], marker=dict(color='darkblue')),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=df_max['Location'], y=df_max['Maximum %'], marker=dict(color='royalblue')),
    row=1, col=2
)

fig.update_layout(height=600, width=900,
    title=dict(
        text='<b>Extremes</b>\
              <br><i><sup>in 2022</sup></i>',
        x=0.085,
        y=0.95,
        font=dict(
            family='Helvetica',
            size=25,
            color='#272b4f'
        )
    ),
    showlegend = False,
)

fig.update_xaxes(title_text='Country', row=1, col=1)
fig.update_xaxes(title_text='Country', row=1, col=2)
fig.update_yaxes(title_text='Female percentage',range=[15, 80], row=1, col=1)
fig.update_yaxes(title_text='Female percentage',range=[15, 80], row=1, col=2)
fig.show()