# Women in chess

The objective of this notebook is to analyze possible causes of the lower ratings of women in chess compared to men.
There are several questions I want to address:
- the percentage of women playing chess
- women's ratings over time
- women at the top level at different ages and/or over recent years

## Percentage of women playing chess

In [None]:
#import packages
import pandas as pd
import os
from plotnine import *

In [None]:
#Read data
#FIDE files have 2 different formats. Files with each format are in a different folder.
#Paths are built with os.path.join instead of backslash literals: strings such as
#"data\old_format" contain invalid escape sequences and only resolve on Windows.
OLD_FORMAT_DIR = os.path.join("data", "old_format")
NEW_FORMAT_DIR = os.path.join("data", "new_format")


def _summarise_rating_list(year, ratings, is_woman, is_man):
    """Summarise one rating list as plot-ready records.

    Parameters
    ----------
    year : str
        Four-digit year label stored in every record.
    ratings : pandas.Series
        Rating of every player in the list.
    is_woman, is_man : pandas.Series of bool
        Masks aligned with ``ratings`` selecting women and men.

    Returns
    -------
    tuple of (list of dict, list of dict)
        Two "percent" records and two "elo" records (Women first, then Men).
        The men's percentage is always the complement of the women's one.
    """
    percent_w = int(is_woman.sum()) / len(ratings) * 100
    percent_m = 100 - percent_w
    elo_w = ratings[is_woman].mean()
    elo_m = ratings[is_man].mean()
    number_records = [
        {"year": year, "percent": percent_w, "Sex": "Women"},
        {"year": year, "percent": percent_m, "Sex": "Men"},
    ]
    elo_records = [
        {"year": year, "elo": elo_w, "Sex": "Women"},
        {"year": year, "elo": elo_m, "Sex": "Men"},
    ]
    return number_records, elo_records


#Create 2 lists to store information
df_list_number = []#List for the percentage of women
df_list_elo = []#List for the difference in rating

#Read files with old format
old_names = os.listdir(OLD_FORMAT_DIR)

for name in old_names:
    year_label = "20" + name[3:5]#Get year from file name
    data = pd.read_fwf(os.path.join(OLD_FORMAT_DIR, name), delimiter=' ')
    #In the old format women carry the flag "w" (or "wi"); every other row is counted as a man
    is_woman = data.Flag.isin(["w", "wi"])
    number_records, elo_records = _summarise_rating_list(
        year_label, data.iloc[:, 3], is_woman, ~is_woman
    )
    df_list_number.extend(number_records)
    df_list_elo.extend(elo_records)

#Read files with new format
new_names = os.listdir(NEW_FORMAT_DIR)

for name in new_names:
    year = int(name[12:14])#Two-digit year from file name
    #Format changed after 16, adding one column. This selects the right column for ranking
    rating_col = 7 if year < 17 else 8
    data = pd.read_fwf(os.path.join(NEW_FORMAT_DIR, name), delimiter=' ')
    #Convert into a separate Series rather than assigning back through iloc,
    #so the result is numeric regardless of how pandas handles in-place dtype changes
    ratings = pd.to_numeric(data.iloc[:, rating_col], errors="coerce")
    #Zero-pad the year so a single-digit value still gives a four-digit label
    number_records, elo_records = _summarise_rating_list(
        f"20{year:02d}", ratings, data.Sex == "F", data.Sex == "M"
    )
    df_list_number.extend(number_records)
    df_list_elo.extend(elo_records)

#Convert lists of dictionaries into dataframes
df_number = pd.DataFrame(df_list_number, columns=["year", "percent", "Sex"])
df_elo = pd.DataFrame(df_list_elo, columns=["year", "elo", "Sex"])

In [None]:
#Make the year numeric and put the rows in chronological order
df_number["year"] = pd.to_numeric(df_number["year"])
df_number = df_number.sort_values(by="year")

#Bar chart of the share of women, one bar per year
(
    ggplot(df_number.loc[df_number["Sex"] == "Women"], aes(x="year", y="percent"))
    + geom_bar(stat="identity", fill="Blue")
    + labs(x="Year", y="Percentage women", title="Percentage of women in chess")
    + coord_cartesian(ylim=(0, 12))
    + theme_classic()
)

We can see a clear increase in the percentage of women over the years. The proportion has almost doubled in the last 15 years.

## Rating difference between men and women

In [None]:
#Make the year numeric and put the rows in chronological order
df_elo["year"] = pd.to_numeric(df_elo["year"])
df_elo = df_elo.sort_values(by="year")

#Side-by-side bars of the mean rating of men and women for each year
(
    ggplot(df_elo, aes(x="year", y="elo", fill="Sex"))
    + geom_bar(stat="identity", position="dodge")
    + labs(x="Year", y="Elo rating", title="Elo rating for men and women")
    + coord_cartesian(ylim=(1000, 2100))
    + theme_classic()
    + theme(legend_position=(0.8, 0.8))
)

The rating difference was smaller 15 years ago, but this is difficult to compare because the rating floor for players has been lowered in recent years. In any case, there does not seem to have been a change in the last 5-6 years.

## Women representation in top 100 by age

In [None]:
#Get number of women in top 100 for 4 different years: 2006, 2011, 2016, 2021
#Paths are built with os.path.join instead of backslash literals: strings such as
#'data\old_format\JUL06FRL.TXT' contain invalid escape sequences and only resolve on Windows.


def _prepare_old_format(raw, rating_col, list_year):
    """Return a cleaned copy of an old-format rating list.

    Splits the merged "GamesBorn" column into numeric "games" and "year"
    columns, adds "age" as ``list_year - year`` and drops the rows that have
    no value in ``rating_col``. ``raw`` itself is left untouched.
    """
    prepared = raw.copy()
    #Split wrong column: games played and birth year were read as a single field
    prepared[["games", "year"]] = prepared["GamesBorn"].str.split("  ", expand=True)
    prepared["year"] = pd.to_numeric(prepared["year"])
    prepared["games"] = pd.to_numeric(prepared["games"])
    prepared["age"] = list_year - prepared["year"]
    return prepared.dropna(subset=[rating_col])


def _prepare_new_format(raw, rating_col, list_year):
    """Return a cleaned copy of a new-format rating list.

    Renames "B-day" to "year", converts ``rating_col`` and "year" to numbers
    and adds "age" as ``list_year - year``. ``raw`` itself is left untouched.
    """
    prepared = raw.rename(columns={"B-day": "year"})#Rename B-day column to remove "-"
    prepared[rating_col] = pd.to_numeric(prepared[rating_col])
    prepared["year"] = pd.to_numeric(prepared["year"])
    prepared["age"] = list_year - prepared["year"]
    return prepared


#read data into dataframes
data06 = _prepare_old_format(
    pd.read_fwf(os.path.join("data", "old_format", "JUL06FRL.TXT"), delimiter=' '),
    "Jul06",
    2006,
)

#NOTE(review): the rating column of the May 2011 file is referenced as "May10" - confirm against the file header
data11 = _prepare_old_format(
    pd.read_fwf(os.path.join("data", "old_format", "may11frl.txt"), delimiter=' '),
    "May10",
    2011,
)

data16 = _prepare_new_format(
    pd.read_fwf(os.path.join("data", "new_format", "standard_may16frl.TXT"), delimiter=' '),
    "MAY16",
    2016,
)

data21 = pd.read_fwf(os.path.join("data", "new_format", "standard_mar21frl.TXT"), delimiter=' ')
data21 = data21[data21.MAR21 != "M 189"] #Fix a wrong line
data21 = _prepare_new_format(data21, "MAR21", 2021)


In [None]:
##Create data frame with data
def _drop_inactive(players):
    """Return the rows of ``players`` whose Flag is neither "i" nor "wi"."""
    return players[(players.Flag != "i") & (players.Flag != "wi")]


def _women_in_top(players, rating_col, sex_col, woman_value, list_year, ages, top_n=100):
    """Count women among the ``top_n`` highest-rated players under each age.

    Parameters
    ----------
    players : pandas.DataFrame
        Rating list with an "age" column plus ``rating_col`` and ``sex_col``.
    rating_col : str
        Column used to rank the players (highest first).
    sex_col, woman_value : str
        A row is counted as a woman when ``players[sex_col] == woman_value``.
    list_year : int
        Year stored in every returned record.
    ages : iterable of int
        Exclusive upper age limits; one record is produced per limit.
    top_n : int, default 100
        Number of top-rated players inspected for each age limit.

    Returns
    -------
    list of dict
        Records with keys "age", "percentage" and "year". "percentage" holds
        the NUMBER of women found; it only equals a percentage when the age
        group has at least 100 players and ``top_n`` is 100. The key name is
        kept because the plotting cells read it.
    """
    records = []
    for age in ages:
        top_players = (
            players[players["age"] < age]
            .sort_values(rating_col, ascending=False)
            .head(top_n)
        )
        women_count = int((top_players[sex_col] == woman_value).sum())
        records.append({"age": "under " + str(age), "percentage": women_count, "year": list_year})#Append info as dictionary
    return records


ages = list(range(11, 44, 3))#Choose ages to analyze

#Remove inactive players from every list
data21 = _drop_inactive(data21)
data16 = _drop_inactive(data16)
data11 = _drop_inactive(data11)
data06 = _drop_inactive(data06)

#Current year (2021) first, then past years. New-format files have a Sex column;
#old-format files mark women with the "w" flag
df_list = (
    _women_in_top(data21, "MAR21", "Sex", "F", 2021, ages)
    + _women_in_top(data16, "MAY16", "Sex", "F", 2016, ages)
    + _women_in_top(data11, "May10", "Flag", "w", 2011, ages)
    + _women_in_top(data06, "Jul06", "Flag", "w", 2006, ages)
)

#convert list of dictionaries into dataframe
df = pd.DataFrame(df_list, columns=["age", "percentage", "year"])

In [None]:
#Bar chart of the 2021 records only: women among the top 100 for each age limit
(
    ggplot(df.loc[df["year"] == 2021], aes(x="age", y="percentage"))
    + geom_bar(stat="identity", fill="Blue")
    + labs(x="Age", y="Number of women", title="Number of women in top 100 players - 2021")
    + theme_classic()
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
)

There is a sharp decrease in the number of women at the top level during the teenage years. Is this a trend that has happened before?

In [None]:
#Same bar chart for every analysed year, one panel per year
(
    ggplot(df, aes(x="age", y="percentage"))
    + geom_bar(stat="identity", fill="Blue")
    + labs(y="Number of women", title="Number of women in top 100 players")
    + theme_classic()
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
    + facet_wrap("~year")
)

We see that the number of women at the top level decreases with age, especially after reaching adulthood. There could be social causes that explain this drop.