# Assignment 3

## Sourcing data from URL.

Author: Kyra Menai Hamilton

Brief: 

The notebook should have a nice pie chart of people's email domains in the CSV file at the URL

https://drive.google.com/uc?id=1AWPf-pJodJKeHsARQK_RHiNsE8fjPCVK&export=download

This csv file has 1000 people. You may download the data or link to it.

Marks will be given for:

Just creating the pie chart
Making it look nice
As always your code should be well laid out.

In [None]:
# Import the modules needed.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests


1. Source the data

In [None]:
# Read the CSV straight from the Google Drive download link into a DataFrame.
csv_url = "https://drive.google.com/uc?id=1AWPf-pJodJKeHsARQK_RHiNsE8fjPCVK&export=download"
df = pd.read_csv(csv_url)

# Preview the first five rows to confirm the download and parse succeeded.
print(df.head(n=5))

2. Make a pie chart

To group the data by email domain, each email address is split at the `@` to [get the domain](https://stackoverflow.com/questions/53044548/how-to-extract-domain-from-email-address-with-pandas) name, and the resulting counts are then plotted as a [pie chart](https://stackoverflow.com/questions/72640253/how-to-plot-pie-chart-using-data-in-pandas).




In [None]:
# Reload the data so this cell can be run on its own.
df = pd.read_csv("https://drive.google.com/uc?id=1AWPf-pJodJKeHsARQK_RHiNsE8fjPCVK&export=download")

# The domain is the part of the address after the '@'.
# The vectorised .str methods leave missing (NaN) emails as NaN, and an address
# without an '@' also yields NaN, so no special-casing is needed here.
# Domain names are case-insensitive, so surrounding whitespace is stripped and
# the text is lower-cased; otherwise 'Gmail.com' and 'gmail.com' would be
# counted as two different domains.
# Approach based on: https://stackoverflow.com/questions/53044548/how-to-extract-domain-from-email-address-with-pandas
df['domain'] = df['Email'].str.split('@').str[1].str.strip().str.lower()
print(df)

In [None]:
# Count how many people use each email domain, most common first.
# value_counts() ignores missing (NaN) domains by default.
counts = df['domain'].value_counts()

# Show at most top_n individual slices; any remaining, less common domains
# are merged into a single 'Other' slice so the chart stays readable.
top_n = 10
labels = list(counts.index[:top_n])
sizes = list(counts.values[:top_n])
if len(counts) > top_n:
    labels.append('Other')
    sizes.append(counts.iloc[top_n:].sum())

# The raw counts are passed to pie(); autopct works out and formats the
# percentage of the total for each slice, so no manual conversion is needed.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.8,
)
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Email domains (% of total)')

# Make the percentage text stand out and keep the domain labels compact.
plt.setp(autotexts, size=10, weight='bold')
plt.setp(texts, size=9)

plt.show()

# Additional

## Visualising proportion of male vs. female respondents.

In [None]:
# Count how many people of each sex are in the dataset.
countsmf = df['Sex'].value_counts()
print(countsmf)  # to check it worked.

# Take the labels and sizes directly from the counts rather than hard-coding
# them, so each label is guaranteed to match its own count and the chart
# stays correct if the underlying data changes.
labels = list(countsmf.index)
sizes = list(countsmf.values)

# Create a pie chart showing percentage of total for each sex.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Sex (% of total)')
plt.show()

To make the data easier to explore, I decided to plot a pie chart for each of the top 20 job titles, displaying the percentage of males and females in each of those job roles.

It is interesting to note that if further analysis were to be done on this data, ensuring uniformity between the Job Titles would aid greatly in having correct counts (see Academic Librarian vs. librarian, academic for example).

In [None]:
# One small pie chart per job title, showing the male/female split for the
# most common roles.

# Cross-tabulate job title (rows) against sex (columns) to get the counts.
ct = pd.crosstab(df['Job Title'], df['Sex'])
# Add a zero-filled column for any sex that is absent from the data, so both
# columns can always be looked up in the plotting loop below.
for col in ('Male', 'Female'):
    if col not in ct.columns:
        ct[col] = 0
ct['total'] = ct.sum(axis=1)

# Keep only the top_n roles with the most people.
top_n = 20
ranked = ct.sort_values(by='total', ascending=False)
top_roles = ranked.iloc[:top_n]
print('Top roles (by total people):')
print(top_roles['total'])

# Lay the pies out on a grid, four to a row. The grid approach is adapted from
# the histograms in my pands project
# (code: https://github.com/KaiiMenai/pands-project/blob/main/analysis.py,
#  plot: https://github.com/KaiiMenai/pands-project/blob/main/histograms_by_species.png).
n = len(top_roles)
cols = 4
rows, leftover = divmod(n, cols)
if leftover:
    rows += 1  # A partly filled final row still needs its own row of axes.
fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows))
axes = axes.flatten()

pie_labels = ['Male', 'Female']
pie_colors = ['#4C72B0', '#DD8452']
for ax, (role, row) in zip(axes, top_roles.iterrows()):
    male, female = row['Male'], row['Female']
    if male + female == 0:
        # Nothing to draw for this role: show a placeholder instead of a pie.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.axis('off')
        continue
    wedges, texts, autotexts = ax.pie(
        [male, female],
        labels=pie_labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=pie_colors,
    )
    ax.set_title(f"{role} (n={int(row['total'])})", fontsize=9)
    ax.axis('equal')

# Hide any grid cells left over after the last role so no empty axes are drawn.
for spare_ax in axes[n:]:
    spare_ax.axis('off')
plt.tight_layout()
plt.show()


## Grouping by job title

In [None]:
# Clean the data by removing any row that has a missing value in any column.
# Note that this replaces df, so all later cells work on the cleaned data.
df = df.dropna()
print(df.head(5))  # to check it worked.

# Count the number of User Ids recorded against each Job Title.
grouped = df.groupby('Job Title')['User Id'].count()
print(grouped)  # to check it worked.

# Create a pie chart showing percentage of total for each Job Title.
# Job titles are ranked from most to least common; only the top_n are drawn
# individually and any remaining titles are merged into a single 'Other' slice.
counts = df['Job Title'].value_counts()
# NOTE: with top_n = 100 the chart can have up to 101 slices, so the labels
# may overlap; lower top_n for a more readable chart.
top_n = 100
labels = list(counts.index[:top_n])
sizes = list(counts.values[:top_n])
if len(counts) > top_n:
    labels.append('Other')
    sizes.append(counts.iloc[top_n:].sum())

# The raw counts are passed to pie(); autopct works out and formats the
# percentage of the total for each slice, so no manual conversion is needed.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.8,
)
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Job titles (% of total)')

# Make the percentage text stand out and keep the job-title labels compact.
plt.setp(autotexts, size=10, weight='bold')
plt.setp(texts, size=9)

plt.show()


## Splitting data by sex and looking at the Job Titles

In [None]:
# Split the dataset in two, one part per sex, so that a separate job-title
# pie chart can be drawn for each.
is_male = df['Sex'].eq('Male')
is_female = df['Sex'].eq('Female')
df_male = df.loc[is_male]
df_female = df.loc[is_female]

# Preview the first five rows of each subset to check the split worked.
for subset in (df_male, df_female):
    print(subset.head(5))

# For each subset, count the number of User Ids recorded against each Job Title.
grouped_male = df_male.groupby('Job Title')['User Id'].count()
grouped_female = df_female.groupby('Job Title')['User Id'].count()
print(grouped_male)
print(grouped_female)

### Male

In [None]:
# Create a pie chart showing percentage of total for each Job Title for males.
counts_male = df_male['Job Title'].value_counts()

# Show at most top_n individual slices; any remaining, less common job titles
# are merged into a single 'Other' slice so the chart stays readable.
top_n = 10
labels_male = list(counts_male.index[:top_n])
sizes_male = list(counts_male.values[:top_n])
if len(counts_male) > top_n:
    labels_male.append('Other')
    sizes_male.append(counts_male.iloc[top_n:].sum())

# The raw counts are passed to pie(); autopct works out and formats the
# percentage of the total for each slice, so no manual conversion is needed.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    sizes_male,
    labels=labels_male,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.8,
)
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Male job titles (% of total)')

# Make the percentage text stand out and keep the job-title labels compact.
plt.setp(autotexts, size=10, weight='bold')
plt.setp(texts, size=9)
plt.show()


### Female

In [None]:
# Create a pie chart showing percentage of total for each Job Title for females.
counts_female = df_female['Job Title'].value_counts()

# Show at most top_n individual slices; any remaining, less common job titles
# are merged into a single 'Other' slice so the chart stays readable.
top_n = 10
labels_female = list(counts_female.index[:top_n])
sizes_female = list(counts_female.values[:top_n])
if len(counts_female) > top_n:
    labels_female.append('Other')
    sizes_female.append(counts_female.iloc[top_n:].sum())

# The raw counts are passed to pie(); autopct works out and formats the
# percentage of the total for each slice, so no manual conversion is needed.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    sizes_female,
    labels=labels_female,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.8,
)
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Female job titles (% of total)')

# Make the percentage text stand out and keep the job-title labels compact.
plt.setp(autotexts, size=10, weight='bold')
plt.setp(texts, size=9)
plt.show()

# END