In [2]:
import warnings
from pathlib import Path

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
warnings.filterwarnings("ignore")

# Import and Prepare Data
The finch dataset is stored in a .csv file named "finchData.csv". In this activity, you will read the data into a pandas data frame, do a little processing, and extract the data for the scandens and fortis finch species.

You can use the read_csv function to read the data from a file into a pandas data frame.

df = pd.read_csv(fileName)

In [None]:
# Locate finchData.csv. A copy next to the notebook is preferred so the
# notebook runs on any machine; the original external-volume location is kept
# as a fallback so existing setups keep working.
finch_csv_candidates = [
    Path("finchData.csv"),
    Path("/Volumes/TOLG/2024_spring/1206_data_sets/finchData.csv"),
]
# First candidate that exists; if none does, fall through to the local name so
# the resulting FileNotFoundError names the file the reader should provide.
finch_csv = next((p for p in finch_csv_candidates if p.exists()), finch_csv_candidates[0])

finches = pd.read_csv(finch_csv)

# Preview the first five rows.
finches.head()

In [None]:
# Number of missing values in each column, sorted from fewest to most.
finches.isna().sum().sort_values()

In [None]:
# Column names, non-null counts and dtypes of the table as read from the file.
finches.info()

In [None]:
# (number of rows, number of columns)
finches.shape

In [None]:
# Second entry of shape: the number of columns.
finches.shape[1]

In [None]:
# Total number of cells in the table (rows x columns).
finches.size

In [None]:
# Number of distinct values in each column (missing values are not counted by default).
finches.nunique()

# Changing Dtypes

Convert 'Species' to a categorical variable, and 'Band' and 'Year' to type str.

In [None]:
# Species is stored as a categorical variable.
finches['Species'] = finches['Species'].astype('category')

# Band is stored as text rather than as a number.
finches['Band'] = finches['Band'].astype('str')

# Year is stored as text as well. It must hold strings (not an integer-backed
# category) because later cells select rows with string comparisons such as
# Year == "1975", which match nothing when the underlying values are integers.
finches['Year'] = finches['Year'].astype('str')

In [None]:
# Re-inspect the column dtypes now that the conversions have been applied.
finches.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row.
finches.describe(include='all').T

In [None]:
# describe() with its default arguments (numeric columns), transposed.
finches.describe().T

In [None]:
# Number of rows (observations) for each species.
finches.groupby("Species").size()

In [None]:
# Number of rows (observations) for each year.
finches.groupby("Year").size()

# Different data extracting methods

There are about half as many finches in 2012 as in 1975. Does this hold for both finch species?

To investigate, create a separate table for the scandens finch species. You can select data according to a condition using relational operators such as == and the query method of data frames.

In [None]:
# Element-wise comparison: yields a boolean Series that is True on the rows
# whose Species equals 'scandens'.
finches['Species'] == 'scandens'

In [None]:
# Method 1: pass the boolean mask to .loc to keep only the scandens rows.
scandens = finches.loc[finches['Species'] == 'scandens']
scandens

In [None]:
# Same technique for the fortis rows. General form of .loc:
#     df.loc[row selector, column selector]
# Here only a row selector (a boolean mask) is given, so all columns are kept.
fortis = finches.loc[finches['Species'] == 'fortis']
fortis

In [None]:
# Summary statistics for the scandens subset, one row per described column.
scandens.describe().T

In [None]:
# Number of scandens observations in each year.
scandens.groupby("Year").size()

In [None]:
# Method 2: the same fortis selection written as a query string.
# This rebinds `fortis`, replacing the frame built with .loc above.
fortis = finches.query('Species == "fortis"')

In [None]:
# Summary statistics for the fortis subset, one row per described column.
fortis.describe().T

In [None]:
# Number of fortis observations in each year.
fortis.groupby("Year").size()

In [None]:
# Scatter plots show how two variables relate to each other; here beak depth
# is drawn against beak length for every finch using seaborn's scatterplot.

sns.set(style="ticks")

# scatterplot returns the Axes it drew on, so labels and title are set on it directly.
sns.scatterplot(data=finches, x='BeakLength', y='BeakDepth').set(
    xlabel='Beak Length',
    ylabel='Beak Depth',
    title="Scatter Plot",
);

Notice that there appear to be two "groups" in the plot. You can visualize different groups in a scatter plot by assigning each group a different color.

In [None]:
sns.set(style="ticks")

# Same scatter plot, with one colour per species (hue).
sns.scatterplot(data=finches, x='BeakLength', y='BeakDepth', hue='Species').set(
    xlabel='Beak Length',
    ylabel='Beak Depth',
);


In [None]:
sns.set(style="ticks")

# Colour encodes the species (hue) and marker shape encodes the year (style).
sns.scatterplot(
    data=finches,
    x='BeakLength',
    y='BeakDepth',
    hue='Species',
    style='Year',
).set(xlabel='Beak Length', ylabel='Beak Depth');

# Optional: to pin the legend in a corner, call plt.legend(loc="upper left").


# Histogram

In the previous activity, you saw the relationship between the finches' beak length and depth measurements in a scatter plot. Let's now look at the distribution of the beak length measurements with a histogram. A histogram provides a quick visual insight into how a data set is distributed. The range of possible values is divided into intervals, or bins, and each bar shows how many observations fall within that interval.

In [None]:
# Histogram of all beak length measurements, drawn with seaborn's histplot.

# seaborn styles to choose from: 'darkgrid', 'whitegrid', 'dark', 'white', 'ticks'
sns.set(style="ticks")

binwidth = .75

# Series.min()/.max() skip missing values and are vectorised, unlike the
# built-in min()/max(), whose result is unreliable if a NaN is present.
lower = finches['BeakLength'].min()
upper = finches['BeakLength'].max()

# Bin edges every `binwidth` starting at the smallest value. np.arange excludes
# its stop value, so the stop is padded by 1 to guarantee an edge beyond `upper`.
bins_finches = np.arange(lower, upper+1, binwidth)

sns.histplot(finches["BeakLength"],
             bins=bins_finches,
             kde=False,
             color='skyblue')

# Put a tick, labelled to two decimals, on every bin edge.
xtick_labels = [f'{label:.2f}' for label in bins_finches]
plt.xticks(bins_finches, xtick_labels, rotation=30)
plt.xlabel("Beak Length")
plt.ylabel("Frequency")
plt.title("Histogram");

This histogram appears to have two peaks, so we can say this data is bimodal. The data plotted is for the two species. Is that why there are two peaks in the graph?

# Creating histograms side by side

Create a plot with two histograms in the same axes, one for the scandens beak length measurements and one for the fortis beak length measurements.

In [None]:
sns.set(style="ticks")

binwidth = .75

# Bin edges are computed from the full data set so both species share the same
# bins. Series.min()/.max() skip missing values, unlike the built-in min()/max().
lower = finches['BeakLength'].min()
upper = finches['BeakLength'].max()

# np.arange excludes its stop value; padding by 1 guarantees an edge beyond `upper`.
bins_finches = np.arange(lower, upper+1, binwidth)

# Two histograms drawn on the same axes, one per species.
sns.histplot(scandens['BeakLength'], bins=bins_finches, kde=False, label='scandens', color='blue')

sns.histplot(fortis['BeakLength'], bins=bins_finches, kde=False, label='fortis', color='orange')

# Put a tick, labelled to two decimals, on every bin edge.
xtick_labels = [f'{label:.2f}' for label in bins_finches]
plt.xticks(bins_finches, xtick_labels, rotation=90)
plt.xlabel('Beak Length')
plt.ylabel('Frequency')
plt.title('Histogram of Beak Lengths for scandens and fortis')
plt.legend();

It's no surprise that beak length measurements are different for the two species. Specifically, they are larger for the scandens species than for the fortis species. The scandens, or cactus finch, primarily feeds on cactus flowers, fruits and seeds, which require a longer beak to reach. In contrast, the fortis, or ground finch, feeds on seeds, which require a shorter but wider beak to break them open.

# Normalizing Data Sets

There are more fortis than scandens observations, which makes it seem like the fortis data is more significant than the scandens data. You can remove this bias by normalizing your data so that instead of looking at the total number of observations, you look at the relative number of observations within each bin.

In [None]:
binwidth = .75

# Shared bin edges from the full data set. Series.min()/.max() skip missing
# values, unlike the built-in min()/max().
lower = finches['BeakLength'].min()
upper = finches['BeakLength'].max()

# np.arange excludes its stop value; padding by 1 guarantees an edge beyond `upper`.
bins_finches = np.arange(lower, upper+1, binwidth)

# stat='density' normalises each histogram separately, so the two species can
# be compared even though they have different numbers of observations.
sns.histplot(scandens['BeakLength'],
             bins=bins_finches,
             kde=False,
             label='scandens',
             color='blue',
             stat='density')

sns.histplot(fortis['BeakLength'],
             bins=bins_finches,
             kde=False,
             label='fortis',
             color='orange',
             stat='density')

# Put a tick, labelled to two decimals, on every bin edge.
xtick_labels = [f'{label:.2f}' for label in bins_finches]
plt.xticks(bins_finches, xtick_labels, rotation=30)
plt.xlabel('Beak Length')
plt.ylabel('Relative Frequency')
plt.title('Histogram of Beak Lengths for scandens and fortis')
plt.legend();

# It's clear that the scandens and fortis have different beak lengths.

### Question: Has the scandens beak length changed (evolved) between 1975 and 2012? If there is a change, is it statistically significant?

Try creating two histograms for the scandens' beak length measurements, one for 1975 and another for 2012.

In [None]:
binwidth = 0.50

# Beak lengths of the scandens finches for each survey year. Year is compared
# as text so the selection works whether the column holds integers, strings or
# a categorical; comparing an integer-backed category with the string "1975"
# would silently select no rows.
sc1975 = scandens.loc[scandens['Year'].astype(str) == "1975", 'BeakLength']

sc2012 = scandens.loc[scandens['Year'].astype(str) == "2012", 'BeakLength']

# Shared bin edges over the scandens range. Series.min()/.max() skip missing
# values; np.arange excludes its stop, so it is padded by 1.
bins_sc = np.arange(scandens['BeakLength'].min(), scandens['BeakLength'].max()+1, binwidth)

# Put a tick, labelled to two decimals, on every bin edge.
xtick_labels = [f'{label:.2f}' for label in bins_sc]

plt.xticks(bins_sc, xtick_labels, rotation=90)

# stat='density' normalises each year separately so the two can be compared.
sns.histplot(sc1975, bins=bins_sc, kde=False, label ='scandens-1975', color='blue', stat='density')

sns.histplot(sc2012, bins=bins_sc, kde=False, label='scandens-2012', color='orange', stat='density')

plt.xlabel('Beak Length')
plt.ylabel('Relative Frequency')
plt.title('BeakLength comparison of scandens in 1975 and scandens in 2012')
plt.legend();

From the histograms of the scandens' beak length measurements, it appears that the length in 1975 is centered on approximately 14.12 mm, whereas the length in 2012 is centered on approximately 13.42 mm. Is this difference statistically significant?

In [None]:
# Sample mean of the scandens beak length in each survey year.
print(f"Mean beak length of scandens in 1975 = {sc1975.mean()}\n")
print(f"Mean beak length of scandens in 2012 = {sc2012.mean()}")

# Boxplot

Use seaborn to make a boxplot of the finches data set

In [None]:
# Axes-level boxplot: one group of boxes per species on the x axis, split by
# year through `hue`; the Axes that was drawn on is kept in `ax`.
ax = sns.boxplot(data=finches, 
                 x='Species', 
                 y='BeakLength', 
                 hue='Year')

In [None]:
# The same boxplot requested through catplot with kind='box'; the object
# returned by catplot is kept in `g`.
g = sns.catplot(data=finches, 
                kind='box', 
                x='Species', 
                y='BeakLength', 
                hue='Year')

## Making horizontal/vertical boxplots for all numerical variables with no label (category)

orient = "v" -- vertical boxplot(default),  

orient = "h" -- horizontal boxplot

In [None]:
# Boxplot of 1000 draws from a normal distribution (mean 39, standard deviation 2),
# using the default orientation.
sns.boxplot(data=np.random.normal(loc=39, scale=2, size=1000));

In [None]:
# Same kind of sample (a fresh set of 1000 normal draws), drawn as a horizontal box.
sns.boxplot(data=np.random.normal(loc=39, scale=2, size=1000), orient="h");

In [None]:
n = 1000

# Seeded generator: the synthetic sample, and every table or plot built from
# it, is identical on each Restart & Run All.
rng = np.random.default_rng(42)

# Three synthetic numeric variables of length n:
#   age    - binomial(100 trials, p = 0.25)
#   weight - uniform on [100, 200), rounded to 2 decimals
#   height - normal(mean 70, standard deviation 12), rounded to 2 decimals
rand_dict = {"age": list(rng.binomial(100, 0.25, n)),
             "weight": list(np.around(rng.uniform(low=100, high=200, size=n), 2)),
             "height": list(np.around(rng.normal(70, 12.0, size=n), 2))}

In [None]:
# Turn the dictionary of lists into a DataFrame: one column per key.
rand_data = pd.DataFrame(rand_dict)
rand_data

In [None]:
# Passing the whole frame without x/y draws one box per column; horizontal layout.
sns.boxplot(data=rand_data, orient="h");

In [None]:
# Same boxes, vertical layout.
sns.boxplot(data=rand_data, orient="v");

In [None]:
# Restrict the plot to selected columns by subsetting the frame first.
sns.boxplot(data=rand_data[['age', 'height']]);

In [None]:
# A single variable can be selected by name instead:
# use x=var_name with orient="h", or y=var_name with orient="v".

sns.boxplot(data=rand_data, y='weight', orient="v");

In [4]:
# Read the survey data from BREXIT.csv (path relative to the working directory).
brexit = pd.read_csv("BREXIT.csv")

In [5]:
# Display the table (pandas truncates the middle rows of long frames).
brexit

Unnamed: 0.1,Unnamed: 0,intended vote,education,age
0,0,leave,3.0,60
1,1,leave,,56
2,2,stay,5.0,73
3,3,leave,4.0,64
4,4,don't know,2.0,68
...,...,...,...,...
30890,30890,stay,4.0,21
30891,30891,leave,,18
30892,30892,stay,,61
30893,30893,stay,3.0,18


In [6]:
# Frequency of each answer in the 'intended vote' column, most common first.
brexit["intended vote"].value_counts()

intended vote
stay          14352
leave         13692
don't know     2314
won't vote      537
Name: count, dtype: int64

In [7]:
# Number of missing values in each column, sorted from fewest to most.
brexit.isna().sum().sort_values()

Unnamed: 0          0
intended vote       0
age                 0
education        3425
dtype: int64

In [8]:
# Numeric indicator of the intended vote: 1.0 for "leave", 0.0 for "stay".
# Series.map leaves every answer that is not in the dictionary ("don't know",
# "won't vote") as NaN and returns a float column directly, without relying on
# replace() silently downcasting a mixed object column.
brexit["leave"] = brexit["intended vote"].map({"leave": 1.0, "stay": 0.0})

In [9]:
# Display the table again to check the new 'leave' column.
brexit

Unnamed: 0.1,Unnamed: 0,intended vote,education,age,leave
0,0,leave,3.0,60,1.0
1,1,leave,,56,1.0
2,2,stay,5.0,73,0.0
3,3,leave,4.0,64,1.0
4,4,don't know,2.0,68,
...,...,...,...,...,...
30890,30890,stay,4.0,21,0.0
30891,30891,leave,,18,1.0
30892,30892,stay,,61,0.0
30893,30893,stay,3.0,18,0.0


In [10]:
# Mean age for each value of 'leave'; dropna=False keeps the rows whose
# 'leave' is missing as a group of their own (shown as NaN).
brexit.groupby("leave", dropna=False)["age"].mean()

leave
0.0    47.248815
1.0    55.254674
NaN    46.743949
Name: age, dtype: float64

In [11]:
# Missing values per column again, now including the new 'leave' column.
brexit.isna().sum().sort_values()

Unnamed: 0          0
intended vote       0
age                 0
leave            2851
education        3425
dtype: int64

In [12]:
# Mean age for every (leave, education) combination.
# dropna=False keeps the missing-value groups as their own row and column;
# margins=True appends the 'All' row and column. `margins` is a boolean flag:
# a string such as "True" only worked because any non-empty string is truthy.
brexit.pivot_table(index="leave",
                   columns="education",
                   values="age",
                   dropna=False,
                   aggfunc="mean",
                   margins=True)

education,1.0,2.0,3.0,4.0,5.0,NaN,All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,61.02008,53.175269,41.129728,46.902319,46.452055,51.580146,47.248815
1.0,62.813422,56.915289,51.630168,53.662702,51.562599,56.448891,55.254674
,57.937173,48.915873,42.343805,45.801724,45.898204,46.579498,
All,61.921271,54.902958,45.73581,49.214125,47.613872,,50.750251


In [13]:
# Share of "leave" among the rows with a non-missing 'leave' value:
# missing values are skipped in both the sum and the count.
brexit['leave'].mean(skipna=True)

0.48823277706461277

In [14]:
# Share of "leave" among ALL rows: the missing values are skipped in the sum,
# but the denominator is the full number of rows in the table.
brexit['leave'].sum(skipna=True) / len(brexit)

0.4431785078491665

In [15]:
# Number of rows for every (leave, education) combination, with 'All' totals.
# dropna=True: rows with a missing 'leave' or 'education' do not appear,
# so the grand total is smaller than the number of rows in the table.
brexit.pivot_table(index="leave", 
                   columns="education", 
                   values='intended vote', 
                   aggfunc='count', 
                   dropna=True, 
                   margins=True)

education,1.0,2.0,3.0,4.0,5.0,All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,498,1763,3014,6081,1898,13254
1.0,1356,3388,2685,3783,631,11843
All,1854,5151,5699,9864,2529,25097


In [16]:
# The same count table with dropna=False: missing 'leave' / 'education' values
# get their own row and column, and the grand total equals the number of rows.
brexit.pivot_table(index="leave", 
                   columns="education", 
                   values='age', 
                   aggfunc='count', 
                   dropna=False, 
                   margins=True)

education,1.0,2.0,3.0,4.0,5.0,NaN,All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,498,1763,3014,6081,1898,1098.0,14352.0
1.0,1356,3388,2685,3783,631,1849.0,13692.0
,191,630,573,812,167,478.0,
All,2045,5781,6272,10676,2696,,30895.0


## Contingency Tables

A useful way to summarize two categorical variables is a contingency table -- a table of counts by category. We can also use the _pandas_ `cut` method to split the dataset into bins. There are a number of arguments for the cut method. The following code creates an age group for the BREXIT data set. The method `value_counts` returns a frequency table.

In [17]:
# Bin the ages into hand-picked intervals; each interval excludes its left
# edge and includes its right edge, e.g. (0, 30].
# Alternative: pd.cut(brexit['age'], 25) asks for 25 equal-width bins instead.
# Note: despite its name, num_bins holds the interval assigned to each row,
# not a number of bins.

num_bins = pd.cut(brexit['age'], [0, 30, 55, 70, 80, 90, 100, 120])

In [18]:
# Categorical Series giving the age interval of every respondent.
num_bins

0        (55, 70]
1        (55, 70]
2        (70, 80]
3        (55, 70]
4        (55, 70]
           ...   
30890     (0, 30]
30891     (0, 30]
30892    (55, 70]
30893     (0, 30]
30894     (0, 30]
Name: age, Length: 30895, dtype: category
Categories (7, interval[int64, right]): [(0, 30] < (30, 55] < (55, 70] < (70, 80] < (80, 90] < (90, 100] < (100, 120]]

In [19]:
# Frequency table of the age intervals; sort_index orders the rows by interval
# rather than by count.
print(num_bins.value_counts().sort_index()) # This is a frequency table for the Series object num_bins

age
(0, 30]        4844
(30, 55]      11933
(55, 70]      11057
(70, 80]       2762
(80, 90]        295
(90, 100]         4
(100, 120]        0
Name: count, dtype: int64


In [20]:
# Name the interval Series so that it becomes a column called 'AgeGroup'.
num_bins.name = 'AgeGroup'

# Append it to the survey table column-wise and order the rows by age.
df = pd.concat([brexit, num_bins], axis=1).sort_values(by='age')
df

Unnamed: 0.1,Unnamed: 0,intended vote,education,age,leave,AgeGroup
29376,29376,leave,1.0,18,1.0,"(0, 30]"
27005,27005,stay,2.0,18,0.0,"(0, 30]"
28444,28444,stay,3.0,18,0.0,"(0, 30]"
28468,28468,leave,3.0,18,1.0,"(0, 30]"
13900,13900,stay,3.0,18,0.0,"(0, 30]"
...,...,...,...,...,...,...
29885,29885,leave,1.0,90,1.0,"(80, 90]"
18018,18018,don't know,1.0,91,,"(90, 100]"
9395,9395,stay,3.0,91,0.0,"(90, 100]"
8616,8616,leave,3.0,95,1.0,"(90, 100]"


In [21]:
# Contingency table of counts: 'leave' (rows) by age group (columns), with
# 'All' totals. dropna=True leaves out rows with a missing 'leave' and columns
# whose entries are all missing.
pd.crosstab(df['leave'], df['AgeGroup'], dropna=True, margins=True)

AgeGroup,"(0, 30]","(30, 55]","(55, 70]","(70, 80]","(80, 90]","(90, 100]",All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,3250,5615,4382,989,115,1,14352
1.0,1086,4918,5877,1645,164,2,13692
All,4336,10533,10259,2634,279,3,28044


In [22]:
# The same contingency table with dropna=False: the missing 'leave' answers
# and the empty (100, 120] age group are kept.
pd.crosstab(df['leave'], df['AgeGroup'], dropna=False, margins=True)

AgeGroup,"(0, 30]","(30, 55]","(55, 70]","(70, 80]","(80, 90]","(90, 100]","(100, 120]",All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,508,1400,798,128,16,1,0,
0.0,3250,5615,4382,989,115,1,0,14352.0
1.0,1086,4918,5877,1645,164,2,0,13692.0
All,4844,11933,11057,2762,295,4,0,30895.0


In [23]:
# Counts of 'leave' by age group, missing values dropped, with 'All' totals.
freq_drop = pd.crosstab(index=df['leave'], columns=df['AgeGroup'], margins=True, dropna=True)

# Relative frequencies: every count divided by the grand total in the ('All', 'All') cell.
rel_freq_drop = freq_drop.div(freq_drop.loc['All', 'All'])
rel_freq_drop

AgeGroup,"(0, 30]","(30, 55]","(55, 70]","(70, 80]","(80, 90]","(90, 100]",All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0.115889,0.200221,0.156254,0.035266,0.004101,3.6e-05,0.511767
1.0,0.038725,0.175367,0.209564,0.058658,0.005848,7.1e-05,0.488233
All,0.154614,0.375588,0.365818,0.093924,0.009949,0.000107,1.0


In [24]:
# Counts of 'leave' by age group, missing values kept, with 'All' totals.
freq_nodrop = pd.crosstab(index=df['leave'], columns=df['AgeGroup'], margins=True, dropna=False)

# Relative frequencies: every count divided by the grand total in the ('All', 'All') cell.
rel_freq_nodrop = freq_nodrop.div(freq_nodrop.loc['All', 'All'])
rel_freq_nodrop

AgeGroup,"(0, 30]","(30, 55]","(55, 70]","(70, 80]","(80, 90]","(90, 100]","(100, 120]",All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,0.016443,0.045315,0.025829,0.004143,0.000518,3.2e-05,0.0,
0.0,0.105195,0.181745,0.141835,0.032012,0.003722,3.2e-05,0.0,0.464541
1.0,0.035151,0.159184,0.190225,0.053245,0.005308,6.5e-05,0.0,0.443179
All,0.156789,0.386244,0.35789,0.0894,0.009548,0.000129,0.0,1.0


In [25]:
# Count table with missing values dropped (the same call as shown earlier),
# displayed again for comparison with the relative frequencies.
pd.crosstab(df['leave'], df['AgeGroup'], dropna=True, margins=True)

AgeGroup,"(0, 30]","(30, 55]","(55, 70]","(70, 80]","(80, 90]","(90, 100]",All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,3250,5615,4382,989,115,1,14352
1.0,1086,4918,5877,1645,164,2,13692
All,4336,10533,10259,2634,279,3,28044


In [26]:
# Relative-frequency table of 'leave' by education level, missing values kept:
# build the count table with 'All' totals, then divide every cell by the grand
# total found in its ('All', 'All') cell.
freq_table = pd.crosstab(
    index=brexit['leave'],
    columns=brexit['education'],
    margins=True,
    dropna=False,
).pipe(lambda counts: counts / counts.loc['All', 'All'])
freq_table

education,1.0,2.0,3.0,4.0,5.0,NaN,All
leave,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0.016119,0.057064,0.097556,0.196828,0.061434,0.03554,0.464541
1.0,0.043891,0.109662,0.086907,0.122447,0.020424,0.059848,0.443179
,0.006182,0.020392,0.018547,0.026283,0.005405,0.015472,
All,0.066192,0.187118,0.20301,0.345558,0.087263,0.0,1.0
