# Tidy in Pandas 

In [4]:
import pandas as pd 
import numpy as np

df = pd.read_pickle('../data/all_years.pkl')

print(np.shape(df))
df.head()

(963, 18)


In [5]:
# The structure of our data is wide, with separate columns for each month's circulation.
# Let's tidy this data by reshaping it into long format using the melt() function.

df_long = pd.melt(df, 
                  id_vars=['branch', 'address', 'city', 'zip code', 'ytd', 'year'],
                  value_vars=['january', 'february', 'march', 'april', 'may', 'june', 
                              'july', 'august', 'september', 'october', 'november', 'december'],
                  var_name='month', 
                  value_name='circulation')


print(np.shape(df_long))
# Check the structure of the reshaped data
df_long.head()

(11556, 8)


Unnamed: 0,branch,address,city,zip code,ytd,year,month,circulation
0,Albany Park,5150 N. Kimball Ave.,Chicago,60625.0,120059,2011,january,8427
1,Altgeld,13281 S. Corliss Ave.,Chicago,60827.0,9611,2011,january,1258
2,Archer Heights,5055 S. Archer Ave.,Chicago,60632.0,101951,2011,january,8104
3,Austin,5615 W. Race Ave.,Chicago,60644.0,25527,2011,january,1755
4,Austin-Irving,6100 W. Irving Park Rd.,Chicago,60634.0,165634,2011,january,12593


In [6]:
# List all the unique branch names in the reshaped DataFrame
df_long['branch'].unique()

array(['Albany Park', 'Altgeld', 'Archer Heights', 'Austin',
       'Austin-Irving', 'Avalon', 'Back of the Yards', 'Beverly',
       'Bezazian', 'Blackstone', 'Brainerd', 'Brighton Park',
       'Bucktown-Wicker Park', 'Budlong Woods', 'Canaryville',
       'Chicago Bee', 'Chicago Lawn', 'Chinatown', 'Clearing', 'Coleman',
       'Daley, Richard J. - Bridgeport', 'Daley, Richard M. - W Humboldt',
       'Douglass', 'Dunning', 'Edgebrook', 'Edgewater', 'Gage Park',
       'Galewood-Mont Clare', 'Garfield Ridge', 'Greater Grand Crossing',
       'Hall', 'Harold Washington Library Center', 'Hegewisch',
       'Humboldt Park', 'Independence', 'Jefferson Park', 'Jeffery Manor',
       'Kelly', 'King', 'Legler Regional', 'Lincoln Belmont',
       'Lincoln Park', 'Little Village', 'Logan Square', 'Lozano',
       'Manning', 'Mayfair', 'McKinley Park', 'Merlo', 'Mount Greenwood',
       'Near North', 'North Austin', 'North Pulaski', 'Northtown',
       'Oriole Park', 'Portage-Cragin', 'Pullma

In [13]:
import numpy as np

# Filter the rows where the circulation count is greater than 10,000
high_circulation = df_long[df_long['circulation'] > 10000]

# Calculate the total number of rows and the number of high-circulation rows
total_rows = np.shape(df_long)[0]
high_circulation_rows = np.shape(high_circulation)[0]

# Calculate the percentage of high-circulation rows
percentage_high_circulation = (high_circulation_rows / total_rows) * 100

# Print the percentage of high-circulation books
print(f"Percentage of high-circulation books: {percentage_high_circulation:.2f}%")

high_circulation.head()

Percentage of high-circulation books: 12.41%


Unnamed: 0,branch,address,city,zip code,ytd,year,month,circulation
4,Austin-Irving,6100 W. Irving Park Rd.,Chicago,60634.0,165634,2011,january,12593
12,Bucktown-Wicker Park,1701 N. Milwaukee Ave.,Chicago,60647.0,173396,2011,january,13113
13,Budlong Woods,5630 N. Lincoln Ave.,Chicago,60659.0,160271,2011,january,12841
17,Chinatown,2353 S. Wentworth Ave.,Chicago,60616.0,158449,2011,january,14027
24,Edgebrook,5331 W. Devon Ave.,Chicago,60646.0,129288,2011,january,10231


In [16]:
# Let's only display the branch and circulation columns
df_long[['branch', 'circulation']]

Unnamed: 0,branch,circulation
0,Albany Park,8427
1,Altgeld,1258
2,Archer Heights,8104
3,Austin,1755
4,Austin-Irving,12593
...,...,...
11551,Chinatown,3957
11552,Brainerd,201
11553,Brighton Park,1278
11554,South Chicago,615


In [17]:

# Sort the data by circulation in descending order
df_long.sort_values('circulation', ascending=False)


Unnamed: 0,branch,address,city,zip code,ytd,year,month,circulation
1957,Harold Washington Library Center,400 S. State St.,Chicago,60605.0,966720,2011,march,89122
2920,Harold Washington Library Center,400 S. State St.,Chicago,60605.0,966720,2011,april,88527
2999,Harold Washington Library Center,400 S. State St.,Chicago,60605.0,937649,2012,april,87689
6772,Harold Washington Library Center,400 S. State St.,Chicago,60605.0,966720,2011,august,85193
2036,Harold Washington Library Center,400 S. State St.,Chicago,60605.0,937649,2012,march,84255
...,...,...,...,...,...,...,...,...
3623,Portage-Cragin,5108 W. Belmont Ave.,Chicago,60641.0,36262,2020,april,0
3622,Manning,6 S. Hoyne Ave.,Chicago,60612.0,3325,2020,april,0
3621,"Daley, Richard J. - Bridgeport",3400 S. Halsted St.,Chicago,60608.0,37045,2020,april,0
3620,Canaryville,642 W. 43rd St.,Chicago,60609.0,4120,2020,april,0


In [18]:
# Group by branch and aggregate the total and mean circulation for each branch 

df_long.groupby('branch')['circulation'].agg(total_calculations='sum', mean_circulation='mean')


Unnamed: 0_level_0,total_calculations,mean_circulation
branch,Unnamed: 1_level_1,Unnamed: 2_level_1
Albany Park,1024714,7116.069444
Altgeld,68358,474.708333
Archer Heights,803014,5576.486111
Austin,200107,1389.631944
Austin-Irving,1359700,9442.361111
...,...,...
West Pullman,295327,2050.881944
West Town,922876,6408.861111
"Whitney M. Young, Jr.",259680,1803.333333
Woodson Regional,823793,5720.784722


In [19]:
# Group by branch and month, then aggregate the sum and mean of circulation 

df_long.groupby(['branch','month'])['circulation'].agg(['sum', 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean
branch,month,Unnamed: 2_level_1,Unnamed: 3_level_1
Albany Park,april,79599,6633.250000
Albany Park,august,91416,7618.000000
Albany Park,december,77849,6487.416667
Albany Park,february,76747,6395.583333
Albany Park,january,85952,7162.666667
...,...,...,...
Wrightwood-Ashburn,march,25817,2151.416667
Wrightwood-Ashburn,may,22049,1837.416667
Wrightwood-Ashburn,november,24124,2010.333333
Wrightwood-Ashburn,october,27345,2278.750000


In [20]:
# Now, create a new 'date' column by combining the 'year' and 'month' columns
df_long['date'] = df_long['year'] + '-' + df_long['month']

In [21]:
# Convert the 'date' column to a datetime object
df_long['date'] = pd.to_datetime(df_long['date'], format='%Y-%B')


In [22]:
# Check the data types of each column to ensure the date conversion worked
df_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11556 entries, 0 to 11555
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   branch       11556 non-null  object        
 1   address      7716 non-null   object        
 2   city         7716 non-null   object        
 3   zip code     7716 non-null   float64       
 4   ytd          11556 non-null  int64         
 5   year         11556 non-null  object        
 6   month        11556 non-null  object        
 7   circulation  11556 non-null  int64         
 8   date         11556 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 812.7+ KB


In [23]:
# Set the 'date' column as the index of the DataFrame
df_long.set_index('date', inplace=True)

In [24]:
# Save the tidied data as a pickle file for future use
df_long.to_pickle('../data/df_long.pkl')