# Final jupyter notebook to display the resulting tables as required

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('wp_scored_city_articles_by_state.csv')

In [3]:
df.drop('Unnamed: 0', axis = 1 , inplace = True)

In [4]:
df.head()

Unnamed: 0,title,State,rev_id,Prediction,population,DIVISION
0,"Abbeville, Alabama",Alabama,1171164000.0,C,5074296.0,East South Central
1,"Adamsville, Alabama",Alabama,1177621000.0,C,5074296.0,East South Central
2,"Addison, Alabama",Alabama,1168360000.0,C,5074296.0,East South Central
3,"Akron, Alabama",Alabama,1165910000.0,GA,5074296.0,East South Central
4,"Alabaster, Alabama",Alabama,1179140000.0,C,5074296.0,East South Central


# RESULT 1

In [5]:
# Group the DataFrame by 'State' and count the number of titles for each state
state_title_counts = df.groupby('State')['title'].count().reset_index()

# Rename the columns for clarity
state_title_counts.columns = ['State', 'Total Titles']

# Display the result
state_title_counts.head()

Unnamed: 0,State,Total Titles
0,Alabama,461
1,Alaska,148
2,Arizona,91
3,Arkansas,500
4,California,479


In [6]:
unique_pop_state_pairs = df[['population', 'State']].drop_duplicates()

df_merged = pd.merge(state_title_counts, unique_pop_state_pairs, left_on = state_title_counts['State'],  
                  right_on = unique_pop_state_pairs['State'], how='left')
df_merged.drop(['key_0', 'State_y'], axis =1 , inplace = True)
df_merged['articles per capita'] = df_merged['Total Titles'] / df_merged['population']
df_merged = df_merged.sort_values(by='articles per capita' , ascending = False).reset_index(drop = True)
print('Top 10 US states by coverage: The 10 US states with the highest total articles per capita (in descending order)')
df_merged[0:10]

Top 10 US states by coverage: The 10 US states with the highest total articles per capita (in descending order)


Unnamed: 0,State_x,Total Titles,population,articles per capita
0,South Dakota,311,909824.0,0.000342
1,Iowa,1042,3200517.0,0.000326
2,Alaska,148,733583.0,0.000202
3,Pennsylvania,2554,12972008.0,0.000197
4,New Hampshire,234,1395231.0,0.000168
5,Arkansas,500,3045637.0,0.000164
6,Missouri,949,6177957.0,0.000154
7,Minnesota,853,5717184.0,0.000149
8,Michigan,1414,10034113.0,0.000141
9,Montana,128,1122867.0,0.000114


# RESULT 2

In [7]:
print('Bottom 10 US states by coverage: The 10 US states with the lowest total articles per capita (in ascending order) .')
df_merged[24:35].sort_values(by = 'articles per capita').reset_index(drop = True)

Bottom 10 US states by coverage: The 10 US states with the lowest total articles per capita (in ascending order) .


Unnamed: 0,State_x,Total Titles,population,articles per capita
0,Nevada,18,3177772.0,6e-06
1,California,479,39029342.0,1.2e-05
2,Arizona,91,7359197.0,1.2e-05
3,Florida,410,22244823.0,1.8e-05
4,Kansas,63,2937150.0,2.1e-05
5,New York,426,19677151.0,2.2e-05
6,Utah,95,3380800.0,2.8e-05
7,Rhode Island,39,1093734.0,3.6e-05
8,Oregon,161,4240137.0,3.8e-05
9,Texas,1221,30029572.0,4.1e-05


# RESULT 3

In [8]:
# Define a function to map Prediction values to quality
def map_quality(prediction):
    if prediction in ['FA', 'GA']:
        return 'High Quality'
    else:
        return 'Average'

# Apply the map_quality function to create a new 'Quality' column
df['Quality'] = df['Prediction'].apply(lambda x: map_quality(x))

In [9]:
df.head()

Unnamed: 0,title,State,rev_id,Prediction,population,DIVISION,Quality
0,"Abbeville, Alabama",Alabama,1171164000.0,C,5074296.0,East South Central,Average
1,"Adamsville, Alabama",Alabama,1177621000.0,C,5074296.0,East South Central,Average
2,"Addison, Alabama",Alabama,1168360000.0,C,5074296.0,East South Central,Average
3,"Akron, Alabama",Alabama,1165910000.0,GA,5074296.0,East South Central,High Quality
4,"Alabaster, Alabama",Alabama,1179140000.0,C,5074296.0,East South Central,Average


In [10]:
# Group the DataFrame by 'State' and count high quality and average titles
state_quality_counts = df.groupby('State')['Quality'].value_counts().unstack(fill_value=0)

# Display the result
state_quality_counts.head()

# Reset the index into a regular column
state_quality_counts.reset_index(drop=False, inplace=True)

#state_quality_counts.drop('level_0', axis = 1, inplace = True)
state_quality_counts.columns

Index(['State', 'Average', 'High Quality'], dtype='object', name='Quality')

In [11]:
# Display the DataFrame with the index as a column
state_quality_counts.head()

Quality,State,Average,High Quality
0,Alabama,408,53
1,Alaska,117,31
2,Arizona,67,24
3,Arkansas,428,72
4,California,307,172


In [12]:
unique_pop_state_pairs = df[['population', 'State']].drop_duplicates()

df_merged_2 = pd.merge(state_quality_counts, unique_pop_state_pairs, left_on = state_quality_counts['State'],  
                  right_on = unique_pop_state_pairs['State'], how='left')
df_merged_2.drop(['key_0', 'State_y'], axis = 1, inplace = True)

In [13]:
df_merged_2.head()

Unnamed: 0,State_x,Average,High Quality,population
0,Alabama,408,53,5074296.0
1,Alaska,117,31,733583.0
2,Arizona,67,24,7359197.0
3,Arkansas,428,72,3045637.0
4,California,307,172,39029342.0


In [14]:
df_merged_2['High Quality articles per capita'] = df_merged_2['High Quality'] / df_merged_2['population']
df_merged_2 = df_merged_2.sort_values(by='High Quality articles per capita' , ascending = False).reset_index(drop = True)

print('Top 10 US states by high quality: The 10 US states with the highest high quality articles per capita (in descending order) .')
df_merged_2[0:10]

Top 10 US states by high quality: The 10 US states with the highest high quality articles per capita (in descending order) .


Unnamed: 0,State_x,Average,High Quality,population,High Quality articles per capita
0,South Dakota,255,56,909824.0,6.2e-05
1,Montana,73,55,1122867.0,4.9e-05
2,New Hampshire,171,63,1395231.0,4.5e-05
3,Pennsylvania,1989,565,12972008.0,4.4e-05
4,Missouri,687,262,6177957.0,4.2e-05
5,Alaska,117,31,733583.0,4.2e-05
6,New Jersey,185,379,9261699.0,4.1e-05
7,Iowa,939,103,3200517.0,3.2e-05
8,Minnesota,685,168,5717184.0,2.9e-05
9,Delaware,32,25,1018396.0,2.5e-05


# RESULT 4

In [15]:
print('Bottom 10 US states by high quality: The 10 US states with the lowest high quality articles per capita (in ascending order).')

df_merged_2[24:35].sort_values(by = 'High Quality articles per capita').reset_index(drop = True)



Bottom 10 US states by high quality: The 10 US states with the lowest high quality articles per capita (in ascending order).


Unnamed: 0,State_x,Average,High Quality,population,High Quality articles per capita
0,Nevada,11,7,3177772.0,2e-06
1,Arizona,67,24,7359197.0,3e-06
2,New York,349,77,19677151.0,4e-06
3,California,307,172,39029342.0,4e-06
4,Florida,292,118,22244823.0,5e-06
5,Utah,75,20,3380800.0,6e-06
6,Kansas,41,22,2937150.0,7e-06
7,Georgia,443,93,10912876.0,9e-06
8,Michigan,1316,98,10034113.0,1e-05
9,Alabama,408,53,5074296.0,1e-05


# RESULT 5

In [16]:
# Group the DataFrame by 'DIVISION' and count the number of titles for each state
division_title_counts = df.groupby('DIVISION')['title'].count().reset_index()

# Rename the columns for clarity
division_title_counts.columns = ['DIVISION', 'Total Titles']

# Display the result
division_title_counts.head()

Unnamed: 0,DIVISION,Total Titles
0,East North Central,3276
1,East South Central,1408
2,Middle Atlantic,3544
3,Mountain,925
4,New England,273


In [17]:
unique_pop_state_div_pairs = df[['population', 'State', 'DIVISION']].drop_duplicates()
unique_pop_state_div_pairs.head()

Unnamed: 0,population,State,DIVISION
0,5074296.0,Alabama,East South Central
450,733583.0,Alaska,Pacific
600,7359197.0,Arizona,Mountain
699,3045637.0,Arkansas,West South Central
1199,39029342.0,California,Pacific


In [18]:
division_population = df.groupby('DIVISION')['population'].sum().reset_index()
division_population.head()

Unnamed: 0,DIVISION,population
0,East North Central,34373550000.0
1,East South Central,7016697000.0
2,Middle Atlantic,46736570000.0
3,Mountain,3479670000.0
4,New England,369139700.0


In [19]:
df_merged_3 = pd.merge(division_title_counts, division_population, left_on = division_title_counts['DIVISION'],  
                  right_on = division_population['DIVISION'], how='left')
df_merged_3.drop(['key_0', 'DIVISION_y'], axis = 1, inplace = True)


In [20]:
df_merged_3

Unnamed: 0,DIVISION_x,Total Titles,population
0,East North Central,3276,34373550000.0
1,East South Central,1408,7016697000.0
2,Middle Atlantic,3544,46736570000.0
3,Mountain,925,3479670000.0
4,New England,273,369139700.0
5,Pacific,938,19702320000.0
6,South Atlantic,1274,16459320000.0
7,West North Central,3218,14542570000.0
8,West South Central,1721,38188930000.0


In [21]:
df_merged_3['articles per capita'] = df_merged_3['Total Titles'] / df_merged_3['population']
df_merged_3 = df_merged_3.sort_values(by='articles per capita' , ascending = False).reset_index(drop = True)
print('Census divisions by total coverage: A rank ordered list of US census divisions (in descending order) by total articles per capita.')
df_merged_3

Census divisions by total coverage: A rank ordered list of US census divisions (in descending order) by total articles per capita.


Unnamed: 0,DIVISION_x,Total Titles,population,articles per capita
0,New England,273,369139700.0,7.395574e-07
1,Mountain,925,3479670000.0,2.658298e-07
2,West North Central,3218,14542570000.0,2.212813e-07
3,East South Central,1408,7016697000.0,2.006642e-07
4,East North Central,3276,34373550000.0,9.530585e-08
5,South Atlantic,1274,16459320000.0,7.740295e-08
6,Middle Atlantic,3544,46736570000.0,7.582927e-08
7,Pacific,938,19702320000.0,4.760861e-08
8,West South Central,1721,38188930000.0,4.506542e-08


# RESULT 6

In [22]:
# Group the DataFrame by 'State' and count high quality and average titles
division_quality_counts = df.groupby('DIVISION')['Quality'].value_counts().unstack(fill_value=0)

# Display the result
division_quality_counts.head()

# Reset the index into a regular column
division_quality_counts.reset_index(drop=False, inplace=True)

#state_quality_counts.drop('level_0', axis = 1, inplace = True)
division_quality_counts.columns

Index(['DIVISION', 'Average', 'High Quality'], dtype='object', name='Quality')

In [23]:
division_quality_counts.head()

Quality,DIVISION,Average,High Quality
0,East North Central,2859,417
1,East South Central,1113,295
2,Middle Atlantic,2523,1021
3,Mountain,671,254
4,New England,198,75


In [24]:
df_merged_4 = pd.merge(division_quality_counts, division_population, left_on = division_quality_counts['DIVISION'],  
                  right_on = division_population['DIVISION'], how='left')

In [25]:
df_merged_4['High Quality articles per capita'] = df_merged_4['High Quality'] / df_merged_4['population']
df_merged_4 = df_merged_4.sort_values(by='High Quality articles per capita' , ascending = False).reset_index(drop = True)
df_merged_4.drop(['key_0', 'DIVISION_y'], axis = 1, inplace = True)
print('Census divisions by high quality coverage: Rank ordered list of US census divisions (in descending order) by high quality articles per capita.')
df_merged_4

Census divisions by high quality coverage: Rank ordered list of US census divisions (in descending order) by high quality articles per capita.


Unnamed: 0,DIVISION_x,Average,High Quality,population,High Quality articles per capita
0,New England,198,75,369139700.0,2.031751e-07
1,Mountain,671,254,3479670000.0,7.299544e-08
2,East South Central,1113,295,7016697000.0,4.204257e-08
3,West North Central,2607,611,14542570000.0,4.201457e-08
4,Middle Atlantic,2523,1021,46736570000.0,2.184585e-08
5,South Atlantic,935,339,16459320000.0,2.059623e-08
6,Pacific,602,336,19702320000.0,1.705383e-08
7,West South Central,1163,558,38188930000.0,1.461157e-08
8,East North Central,2859,417,34373550000.0,1.213142e-08
