In [37]:
import pandas as pd
import numpy as np

## Analysis
### 4a) Total Articles per Population (Articles per Capita) by State
In this section, we calculate the total number of Wikipedia articles per capita for each state.

First, we read the CSV file containing the necessary data:

In [38]:
# Load the scored city-article CSV into a DataFrame and display it.
df_consolidated = pd.read_csv(
    '../data/wp_scored_city_articles_by_state.csv'
)
df_consolidated

Unnamed: 0,state,regional_division,article_title,revision_id,article_quality,population
0,alabama,East South Central,"Abbeville, Alabama",1171163550,C,5074296.0
1,alabama,East South Central,"Adamsville, Alabama",1177621427,C,5074296.0
2,alabama,East South Central,"Addison, Alabama",1168359898,C,5074296.0
3,alabama,East South Central,"Akron, Alabama",1165909508,GA,5074296.0
4,alabama,East South Central,"Alabaster, Alabama",1179139816,C,5074296.0
...,...,...,...,...,...,...
21503,wyoming,Mountain,"Wright, Wyoming",1166334449,GA,581381.0
21504,wyoming,Mountain,"Yoder, Wyoming",1171182284,C,581381.0
21505,missouri,West North Central,"Jennings, Missouri",1165751175,C,6177957.0
21506,pennsylvania,Middle Atlantic,"Jefferson Township, Greene County, Pennsylvania",1171582274,C,12972008.0


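Next, we compute the articles per capita for each state: we reduce the per-article rows to a single population figure per state, count the articles per state, and divide the article count by the population. States whose population sums to zero are excluded, because a per-capita value cannot be computed for them: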



In [39]:
# State-level coverage: number of articles per resident for every state.
#
# Each article row of df_consolidated carries a population value for its state,
# so keep a single row per (state, regional_division) pair before summing;
# otherwise the population would be added once per article.
df1 = df_consolidated.drop_duplicates(subset=['state', 'regional_division'], keep='last')

# Population of each state. A missing (NaN) population sums to 0 here.
state_pop = df1.groupby('state')['population'].sum().reset_index()

# Number of articles per state, counted on the full (non-deduplicated) frame.
state_article_cnt = df_consolidated[['state', 'article_title']].groupby('state').count().reset_index()

# Rename by column name rather than by position so the result does not depend
# on the column order produced by the merge.
total_articles_state = (
    state_pop.merge(state_article_cnt, on='state')
    .rename(columns={'article_title': 'article_count'})
)
total_articles_state['article_count'] = total_articles_state['article_count'].astype('int')

# Exclude states whose population is zero (the original author noted 6 such
# states) BEFORE dividing. Filtering on the population itself also removes a
# 0 / 0 = NaN row, which a check against np.inf after the division would miss.
total_articles_state = total_articles_state[total_articles_state['population'] > 0].reset_index(drop=True)
total_articles_state['articles_per_capita'] = (
    total_articles_state['article_count'] / total_articles_state['population']
)

print('On a state level, the dataframe returns the below number of rows')
print(total_articles_state['state'].nunique())
total_articles_state.head()

On a state level, the dataframe returns the below number of rows
37


Unnamed: 0,state,population,article_count,articles_per_capita
0,alabama,5074296.0,461,9.1e-05
1,alaska,733583.0,148,0.000202
2,arizona,7359197.0,91,1.2e-05
3,arkansas,3045637.0,500,0.000164
4,california,39029342.0,482,1.2e-05


We also analyze the data by regional divisions to understand the distribution at a broader level:


In [40]:

# Population per regional division: keep one row for each distinct
# (state, population) pair, then total the populations within each division.
state_level_rows = df_consolidated.drop_duplicates(subset=['state', 'population'])
division_totals = state_level_rows.groupby('regional_division')['population'].sum()
division_pop = division_totals.reset_index()

# Second name bound to the same frame; the high-quality division cell reads it
# back as df_pop_division.
df_pop_division = division_pop
division_pop

Unnamed: 0,regional_division,population
0,East North Central,47097779.0
1,East South Central,19578002.0
2,Middle Atlantic,12972008.0
3,Mountain,23400976.0
4,New England,9014378.0
5,Pacific,53229044.0
6,South Atlantic,38111498.0
7,West North Central,18032808.0
8,West South Central,41685250.0


In [41]:
# Division-level coverage: same calculation as the state-level one, but
# grouped by regional division. Population per division comes from division_pop.

# Count the articles that fall in each regional division.
division_article_cnt = (
    df_consolidated
    .groupby('regional_division')[['article_title']]
    .count()
    .reset_index()
)

# Attach the counts to the division populations and label the three columns.
total_articles_division = (
    division_pop
    .merge(division_article_cnt, on='regional_division')
    .set_axis(['regional_division', 'population', 'article_count'], axis=1)
)
total_articles_division['articles_per_capita'] = (
    total_articles_division['article_count'].div(total_articles_division['population'])
)

print('On a regional division level, the dataframe returns the below number of rows')
n_divisions = len(total_articles_division['regional_division'].unique())
print(n_divisions)
total_articles_division.head()

On a regional division level, the dataframe returns the below number of rows
9


Unnamed: 0,regional_division,population,article_count,articles_per_capita
0,East North Central,47097779.0,4753,0.000101
1,East South Central,19578002.0,1527,7.8e-05
2,Middle Atlantic,12972008.0,2556,0.000197
3,Mountain,23400976.0,1081,4.6e-05
4,New England,9014378.0,1163,0.000129


### 4b) High Quality Articles per Population
This section focuses on calculating the number of high-quality articles per capita for each state. The dataset is filtered to include only articles tagged with "FA" (Featured Article) or "GA" (Good Article) in the "article_quality" column. Similar to 4a, the analysis is conducted at both the state and regional division levels.

In [42]:
# High-quality coverage per state: only articles whose article_quality is
# 'FA' or 'GA' are counted. article_count and articles_per_capita are then
# computed per state in the same way as for total coverage.

# One row per (state, regional_division) so each state's population is summed once.
df3 = df_consolidated.drop_duplicates(subset=['state', 'regional_division'], keep='last')
state_pop = df3.groupby('state')['population'].sum().reset_index()

# Keep only the high-quality articles and count them per state. The filtered
# frame gets its own name instead of temporarily reusing hq_state_df.
hq_articles = df_consolidated[df_consolidated['article_quality'].isin(['FA', 'GA'])]
state_count = hq_articles[['state', 'article_title']].groupby('state').count().reset_index()

# Left-merge so that a state with no FA/GA article stays in the table with a
# count of 0 (and therefore ranks at the bottom) instead of silently vanishing.
hq_state_df = (
    state_pop.merge(state_count, on='state', how='left')
    .rename(columns={'article_title': 'article_count'})
)
hq_state_df['article_count'] = hq_state_df['article_count'].fillna(0).astype('int')

# Exclude states whose population is zero BEFORE dividing; this also avoids a
# 0 / 0 = NaN row that a check against np.inf after the division would miss.
hq_state_df = hq_state_df[hq_state_df['population'] > 0].reset_index(drop=True)
hq_state_df['articles_per_capita'] = hq_state_df['article_count'] / hq_state_df['population']

print('On a state level, the high quality dataframe returns the below number of rows')
print(hq_state_df['state'].nunique())
hq_state_df.head()

On a state level, the high quality dataframe returns the below number of rows
37


Unnamed: 0,state,population,article_count,articles_per_capita
0,alabama,5074296.0,53,1e-05
1,alaska,733583.0,31,4.2e-05
2,arizona,7359197.0,24,3e-06
3,arkansas,3045637.0,72,2.4e-05
4,california,39029342.0,173,4e-06


#### By regional division

In [43]:
# High-quality coverage per regional division: only articles whose
# article_quality is 'FA' or 'GA' are counted, then grouped by division.

# Population per division, as computed in the earlier division-population cell.
division_pop = df_pop_division

# Keep only the high-quality articles and count them per division. The filtered
# frame gets its own name instead of temporarily reusing hq_division_df.
hq_division_articles = df_consolidated[df_consolidated['article_quality'].isin(['FA', 'GA'])]
division_count = (
    hq_division_articles[['regional_division', 'article_title']]
    .groupby('regional_division')
    .count()
    .reset_index()
)

# Left-merge so that a division with no FA/GA article is kept with a count of 0
# rather than being dropped from the ranking.
hq_division_df = (
    division_pop.merge(division_count, on='regional_division', how='left')
    .rename(columns={'article_title': 'article_count'})
)
hq_division_df['article_count'] = hq_division_df['article_count'].fillna(0).astype('int')
hq_division_df['articles_per_capita'] = hq_division_df['article_count'] / hq_division_df['population']

print('On a regional division level, the high quality dataframe returns the below number of rows')
print(len(hq_division_df['regional_division'].unique()))
hq_division_df.head()

On a regional division level, the high quality dataframe returns the below number of rows
9


Unnamed: 0,regional_division,population,article_count,articles_per_capita
0,East North Central,47097779.0,717,1.5e-05
1,East South Central,19578002.0,316,1.6e-05
2,Middle Atlantic,12972008.0,566,4.4e-05
3,Mountain,23400976.0,304,1.3e-05
4,New England,9014378.0,150,1.7e-05


# Step 5: Results
This section presents the results of the analysis in the form of data tables. It includes the top 10 and bottom 10 US states based on total articles per capita and high-quality articles per capita. Furthermore, it provides the rank-ordered list of US census divisions by total articles per capita and high-quality articles per capita.

The code displays the results as lists of states or census divisions, based on the calculated metrics. Adjustments to the dataset or specific data attributes may be required based on the actual data being used.

1. Top 10 US states by coverage: Displays the top 10 US states with the highest total articles per capita in descending order.
2. Bottom 10 US states by coverage: Displays the 10 US states with the lowest total articles per capita in ascending order.
3. Top 10 US states by high quality: Displays the 10 US states with the highest high-quality articles per capita in descending order.
4. Bottom 10 US states by high quality: Displays the 10 US states with the lowest high-quality articles per capita in ascending order.
5. Census divisions by total coverage: Displays a rank-ordered list of US census divisions by total articles per capita in descending order.
6. Census divisions by high quality coverage: Displays a rank-ordered list of US census divisions by high-quality articles per capita in descending order.




### 1. Top 10 US states by coverage: The 10 US states with the highest total articles per capita (in descending order)


In [44]:
# Order states from highest to lowest articles per capita and keep the first ten.
top10_state = total_articles_state.sort_values('articles_per_capita', ascending=False)
top10_state = top10_state.head(10).reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
top10_state.index = top10_state.index + 1
top10_state['state']

1          vermont
2            maine
3             iowa
4           alaska
5     pennsylvania
6         michigan
7          wyoming
8         arkansas
9         missouri
10       minnesota
Name: state, dtype: object

### 2. Bottom 10 US states by coverage: The 10 US states with the lowest total articles per capita (in ascending order) 

In [45]:
# Order states from lowest to highest articles per capita and keep the first ten.
bottom10_state = total_articles_state.sort_values('articles_per_capita')
bottom10_state = bottom10_state.iloc[:10].reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
bottom10_state.index = bottom10_state.index + 1
bottom10_state['state']

1         nevada
2     california
3        arizona
4       virginia
5        florida
6       oklahoma
7         kansas
8       maryland
9      wisconsin
10    washington
Name: state, dtype: object

### 3. Top 10 US states by high quality: The 10 US states with the highest high quality articles per capita (in descending order)

In [46]:
# Order states from highest to lowest high-quality articles per capita and
# keep the first ten.
top10_hq_state = hq_state_df.sort_values('articles_per_capita', ascending=False)
top10_hq_state = top10_hq_state.head(10).reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
top10_hq_state.index = top10_hq_state.index + 1
top10_hq_state['state']

1          vermont
2          wyoming
3          montana
4     pennsylvania
5         missouri
6           alaska
7           oregon
8             iowa
9            maine
10       minnesota
Name: state, dtype: object

### 4. Bottom 10 US states by high quality: The 10 US states with the lowest high quality articles per capita (in ascending order)

In [47]:
# Order states from lowest to highest high-quality articles per capita and
# keep the first ten.
bottom10_hq_state = hq_state_df.sort_values('articles_per_capita')
bottom10_hq_state = bottom10_hq_state.iloc[:10].reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
bottom10_hq_state.index = bottom10_hq_state.index + 1
bottom10_hq_state['state']

1          virginia
2            nevada
3           arizona
4        california
5           florida
6          maryland
7            kansas
8          oklahoma
9     massachusetts
10        louisiana
Name: state, dtype: object

### 5. Census divisions by total coverage: A rank ordered list of US census divisions (in descending order) by total articles per capita

In [48]:
# Rank every regional division from highest to lowest articles per capita.
division_coverage = total_articles_division.sort_values('articles_per_capita', ascending=False)
division_coverage = division_coverage.reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
division_coverage.index = division_coverage.index + 1
division_coverage['regional_division']

1       Middle Atlantic
2    West North Central
3           New England
4    East North Central
5    East South Central
6    West South Central
7              Mountain
8               Pacific
9        South Atlantic
Name: regional_division, dtype: object

### 6. Census divisions by high quality coverage: Rank ordered list of US census divisions (in descending order) by high quality articles per capita

In [49]:
# Rank every regional division from highest to lowest high-quality articles
# per capita.
division_hq_coverage = hq_division_df.sort_values('articles_per_capita', ascending=False)
division_hq_coverage = division_hq_coverage.reset_index()
# Shift the fresh 0-based index so it reads as a 1-based rank.
division_hq_coverage.index = division_hq_coverage.index + 1
division_hq_coverage['regional_division']

1       Middle Atlantic
2    West North Central
3           New England
4    East South Central
5    East North Central
6    West South Central
7              Mountain
8               Pacific
9        South Atlantic
Name: regional_division, dtype: object