In [18]:
import pandas as pd
import glob
import numpy as np

## Read files

In [21]:
# Read every interim per-wiki CSV and stack them into a single frame.
i_path = '../../data/raw/articles/2019/query_results/content_quality/per_wiki/'
# glob returns files in arbitrary, filesystem-dependent order. Sorting makes the
# row order reproducible, and with it the row that the later
# drop_duplicates(keep='first') retains.
# i_path already ends with '/', so the pattern needs no extra separator.
all_interim_files = sorted(glob.glob(i_path + "*.csv"))

li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_interim_files]
# pd.concat raises ValueError if no CSV files were found.
interim_frame = pd.concat(li, axis=0, ignore_index=True)

In [40]:
# Read every full per-wiki CSV and stack them into a single frame.
# Alternative location kept for reference:
#f_path = r'../../data/processed/query_results/content_quality/per_wiki/'
f_path = r'../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/'
# glob returns files in arbitrary, filesystem-dependent order. Sorting makes the
# row order reproducible, and with it the row that the later
# drop_duplicates(keep='first') retains.
# f_path already ends with '/', so the pattern needs no extra separator.
all_final_files = sorted(glob.glob(f_path + "*.csv"))
tu = [pd.read_csv(filename, index_col=None, header=0) for filename in all_final_files]
# pd.concat raises ValueError if no CSV files were found.
final_frame = pd.concat(tu, axis=0, ignore_index=True)

# Assess collected data

## Interim

In [22]:
# Remove the faulty 'views_1m' column.
# drop(..., errors='ignore') keeps this cell re-runnable: a second execution is
# a no-op, whereas `del interim_frame['views_1m']` raised KeyError.
interim_frame = interim_frame.drop(columns=['views_1m'], errors='ignore')

In [23]:
#interim_frame['at_edits'] = interim_frame['at_edits'].astype('bool')

In [24]:
# Distinct article_type values present in the interim frame
interim_frame['article_type'].unique()

array(['new', 'expanded'], dtype=object)

In [25]:
# Count rows that share (page_id, wikicode, page_title) with at least one other row;
# keep=False marks every member of a duplicate group, not just the repeats.
interim_frame.duplicated(
    subset=['page_id', 'wikicode', 'page_title'],
    keep=False,
).sum()

3459

In [26]:
# Drop duplicates: retain the first row of each (page_id, wikicode, page_title)
# group (keep='first' is the pandas default).
interim_frame = interim_frame.drop_duplicates(
    subset=['page_id', 'wikicode', 'page_title'],
)

#### translation column to boolean

In [27]:
# Distinct raw values in the at_edits column
interim_frame.at_edits.unique()

array(['0', "['contenttranslation', 'contenttranslation-v2']",
       "['contenttranslation']",
       "['mw-removed-redirect', 'contenttranslation', 'contenttranslation-v2']",
       0,
       "['انگریزی عنوان', 'contenttranslation', 'contenttranslation-v2']",
       "['campaign-external-machine-translation', 'contenttranslation', 'contenttranslation-v2']",
       "['अनावश्यक nowiki टॅग', 'contenttranslation', 'contenttranslation-v2']"],
      dtype=object)

In [28]:
# Distinct values in the at_create column
interim_frame['at_create'].unique()

array([0])

In [29]:
# Is every at_create value equal to zero?
interim_frame['at_create'].eq(0).all()

True

In [30]:
# Remove the at_create column.
# drop(..., errors='ignore') keeps this cell re-runnable: a second execution is
# a no-op, whereas `del interim_frame['at_create']` raised KeyError.
interim_frame = interim_frame.drop(columns=['at_create'], errors='ignore')

In [31]:
# Boolean flag: True where at_edits is a string longer than one character.
# Non-string entries yield NaN from .str.len(), which compares as False.
interim_frame['translation_tool'] = np.where(
    interim_frame['at_edits'].str.len().gt(1),
    True,
    False,
)

In [32]:
# Build the article URL and the XTools article-info URL columns.
# 'wiki' inside wikicode is rewritten to '.wiki', and appending 'pedia.org/'
# completes the host part shared by both URLs.
interim_host = interim_frame['wikicode'].replace({'wiki': '.wiki'}, regex=True) + 'pedia.org/'

# 'url' is assigned before 'url_article_info' so the column order is unchanged.
interim_frame['url'] = 'https://' + interim_host + 'wiki/' + interim_frame['page_title']
interim_frame['url_article_info'] = (
    'https://xtools.wmflabs.org/articleinfo/' + interim_host + interim_frame['page_title']
)

#interim_frame['url_page_info'] = interim_frame['url'] + '&action=info'

In [None]:
# Cast QID to int, then render it as a 'Q'-prefixed string.
interim_frame['QID'] = 'Q' + interim_frame['QID'].astype(int).astype(str)

In [38]:
# Write the cleaned interim frame as comma-separated UTF-8, without the index.
interim_frame.to_csv(
    "../../data/processed/query_results/content_quality/1_interim_frame_updated.csv",
    index=False,
    encoding='utf-8',
    sep=',',
)

## Full frame

In [42]:
# Look for duplicates: count rows sharing (page_id, wikicode, page_title) with another row
final_frame.duplicated(
    subset=['page_id', 'wikicode', 'page_title'],
    keep=False,
).sum()

3

In [52]:
# Count repeats of a (page_id, wikicode) pair; the first occurrence of each
# pair is not counted (keep='first' is the pandas default).
final_frame.duplicated(subset=['page_id', 'wikicode']).sum()

2

In [43]:
# Collect every row whose (page_id, wikicode) pair occurs more than once.
# A boolean mask replaces the earlier groupby + pd.concat, which raised
# ValueError ("No objects to concatenate") when the frame had no duplicates;
# this version yields an empty frame in that case.
final_frame_dupes = (
    final_frame[final_frame.duplicated(subset=['page_id', 'wikicode'], keep=False)]
    # groupby skipped rows with a missing key by default; mirror that here
    .dropna(subset=['page_id', 'wikicode'])
    # groupby emitted groups in key-sorted order; keep the rows key-sorted
    .sort_values(['page_id', 'wikicode'])
)

In [44]:
# How many distinct wikis do the duplicated articles belong to?
# dropna=False counts a missing value as its own entry, exactly as unique() does.
final_frame_dupes['wikicode'].nunique(dropna=False)

1

In [45]:
# Which wikis do the duplicated articles belong to?
final_frame_dupes['wikicode'].unique()

array(['mrwiki'], dtype=object)

In [47]:
#ff_pa = pd.read_csv('../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/pa_articles.csv')

In [48]:
# Load the mr_articles.csv file on its own for inspection
ff_mr = pd.read_csv(
    '../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/mr_articles.csv'
)

In [49]:
# Row count of the mr frame
ff_mr.shape[0]

122

In [51]:
#check to see that this mirrors or is in line with the number of approved articles
#len(ff_pa)

## Clean the full frame

In [54]:
# Remove the faulty 'views_1m' column data.
# drop(..., errors='ignore') keeps this cell re-runnable: a second execution is
# a no-op, whereas `del final_frame['views_1m']` raised KeyError.
final_frame = final_frame.drop(columns=['views_1m'], errors='ignore')

In [55]:
# Drop duplicates: retain the first row of each (page_id, wikicode, page_title)
# group (keep='first' is the pandas default).
final_frame = final_frame.drop_duplicates(
    subset=['page_id', 'wikicode', 'page_title'],
)

In [56]:
# Re-check for duplicates to confirm the drop above worked (expected result: 0)
final_frame.duplicated(
    subset=['page_id', 'wikicode', 'page_title'],
    keep=False,
).sum()

0

In [57]:
# Distinct article_type values present in the full frame
final_frame['article_type'].unique()

array(['new', 'expanded'], dtype=object)

In [58]:
# Remove expanded rows: keep rows whose article_type does not contain "expanded".
# .copy() makes the result an independent frame, so the column assignments made
# on final_frame_updated in later cells do not raise SettingWithCopyWarning.
final_frame_updated = final_frame[~final_frame.article_type.str.contains("expanded")].copy()

In [59]:
# Confirm which article_type values remain after filtering
final_frame_updated['article_type'].unique()

array(['new'], dtype=object)

In [60]:
# Missing-value count per column (isnull is an alias of isna)
final_frame_updated.isnull().sum()

wikicode                    0
page_id                     0
page_title                  0
page_len                    0
relative_page_len           0
at_edits                    0
at_create                   0
QID                         0
first_edited                0
article_type                0
editors_1stM                0
edits_1M                    0
num_edits_all_time          0
minor_edits_all_time        0
last_edited                 0
all_editors_of_all_edits    0
editors_nm                  0
IP_editors                  0
micro_editors               0
talk_page_edits             0
watch_count                 0
revertrate                  0
oplinks                     0
oelinks                     0
ipl_count                   0
iwsitelinks                 0
iwsites                     0
dtype: int64

## translation column to boolean

In [78]:
# Distinct raw values in the at_edits column
final_frame_updated.at_edits.unique()

array(['0', "['contenttranslation', 'contenttranslation-v2']",
       "['contenttranslation']",
       "['अनावश्यक nowiki टॅग', 'contenttranslation', 'contenttranslation-v2']",
       "['campaign-external-machine-translation', 'contenttranslation', 'contenttranslation-v2']",
       0,
       "['انگریزی عنوان', 'contenttranslation', 'contenttranslation-v2']"],
      dtype=object)

In [79]:
# Distinct values in the at_create column
final_frame_updated['at_create'].unique()

array([0])

In [80]:
# Is every at_create value equal to zero?
final_frame_updated['at_create'].eq(0).all()

True

In [81]:
# Boolean flag: True where at_edits is a string longer than one character.
# Non-string entries yield NaN from .str.len(), which compares as False.
final_frame_updated['translation_tool'] = np.where(
    final_frame_updated['at_edits'].str.len().gt(1),
    True,
    False,
)

In [82]:
# Build the article URL and the XTools article-info URL columns.
# Fix: wikicode is read from final_frame_updated. The earlier reference to
# final_frame_updated_pv is not defined in any cell shown in this notebook, so
# the cell raised NameError on a fresh kernel.
# 'wiki' inside wikicode is rewritten to '.wiki', and appending 'pedia.org/'
# completes the host part shared by both URLs.
final_host = final_frame_updated['wikicode'].replace({'wiki': '.wiki'}, regex=True) + 'pedia.org/'

final_frame_updated['url'] = 'https://' + final_host + 'wiki/' + final_frame_updated['page_title']
final_frame_updated['url_article_info'] = (
    'https://xtools.wmflabs.org/articleinfo/' + final_host + final_frame_updated['page_title']
)

#final_frame_updated['url_page_info'] = final_frame_updated['url'] + '&action=info'



In [83]:
# Output column order for the final frame, grouped by theme.
# The page-view columns ('views_1M', 'views_1M_all') are intentionally left out.
cols_u = [
    # identification and links
    'wikicode', 'page_title', 'url', 'url_article_info', 'page_id',
    # translation tags
    'at_edits', 'translation_tool',
    # Wikidata item and sitelinks
    'QID', 'iwsitelinks', 'iwsites',
    # article type and size
    'article_type', 'page_len', 'relative_page_len',
    # edit and editor counts
    'num_edits_all_time', 'all_editors_of_all_edits',
    'editors_1stM', 'edits_1M', 'minor_edits_all_time',
    'editors_nm', 'IP_editors', 'micro_editors', 'revertrate',
    # edit timestamps
    'first_edited', 'last_edited',
    # talk page, watchers and link counts
    'talk_page_edits', 'watch_count', 'oplinks', 'oelinks', 'ipl_count',
]

In [84]:
# Reorder the frame to the cols_u column order (columns not listed are dropped).
# .copy() gives an independent frame so the later in-place edits (column rename,
# QID assignment) do not raise SettingWithCopyWarning.
final_frame_updated = final_frame_updated[cols_u].copy()

In [85]:
# Rename columns to shorter, reader-friendly names.
# The result is reassigned instead of using inplace=True: in-place mutation of a
# frame derived by column selection can raise SettingWithCopyWarning, and
# reassignment leaves the binding unambiguous.
final_frame_updated = final_frame_updated.rename(columns={
    'num_edits_all_time': 'total_edits',
    'all_editors_of_all_edits': 'editors',
    'minor_edits_all_time': 'minor_edits',
    'editors_nm': 'major_editors',
    'oelinks': 'o_external_links',
    'ipl_count': 'incoming_links',
    'oplinks': 'o_internal_links',
})

In [89]:
# Cast QID to int, then render it as a 'Q'-prefixed string.
final_frame_updated['QID'] = 'Q' + final_frame_updated['QID'].astype(int).astype(str)

# Export to CSV

In [93]:
# Write the cleaned final frame as comma-separated UTF-8, without the index.
final_frame_updated.to_csv(
    "../../data/processed/query_results/content_quality/b1_final_frame_updated.csv",
    index=False,
    encoding='utf-8',
    sep=',',
)

In [None]:
# locate a page_id for a manual check of page to see that the numbers make sense
#https://pa.wikipedia.org/w/index.php?title=ਸਕਾਰਲੈਟ_ਜੋਹਾਨਸਨ&action=history
#http://pa.wikipedia.org/wiki/ਸਕਾਰਲੈਟ_ਜੋਹਾਨਸਨ
#https://xtools.wmflabs.org/articleinfo/pa.wikipedia.org/ਸਕਾਰਲੈਟ_ਜੋਹਾਨਸਨ

In [None]:
#review incoming pagelinks