In [1]:
import pandas as pd
import numpy as np
import dateparser
import datetime
import time

# Aggregating stats and users



In [2]:
# Load the per-wiki user counts, keeping a single row per url and
# discarding rows that have no url at all.
dfUsers = (
    pd.read_csv("wikia_users.csv", header=0)
    .drop_duplicates(subset=['url'])
    .dropna(subset=['url'])
)
dfUsers.head()

Unnamed: 0,url,users_1,users_5,users_10,users_20,users_50,users_100,bots
0,http://0ad.wikia.com/,38,20,14,9,7,5,8
1,http://0hourmysticknights.wikia.com/,7,6,3,1,1,1,4
2,http://0-xxii.wikia.com/,5,3,3,2,2,1,3
3,http://00fanon.wikia.com/,5,2,1,1,1,1,3
4,http://0002oifos.wikia.com/,3,1,1,1,1,1,2


In [3]:
# Load the wiki stats export and keep only the first row found for each url.
dfStats = pd.read_csv("20180220-wikia_stats.csv").drop_duplicates(subset=['url'])
dfStats.head()

Unnamed: 0,domain,hub,id,lang,language,name,stats.activeUsers,stats.admins,stats.articles,stats.discussions,stats.edits,stats.images,stats.pages,stats.users,stats.videos,title,topic,url,wam_score,stats.nonarticles
0,0ad.wikia.com,Games,194794,en,en,0 A.D. Wiki,1,3,101,,2203,138,925,13664281,2,0 A.D. Wiki,Gaming,http://0ad.wikia.com/,2.4465,824
1,0hourmysticknights.wikia.com,Games,1459872,en,en,0 Hour: Mystic Knights Wikia,0,1,22,0.0,277,30,161,15128409,0,0 Hour: Mystic Knights Wikia,Creative,http://0hourmysticknights.wikia.com/,0.0,139
2,0-xxii.wikia.com,Games,685186,en,en,0-XXII Wiki,0,1,34,1.0,454,45,296,10283456,0,0-XXII Wiki,Creative,http://0-xxii.wikia.com/,0.0,262
3,00fanon.wikia.com,TV,350933,en,en,00 Fanon Wiki,1,1,93,1.0,1010,14,350,13002315,0,00 Fanon Wiki,Fanon,http://00fanon.wikia.com/,0.0,257
4,0002oifos.wikia.com,Games,678685,en,en,0002oifos Wiki,0,1,6,0.0,328,56,269,10283456,0,0002oifos Wiki,TV,http://0002oifos.wikia.com/,0.0,263


In [4]:
# Load the curated index (one url per line, no header row) without repeated urls.
dfIndex = pd.read_csv(
    "20180220-wikia_CuratedIndex.txt",
    header=None,
    names=['url'],
).drop_duplicates()

# Left-join the stats ids onto the index: a missing id marks a wiki
# for which no stats are available.
mergedStatUserData = dfIndex.merge(dfStats[['url', 'id']], how='left', on=['url'])
mergedStatUserData.head()

Unnamed: 0,url,id
0,http://0ad.wikia.com/,194794.0
1,http://0hourmysticknights.wikia.com/,1459872.0
2,http://0-xxii.wikia.com/,685186.0
3,http://00fanon.wikia.com/,350933.0
4,http://0002oifos.wikia.com/,678685.0


In [5]:
# Merge index, stats and users in order to identify the wikis without stats
# or without users' information.
# Only the 'url' and 'id' columns of the previous merge are used as the left
# side, so re-running this cell does not pile up 'users_1_x' / 'users_1_y'
# columns on a frame that already holds 'users_1'.
mergedStatUserData = pd.merge(
    mergedStatUserData[['url', 'id']],
    dfUsers[['url', 'users_1']],
    how='left',
    on=['url'],
)
mergedStatUserData.head()

Unnamed: 0,url,id,users_1
0,http://0ad.wikia.com/,194794.0,38.0
1,http://0hourmysticknights.wikia.com/,1459872.0,7.0
2,http://0-xxii.wikia.com/,685186.0,5.0
3,http://00fanon.wikia.com/,350933.0,5.0
4,http://0002oifos.wikia.com/,678685.0,3.0


In [6]:
# Boolean masks over the merged frame: True where the information is present.
hasStats = mergedStatUserData['id'].notna()
hasUsers = mergedStatUserData['users_1'].notna()

# Coverage report: summing a boolean mask counts the rows where it is True.
print('Wikia Index size: {}'.format(len(dfIndex)))
print('  Wikis with stats: {}'.format(hasStats.sum()))
print('  Wikis with number of users: {}'.format(hasUsers.sum()))
print('  Wikis with stats AND number of users: {}'.format((hasStats & hasUsers).sum()))

Wikia Index size: 339453
  Wikis with stats: 339192
  Wikis with number of users: 339103
  Wikis with stats AND number of users: 338949


In [7]:
# Build the stats + users table for the indexed wikis:
#   1. left-join the stats onto the index,
#   2. inner-join the user counts (wikis without user data are dropped),
#   3. drop the wikis for which the stats join found no id.
dfStatsUsers = (
    dfIndex
    .merge(dfStats, how='left', on=['url'])
    .merge(dfUsers, how='inner', on=['url'])
    .dropna(subset=['id'])
)

dfStatsUsers.head()

Unnamed: 0,url,domain,hub,id,lang,language,name,stats.activeUsers,stats.admins,stats.articles,...,topic,wam_score,stats.nonarticles,users_1,users_5,users_10,users_20,users_50,users_100,bots
0,http://0ad.wikia.com/,0ad.wikia.com,Games,194794.0,en,en,0 A.D. Wiki,1.0,3.0,101.0,...,Gaming,2.4465,824.0,38,20,14,9,7,5,8
1,http://0hourmysticknights.wikia.com/,0hourmysticknights.wikia.com,Games,1459872.0,en,en,0 Hour: Mystic Knights Wikia,0.0,1.0,22.0,...,Creative,0.0,139.0,7,6,3,1,1,1,4
2,http://0-xxii.wikia.com/,0-xxii.wikia.com,Games,685186.0,en,en,0-XXII Wiki,0.0,1.0,34.0,...,Creative,0.0,262.0,5,3,3,2,2,1,3
3,http://00fanon.wikia.com/,00fanon.wikia.com,TV,350933.0,en,en,00 Fanon Wiki,1.0,1.0,93.0,...,Fanon,0.0,257.0,5,2,1,1,1,1,3
4,http://0002oifos.wikia.com/,0002oifos.wikia.com,Games,678685.0,en,en,0002oifos Wiki,0.0,1.0,6.0,...,TV,0.0,263.0,3,1,1,1,1,1,2


In [8]:
# Save to CSV, prefixing the file name with today's date (YYYYMMDD).
# `time` is already imported in the first cell of the notebook.
timestr = time.strftime("%Y%m%d")
statsUsersPath = 'data/{}-wikia_stats_users.csv'.format(timestr)
dfStatsUsers.to_csv(statsUsersPath, index=False)

# Aggregating birthdate

In [9]:
# Load birthdate data (the header row of the file is replaced by our own
# column names), keep one row per url and add a 'datetime.birthDate' column
# holding the parsed birthdate. Strings that cannot be parsed become NaT
# because of errors='coerce'.
# NOTE(review): `infer_datetime_format` is reported as deprecated in recent
# pandas releases -- confirm against the pandas version used for this notebook.
dfBirthDate = (
    pd.read_csv("20180220-wikia_birthdate.csv", names=['url', 'birthDate'], header=0)
    .drop_duplicates(subset=['url'])
    .assign(**{
        'datetime.birthDate': lambda frame: pd.to_datetime(
            frame['birthDate'], infer_datetime_format=True, errors='coerce'
        )
    })
)
dfBirthDate.head()

Unnamed: 0,url,birthDate,datetime.birthDate
0,http://0-robloxhelp-0.wikia.com/,"00:34, February 7, 2018",2018-02-07 00:34:00
1,http://0-xxii.wikia.com/,"10:34, February 4, 2013",2013-02-04 10:34:00
2,http://000.wikia.com/,"21:36, April 12, 2009",2009-04-12 21:36:00
3,http://0002oifos.wikia.com/,"16:04, January 24, 2013",2013-01-24 16:04:00
4,http://001.wikia.com/,"19:43, December 19, 2009",2009-12-19 19:43:00


In [10]:
# Left-join the birthdates onto the index: wikis without a birthdate keep
# missing values in the birthdate columns.
mergedBirthData = dfIndex.merge(dfBirthDate, how='left', on=['url'])
mergedBirthData.head()

Unnamed: 0,url,birthDate,datetime.birthDate
0,http://0ad.wikia.com/,"14:01, December 14, 2010",2010-12-14 14:01:00
1,http://0hourmysticknights.wikia.com/,"02:10, September 13, 2016",2016-09-13 02:10:00
2,http://0-xxii.wikia.com/,"10:34, February 4, 2013",2013-02-04 10:34:00
3,http://00fanon.wikia.com/,"10:18, August 14, 2011",2011-08-14 10:18:00
4,http://0002oifos.wikia.com/,"16:04, January 24, 2013",2013-01-24 16:04:00


In [11]:
# Merge index, stats, users and birthdate in order to identify the wikis
# that lack any kind of information.
mergedStatUserBirthData = dfStatsUsers.merge(dfBirthDate, how='left', on=['url'])
mergedStatUserBirthData.head()

Unnamed: 0,url,domain,hub,id,lang,language,name,stats.activeUsers,stats.admins,stats.articles,...,stats.nonarticles,users_1,users_5,users_10,users_20,users_50,users_100,bots,birthDate,datetime.birthDate
0,http://0ad.wikia.com/,0ad.wikia.com,Games,194794.0,en,en,0 A.D. Wiki,1.0,3.0,101.0,...,824.0,38,20,14,9,7,5,8,"14:01, December 14, 2010",2010-12-14 14:01:00
1,http://0hourmysticknights.wikia.com/,0hourmysticknights.wikia.com,Games,1459872.0,en,en,0 Hour: Mystic Knights Wikia,0.0,1.0,22.0,...,139.0,7,6,3,1,1,1,4,"02:10, September 13, 2016",2016-09-13 02:10:00
2,http://0-xxii.wikia.com/,0-xxii.wikia.com,Games,685186.0,en,en,0-XXII Wiki,0.0,1.0,34.0,...,262.0,5,3,3,2,2,1,3,"10:34, February 4, 2013",2013-02-04 10:34:00
3,http://00fanon.wikia.com/,00fanon.wikia.com,TV,350933.0,en,en,00 Fanon Wiki,1.0,1.0,93.0,...,257.0,5,2,1,1,1,1,3,"10:18, August 14, 2011",2011-08-14 10:18:00
4,http://0002oifos.wikia.com/,0002oifos.wikia.com,Games,678685.0,en,en,0002oifos Wiki,0.0,1.0,6.0,...,263.0,3,1,1,1,1,1,2,"16:04, January 24, 2013",2013-01-24 16:04:00


In [12]:
# Boolean masks: True where the corresponding piece of information is present.
indexHasBirthDate = mergedBirthData['birthDate'].notna()
hasStatsAndBirthDate = (
    mergedStatUserBirthData['id'].notna()
    & mergedStatUserBirthData['birthDate'].notna()
)
hasParsedBirthDate = mergedStatUserBirthData['datetime.birthDate'].notna()

# Coverage report: summing a boolean mask counts the rows where it is True.
print('Wikia Index size: {}'.format(len(dfIndex)))
print('  Wikis with birthDate: {}'.format(indexHasBirthDate.sum()))
print('  Wikis with stats AND users AND birthDate: {}'.format(hasStatsAndBirthDate.sum()))
print('  Wikis with birthDate in correct datetime format: {}'.format(hasParsedBirthDate.sum()))

Wikia Index size: 339453
  Wikis with birthDate: 338439
  Wikis with stats AND users AND birthDate: 338159
  Wikis with birthDate in correct datetime format: 239723


In [13]:
# Keep only the wikis that have a birthdate string, then show how many remain.
mergedStatUserBirthData = mergedStatUserBirthData.dropna(subset=['birthDate'])
mergedStatUserBirthData.shape[0]

338159

## New parsing of birthdates

Previous stats show that around 100k birthdates could not be converted to a valid datetime. The reason is that the birthdate string is language-dependent, so some strings were not correctly parsed when the birthdate data was loaded. We will use the [dateparser module](https://dateparser.readthedocs.io/en/v0.3.0/) together with each wiki's language to parse the invalid birthdates. 

In [14]:
# Select the rows whose birthdate could not be converted to a datetime when
# the data was loaded. The mask is computed once so that `dates` and
# `languages` are taken from exactly the same rows, in the same order.
unparsedMask = mergedStatUserBirthData['datetime.birthDate'].isna()
dates = mergedStatUserBirthData.loc[unparsedMask, 'birthDate'].values
languages = mergedStatUserBirthData.loc[unparsedMask, 'lang'].values

# Map every raw birthdate string to the result of parsing it with the
# language of its wiki:
#   datetime   -> the string was parsed
#   None       -> dateparser returned no result for the string
#   "NONVALID" -> dateparser raised while parsing (e.g. unusable language)
newDates = {}
for d, lang in zip(dates, languages):
    # The dictionary is keyed by the date string only, and the same string
    # can belong to wikis with different languages. Once a string has been
    # parsed successfully, keep that result instead of letting a later
    # failure with another language overwrite it.
    if isinstance(newDates.get(d), datetime.datetime):
        continue
    try:
        newDates[d] = dateparser.parse(d, languages=[lang])
    except Exception:
        # Best effort: flag the string so it can be filtered out later
        # rather than aborting the whole loop.
        newDates[d] = "NONVALID"

In [15]:
# Split the wikis according to whether the birthdate was parsed at load time.
unparsedRows = mergedStatUserBirthData['datetime.birthDate'].isna()
noNanBirthDates = mergedStatUserBirthData[~unparsedRows]
nanBirthDates = mergedStatUserBirthData[unparsedRows].copy()

# Fill in the dates obtained from the language-aware parsing
nanBirthDates['datetime.birthDate'] = nanBirthDates['birthDate'].map(newDates)

# Discard the rows that still have no date as well as the ones flagged as NONVALID
stillMissing = nanBirthDates['datetime.birthDate'].isna()
flaggedNonValid = nanBirthDates['datetime.birthDate'].isin(['NONVALID'])
nanBirthDates = nanBirthDates[~(stillMissing | flaggedNonValid)].copy()

In [16]:
# Stack the wikis whose birthdate was parsed at load time on top of the ones
# recovered by the language-aware parsing.
birthDateParts = [noNanBirthDates, nanBirthDates]
wikiaDataset = pd.concat(birthDateParts)

## Thailand calendar

Convert the dates of Thai wikis from the Thai solar calendar (Buddhist Era, BE) to the Gregorian calendar: [On 6 September 1940, Prime Minister Phibunsongkhram decreed 1 January 1941 as the start of the year 2484 BE, so year 2483 BE had only nine months. To convert dates from 1 January to 31 March prior to that year, the number to add or subtract is 542; otherwise, it is 543.](https://en.wikipedia.org/wiki/Thai_solar_calendar) Wiki birthdates are expected to be much later than 1941, so the number to subtract here is always 543.

In [17]:
def changeCalendar(row, beOffset=543, firstBeYear=2484):
    """Return the birthdate of ``row`` expressed in the Gregorian calendar.

    Rows whose ``lang`` is ``'th'`` may carry a birthdate whose year is
    expressed in the Thai solar calendar (Buddhist Era, BE); for those rows
    the year is shifted back by ``beOffset``. Every other row is returned
    untouched.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'lang'`` and ``'datetime.birthDate'``.
    beOffset : int, optional
        Number of years to subtract from a BE year (543 by default).
    firstBeYear : int, optional
        Smallest year that is treated as a BE year. The default, 2484, is the
        year from which the fixed 543 offset applies. Smaller years are
        assumed to be Gregorian already and are left as they are, which also
        makes the conversion safe to apply more than once.

    Returns
    -------
    The converted date for Thai rows with a BE year, otherwise the original
    ``row['datetime.birthDate']`` value (including missing values).
    """
    date = row['datetime.birthDate']

    # Nothing to convert for other languages or for missing dates.
    if row['lang'] != 'th' or pd.isna(date):
        return date

    # A year below the threshold is already Gregorian: shifting it would move
    # the date several centuries into the past.
    if date.year < firstBeYear:
        return date

    return date.replace(year=date.year - beOffset)
    
# Run the calendar conversion on every row and store the result back in the
# birthdate column.
convertedBirthDates = wikiaDataset.apply(changeCalendar, axis=1)
wikiaDataset['datetime.birthDate'] = convertedBirthDates

In [18]:
# Report how many wikis made it into the final dataset.
validWikiCount = len(wikiaDataset)
print('Wikis with stats AND users AND valid birthdates: {}'.format(validWikiCount))

Wikis with stats AND users AND valid birthdates: 325347


In [19]:
# Save to CSV, prefixing the file name with today's date (YYYYMMDD).
# `time` is already imported in the first cell of the notebook.
timestr = time.strftime("%Y%m%d")
datasetPath = 'data/{}-wikia_stats_users_birthdate.csv'.format(timestr)
wikiaDataset.to_csv(datasetPath, index=False)