# This notebook extracts information from posts and creates CSV files used to compute descriptive statistics for the Wikidata discussions

As input, use the CSV file of posts produced by Raw_data_analysis.ipynb.

## 1. Extract username and timestamp for the posts

In the raw Wikidata data, every post ends with a signature containing a username and a timestamp. The username should follow a structure like "[[User: username | username ]]". However, this is not always the case, so we create a set of different patterns to detect the usernames. If there is no username signature at the end of the post, the code assigns an "Anonymous_username_#" name.



In [None]:
import pandas as pd
import re

# Extract the name from the string based on the pattern.
# The flag stops further detection once a pattern has matched.
def extract_username(pattern, post,name,flag,loc): 
    """Try one compiled pattern against a post and return the detected name.

    Parameters
    ----------
    pattern : compiled regular expression to search with.
    post : text of the post.
    name : name found so far; returned unchanged when nothing new is found.
    flag : 1 while the search is still open, 0 once a pattern has matched.
        When it is not 1 the function does nothing.
    loc : index of the capture group that holds the username.

    Returns
    -------
    (name, flag) : the possibly updated name, and 0 if this pattern matched
        at least once (otherwise the incoming flag).
    """
    # An earlier pattern already succeeded: leave everything untouched.
    if flag != 1:
        return name, flag

    candidates = [(hit.group(loc), hit.end()) for hit in pattern.finditer(post)]
    if not candidates:
        return name, flag

    # Keep the candidate whose match ends furthest into the post, i.e. the
    # signature closest to the end of the text.
    furthest_end = 0
    for candidate, end_pos in candidates:
        if end_pos > furthest_end:
            name = candidate
            furthest_end = end_pos

    return name, 0

# Create the patterns and try them in turn to detect the name.
def find_usernames(post, subject,anonymous_num ):
    """Detect the author of a post from its signature.

    Posts recognised as mass messages (by a marker in the post or in the
    thread subject) are attributed to 'Global message'. Otherwise four
    signature patterns are tried in order and the first one that matches
    decides the name. When no usable name is found, the post gets a
    numbered 'Anonymous_username_#' name.

    Parameters
    ----------
    post : text of the post. Non-string values (e.g. the NaN pandas yields
        for an empty CSV cell) are treated as an empty post.
    subject : thread subject; non-string values are treated as empty.
    anonymous_num : number of anonymous names handed out so far.

    Returns
    -------
    (name, anonymous_num) : the detected or generated username and the
        updated anonymous counter.
    """
    # pandas represents empty CSV cells as float NaN; `in` on a float raises
    # TypeError, so normalise anything that is not text to an empty string.
    if not isinstance(post, str):
        post = ''
    if not isinstance(subject, str):
        subject = ''

    name=[]  # sentinel meaning "nothing found yet"
    if '[[m:Global message delivery' in post:
        name='Global message'
    elif 'New Wikipedia Library' in subject:
        name='Global message'
    elif 'The Signpost' in subject:
        name='Global message'
    elif '[[m:Special:MyLanguage' in subject:
        name='Global message'
    else:
        # pattern_1: a user-namespace prefix (several languages/spellings)
        # followed by the name, up to the next '|', '/' or ']'.
        pattern_1=re.compile(r"(\[)?(User:|User talk:|user:|user talk:|User Talk:|User_talk:|User_Talk:|user_talk:|Utente:|Usuário:|Utilisateur:|Utilizator:|Usuario discusión:|Usuario:|Usuario Discusi\u00f3n:|kullanıcı:|Kullan\u0131c\u0131:|:USER TALK:|:USER:|U:|u:|Benutzer:|Benutzer Diskussion:|Special:Contributions/|welcominguser=|Discussão:)(.*?)(\||/|\])") # the regular expression I need to extract 
        # pattern_2 / pattern_3: '-' or '--', then the name, an optional
        # "(talk)" and a timestamp at the very end of the post.
        # NOTE(review): '-' is part of the name character class, so a "--Name"
        # signature is already matched by pattern_2 (capturing "-Name");
        # pattern_3 may never be reached. Confirm the intended order.
        pattern_2=re.compile(r"(\-)([\w\s\d!\(\)\-\{\};:\'\"\<\>\.\?@#\$%\^&\*_~]+)(\(?talk\)? )?(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))$")
        pattern_3=re.compile(r"(\-\-)([\w\s\d!\(\)\-\{\};:\'\"\<\>\.\?@#\$%\^&\*_~]+)(\(?talk\)? )?(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))$")
        # pattern_4: sentence-ending punctuation, then the name and a
        # timestamp at the very end of the post.
        pattern_4=re.compile(r"(\.|!|\?|\u3001|\u3002|,)([\w\s\d\(\)\-\{\};:\'\"\<\>@#\$%\^&\*_~]+)(\(?talk\)? )?(, )?(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))$")

        # (pattern, index of the capture group that holds the name)
        attempts=((pattern_1,3),(pattern_2,2),(pattern_3,2),(pattern_4,2))

        flag=1# the flag stops the detection once a pattern finds a username
        for pattern, group in attempts:
            name, flag=extract_username(pattern,post,name,flag,group)

    # If no name was found, or the captured name is empty / whitespace only,
    # assign Anonymous_username_#.
    if not isinstance(name, str) or not name.strip():
        anonymous_num += 1
        name='Anonymous_username_'+str(anonymous_num)

    return name,anonymous_num


def fill_table(PATH, filename,anonymous_num):
    """Add 'username' and 'timestamp' columns to a posts CSV and append the result.

    Reads the header-less CSV at PATH + filename, which must have exactly the
    five columns post, location, thread_subject, discussion_type and
    discussion_page_name. For every row the username is detected with
    find_usernames and the timestamp is taken from the end of the post
    ('No date' when the post does not end with a timestamp). The rows are
    appended, without header, to PATH + 'username_timestamp.csv'.

    Parameters
    ----------
    PATH : folder of the input file; concatenated directly with `filename`,
        so it must end with a path separator.
    filename : name of the input CSV file.
    anonymous_num : number of anonymous names already handed out.

    Returns
    -------
    int : the anonymous counter after processing this file, so that a
        following call can continue the numbering.
    """
    # The timestamp counts only when it is the last thing in the post.
    timestamp_pattern=re.compile(r"(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))$")

    data = pd.read_csv(str(PATH)+str(filename), header=None, encoding='utf-8')
    data.columns=['post','location', 'thread_subject', 'discussion_type', 'discussion_page_name']# loose the index number

    total=len(data)
    usernames=[]
    timestamps=[]
    for row, (post, subject) in enumerate(zip(data['post'], data['thread_subject'])):
        print(row, total)
        # Empty CSV cells arrive as float NaN; treat them as empty text.
        if not isinstance(post, str):
            post=''
        if not isinstance(subject, str):
            subject=''

        username, anonymous_num=find_usernames(post,subject,anonymous_num)
        usernames.append(username)

        # A post can mention "(UTC)" without ending in a timestamp; fall back
        # to 'No date' instead of failing on an empty match list.
        found=timestamp_pattern.search(post)
        timestamps.append(found.group(1) if found else 'No date')

    # Build both columns in one go rather than cell by cell.
    data['username']=usernames
    data['timestamp']=timestamps

    print(anonymous_num)

    #save data to csv
    data.to_csv(str(PATH)+'username_timestamp.csv', encoding='utf-8', index=False,  mode='a', header=False)

    return anonymous_num

In [None]:
#======---------> RUN <--------=============

# Folder that holds the input CSV. fill_table joins PATH and filename by plain
# string concatenation, so PATH must end with a path separator.
PATH='FOLDER_WITH_THE_CSV_INPUT/'
# Header-less CSV of posts; fill_table assigns it exactly five column names.
filename='filename.csv'

anonymous_num=0 # starting counter used to assign anonymous names to posts without signatures
# Appends the annotated rows to PATH + 'username_timestamp.csv'.
fill_table(PATH, filename ,anonymous_num)

## 2. Unique usernames

Take as input the csv file with the information of username and timestamp.

In [None]:

# Input: a header-less, two-column CSV (discussion page name, fixed username).
PATH='FOLDER_WITH_THE_CSV_INPUT/'
filename='filename.csv'

list_names=pd.read_csv(f"{PATH}{filename}", header=None, encoding='utf-8')
list_names.columns=['discussion_page_name','fixed_username']
print(len(list_names))

data=list_names['fixed_username']
name_lst=data.to_list()

# Order-preserving de-duplication: dict keys keep the first occurrence of
# every name in its original position.
unique_name_list = list({name: None for name in name_lst})
unique_name_df=pd.DataFrame({'username':unique_name_list})

# Sanity check: rows read, names listed, distinct names, rows to be written.
print( len(data), len(name_lst), len(unique_name_list), len(unique_name_df))

# NOTE(review): the file is opened in append mode, so running this cell twice
# adds the same names again to unique_usernames.csv - confirm this is intended.
unique_name_df.to_csv(
    f"{PATH}unique_usernames.csv",
    encoding='utf-8',
    index=False,
    mode='a',
    header=False,
)


## 3. Extract username information from Wikidata

Extract information such as the number of edits, the registration timestamp, etc.

First create the input txt file. Then query Wikidata.

In [None]:
import csv

# Turn the one-name-per-row CSV into a single '|'-separated text file, which
# is the input of the API query cell.
# NOTE(review): the previous section writes PATH + 'unique_usernames.csv',
# while this cell reads 'unique_usernames.csv' from the working directory -
# confirm that both refer to the same folder.
with open('unique_usernames.csv', encoding="utf-8", newline='') as f, \
        open('editors_sequence.txt','w', encoding="utf-8") as file_txt:
    # Both files are closed by the `with` block even if reading fails midway.
    csvreader = csv.reader(f)
    # The file is written without a header row, so there is nothing to skip.

    # Blank rows and empty first cells are ignored: they would otherwise
    # raise IndexError or end up as empty usernames in the sequence.
    names = [row[0] for row in csvreader if row and row[0]]

    # Join instead of writing '|' after every name, so that the sequence does
    # not end with a separator (which would yield an empty last username).
    file_txt.write('|'.join(names))

    count_row = len(names)

print(count_row)

In [None]:
#extract the info

import requests
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Query the Wikidata API for every username in the editors sequence file and
# append the returned attributes to a CSV file, one request per username.

S = requests.Session()

URL = "https://www.wikidata.org/w/api.php"#info about the features you can extract https://www.wikidata.org/w/api.php?action=help&modules=query%2Busers

# '|'-separated usernames, as written by the previous cell into the working
# directory (the former absolute path only existed on the author's machine).
EDITORS_FILE = 'editors_sequence.txt'
# Output file; rows are appended without a header.
OUTPUT_FILE = 'editors_info.csv'
# Attributes kept for every user, in CSV column order. Users the API reports
# as missing or invalid carry only a name, so their other cells stay empty.
COLUMNS = ['userid','name','editcount', 'registration','rights']
# Seconds to wait for the API before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

editors_count=0
line_count=0

with open(EDITORS_FILE, encoding="utf-8") as f:
    for l in f:
        for part in l.split("|"):
            # Skip the empty pieces produced by a trailing separator or a
            # line break; they are not usernames.
            part = part.strip()
            if not part:
                continue
            line_count+=1

            PARAMS = {
            "action": "query",
            "format": "json",
            "list": "users",
            "ususers": part,
            # choose the features to extract; "blockinfo" can be added too
            "usprop": "editcount|registration|rights"}

            R = S.get(url=URL, params=PARAMS, timeout=REQUEST_TIMEOUT)
            R.raise_for_status()  # fail loudly on HTTP errors
            DATA = R.json()
            if "error" in DATA:
                raise RuntimeError("API error for user %r: %s" % (part, DATA["error"]))

            USERS = DATA.get("query", {}).get("users", [])
            editors_count+=len(USERS)

            # Build the frame in one step with a fixed column set, so every
            # CSV row has the same number of fields. DataFrame.append, used
            # previously, no longer exists in pandas 2.x.
            users = pd.DataFrame(USERS, columns=COLUMNS)

            # Appending after each request keeps the results collected so far
            # if a later request fails.
            users.to_csv(OUTPUT_FILE, encoding='utf-8', index=False,  mode='a', header=False)


print('Number of lines')
print(line_count)

print('Number of all imported editors')    
print(editors_count) 