# Process the raw discussion data in Wikidata

## 1. Extract the raw discussion

Use [this process](https://github.com/King-s-Knowledge-Graph-Lab/Wikidata-Discussion-Parser) to create csv files with the revisions of Wikidata discussion pages. The process creates one csv file for every discussion page. Each row in a file represents one revision of the discussion page.

Use the csv files as input to the below process to get:
1. the full raw discussion for every file (function import_csv)
2. the different discussions on the page (a Wikidata discussion page may include more than one discussion thread; the threads are separated based on their subject titles) (function separate_discussions)
3. the different posts for every discussion on the page (function sep_posts)


The output of the below process is a json file for every input discussion page, with the different discussions and the different posts.
Every element represents a thread. At the thread level, "subject" holds the title of the discussion ('No subject' if the discussion has no title), and "thread" holds a list of the posts in this thread.

If there are no threads in the discussion page (some discussion pages include only metadata such as description tables), the process returns an empty json file.



In [None]:
import csv
import sys
import codecs
import os
import re
from difflib import SequenceMatcher
import itertools
import json

#this is to extend the maximum size of a csv cell that can be read
#sys.maxsize does not fit into a C long on every platform (csv.field_size_limit
#then raises OverflowError), so shrink the limit until it is accepted
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

ENCODING = "utf-8"

#this function reads the csv and creates a list with the rows
def import_csv(csvfilename):
    """Read a csv file and return column index 7 of every non-blank row.

    Args:
        csvfilename: path of the csv file to read.

    Returns:
        A list with one single-element list per non-blank row; the element
        is the value found at column index 7 of that row.

    Raises:
        IndexError: if a non-blank row has fewer than eight columns.
    """
    # The built-in open() has no ``header`` keyword (passing it raises
    # TypeError), so none is given. newline="" is what the csv module
    # documents for file objects handed to csv.reader.
    with open(csvfilename, "r", encoding="utf-8", errors="ignore", newline="") as scraped:
        reader = csv.reader(scraped, delimiter=',')
        # blank lines come back from the reader as empty lists and are skipped
        return [[row[7]] for row in reader if row]

#to separate threads with no title into posts
def no_title(temp, all):
    """Split the first segment of a page into posts under the subject 'No subject'.

    Only ``temp[0]`` is inspected. Nothing is added when ``temp`` is empty,
    when the segment starts with ``{{`` and mentions 'documentation' /
    'Documentation', when it contains '#REDIRECT', or when it contains no
    '(UTC)' marker.

    Args:
        temp: list of text segments of the page.
        all: list of thread dictionaries collected so far; it is extended in
            place. (The name shadows the builtin but is part of the signature.)

    Returns:
        The same ``all`` list, possibly with one more dictionary holding the
        keys 'subject' and 'thread'.
    """
    # an empty page has no segments; without this guard temp[0] raises IndexError
    if not temp:
        return all

    lead = temp[0]
    is_documentation = (lead[0:2] == '{{') and (('documentation' in lead) or ('Documentation' in lead))
    if is_documentation or ('#REDIRECT' in lead) or ('(UTC)' not in lead):
        return all

    # the capture group keeps every timestamp in the split result, so text
    # and timestamp alternate; any text after the last timestamp is dropped
    posts = re.split(r'(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))', lead)
    posts_lst = [posts[i] + posts[i + 1] for i in range(0, len(posts) - 1, 2)]

    all.append({'subject': 'No subject', 'thread': posts_lst})
    return all


#to separate threads with titles into posts
def sep_posts(titles, temp, all):
    """Turn every titled segment of the page into a thread dictionary.

    For each element of ``temp`` that also appears in ``titles``:
    if it is the last element, a thread with an empty string as 'thread' is
    added; otherwise, if the following element contains '(UTC)', that element
    is split into posts at every timestamp and added as a list.

    Args:
        titles: list of subject titles found on the page.
        temp: list of text segments of the page (titles and bodies).
        all: list of thread dictionaries collected so far; extended in place.

    Returns:
        The same ``all`` list.
    """
    stamp_pattern = r'(\d\d:\d\d, \d+ \w+ \d\d\d\d \(UTC\))'
    final_index = len(temp) - 1

    for position, segment in enumerate(temp):
        if segment not in titles:
            continue

        # a title at the very end of the page has no body
        if position == final_index:
            all.append({'subject': segment, 'thread': ""})
            continue

        body = temp[position + 1]
        if '(UTC)' not in body:
            continue

        # text and timestamp alternate in the split result; pair them up
        pieces = re.split(stamp_pattern, body)
        paired = [text + stamp for text, stamp in zip(pieces[0::2], pieces[1::2])]
        all.append({'subject': segment, 'thread': paired})

    return all
            

#function to create a file with separate threads and separate posts for the discussion page
def separate_discussions(file_name, new_file_path, name):
    """Write a json file with the threads and posts of one discussion page.

    The text is taken from the last row returned by ``import_csv`` (per the
    original note, that row includes all threads of the page). The output
    path is ``new_file_path`` + ``name`` without its last four characters +
    '.json'. An empty json list is written when the csv has no rows or no
    thread is found.

    Args:
        file_name: path of the input csv file.
        new_file_path: prefix (folder) of the output json file.
        name: file name of the input csv; its last four characters are
            dropped to build the output name.
    """
    data = import_csv(file_name)#call the csv data

    threads = []
    # a csv without any non-blank row has no last row to read
    if data:
        page_text = data[-1][0]#take the last row including all threads in the discussion page

        titles = re.findall('==(.*)==', page_text)#find the titles in the discussion
        titles = [s.replace('=', '') for s in titles]

        temp = page_text.split('==')
        temp = [x for x in temp if x != '']
        temp = [s.strip('=') for s in temp]

        # no_title reads temp[0], so it is only called when a segment exists
        if temp:
            threads = no_title(temp, threads)
            if titles:
                threads = sep_posts(titles, temp, threads)

    with open(str(new_file_path)+str(name[:-4])+'.json', "w", encoding="utf-8") as outfile:
        outfile.write(json.dumps(threads, indent=4))

In [None]:
new_file_path='JSON_FILE/'
initial_folder_path='CSV_FILE/'
filenames_folder='LIST_OF_CSV_FILES_TO_PROCESS/'


#=======================================================================
#========= ------->>>> R U N <<<<-------- ==============================

#create the folder for the json files if it does not exist yet
os.makedirs(new_file_path, exist_ok=True)

#read the names of the csv files to process (one name per line)
with open(filenames_folder, 'r') as file1:
    Lines = file1.readlines()


#process every csv file listed
for i in Lines:
    #remove all whitespace from the file name
    name="".join(i.split())
    #a blank line in the list holds no file name
    if not name:
        continue
    print("start: " +str(name))
    #check if the csv is empty
    if os.stat(str(initial_folder_path)+str(name)).st_size == 0:
        continue
    #write the json file with the threads and posts of this discussion page
    separate_discussions(str(initial_folder_path)+str(name),new_file_path , name)
    print("finish: " +str(name))

## 2. Extract information for the posts in the Wikidata discussions

Use the json files produced in the above process to process the posts in the thread. 

The code below creates a csv with information about the posts. Every row is a post and the columns include:
1. the post
2. the position of the post in the thread (e.g. 1 if it is the first post, 2 if it is the second post, etc.)
3. the subject title of the thread that includes the post
4. the name of the discussion page that includes the post

In [None]:
import json
import pandas as pd
import codecs

import warnings
#install "ignore" filters so that no warning is shown from here on
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

def extract_info(filename,name,data):
    """Add one row per post of a json thread file to ``data``.

    Args:
        filename: path of the json file; it holds a list of dictionaries
            with the keys 'subject' and 'thread'.
        name: value stored in the 'discussion_page_name' column.
        data: DataFrame the new rows are added to.

    Returns:
        A DataFrame with the rows of ``data`` followed by the new rows
        (``data`` itself when the file holds no posts). 'location' counts
        the posts of each thread starting at 1.
    """
    #read the json with the threads; the file is closed as soon as it is parsed
    with open(filename, encoding='utf-8') as f:
        jsonObject = json.load(f)

    rows = []
    for thread in jsonObject:
        posts = thread['thread']
        #a thread stored as a plain string is treated as a single post
        if isinstance(posts, str):
            posts = [posts]
        for location, post in enumerate(posts, start=1):
            rows.append({'post':post,'location':location, 'thread_subject':thread['subject'], 'discussion_page_name':name})

    if not rows:
        return data

    #DataFrame.append was removed in pandas 2.0; build the new rows once and concatenate
    new_rows = pd.DataFrame(rows, columns=['post','location', 'thread_subject', 'discussion_page_name'])
    return pd.concat([data, new_rows], ignore_index=True)
    
    
def posts2csv(PATH_input,PATH_output, filenames):
    """Collect the posts of all listed json files and append them to posts.csv.

    Args:
        PATH_input: prefix (folder) of the json files and of the list file.
        PATH_output: prefix (folder) where 'posts.csv' is written.
        filenames: name of the text file, inside ``PATH_input``, that lists
            the json file names, one per line.

    The rows are appended to ``PATH_output`` + 'posts.csv' without a header
    row.
    """
    data = pd.DataFrame(columns=['post','location', 'thread_subject', 'discussion_page_name'])#create a df to save the info

    #read the filenames; the with-block closes the file even if reading fails
    with open(str(PATH_input)+str(filenames), 'r') as file:
        lines = file.readlines()

    for line in lines:
        name="".join(line.split())#remove all whitespace from the file name
        #a blank line in the list holds no file name
        if not name:
            continue
        print("start: " +str(name))
        data=extract_info(str(PATH_input)+str(name),name,data)
        print("finish: " +str(name))

    #save data to csv
    data.to_csv(str(PATH_output)+'posts.csv', encoding='utf-8', index=False,  mode='a', header=False)



In [None]:
#<-------RUN------------->
#folder with the json files and the text file that lists them
PATH_input='SAVED_JSON_FILES_FOLDER/'
#folder where posts.csv is written
PATH_output='FOLDER_TO_SAVE_THE_POST_CSV/'

#filenames.txt is a file that lists the json file names, one per row
posts2csv(PATH_input, PATH_output, filenames='filenames.txt')