In [7]:
import json
import pandas as pd

In [8]:
file_paths = [
    '20231012_230826_commit_sharings.json',
    '20231012_232232_hn_sharings.json',
    '20231012_233628_pr_sharings.json',
    '20231012_234250_file_sharings.json',
    '20231012_235128_issue_sharings.json',
    '20231012_235320_discussion_sharings.json'
]


def _flatten_sharings(data):
    """Yield one flat dict per conversation turn in a parsed sharings file.

    The input is walked as ``Sources`` -> ``ChatgptSharing`` -> ``Conversations``.
    Every yielded record repeats the fields of its enclosing source and
    ChatGPT sharing next to the turn's own ``Prompt``/``Answer``/``ListOfCode``.

    A nesting level that is missing *or* explicitly null is treated as empty,
    so sources without sharings (or sharings without conversations) simply
    contribute no records instead of raising ``TypeError``.
    """
    # first get embedded data from the Sources level
    for source in data.get('Sources') or []:
        source_fields = {
            "SourceType": source.get('Type', ''),
            "SourceID": source.get('ID', ''),
            "SourceURL": source.get('URL', ''),
            "Author": source.get('Author', ''),
            "Title": source.get('Title', ''),
            "CreatedAt": source.get('CreatedAt', ''),
        }

        # then get embedded data from the ChatgptSharing level
        for chatgpt_sharing in source.get('ChatgptSharing') or []:
            sharing_fields = {
                "ChatGPT_URL": chatgpt_sharing.get('URL', ''),
                "Status": chatgpt_sharing.get('Status', ''),
                "DateOfConversation": chatgpt_sharing.get('DateOfConversation', ''),
                "DateOfAccess": chatgpt_sharing.get('DateOfAccess', ''),
                "ModelUsed": chatgpt_sharing.get('Model', ''),
                "NumPrompts": chatgpt_sharing.get('NumberOfPrompts', 0),
                "TokensOfPrompts": chatgpt_sharing.get('TokensOfPrompts', 0),
                "TokensOfAnswers": chatgpt_sharing.get('TokensOfAnswers', 0),
            }

            # lastly, extract the conversation information
            for conversation in chatgpt_sharing.get('Conversations') or []:
                yield {
                    **source_fields,
                    **sharing_fields,
                    "Prompt": conversation.get("Prompt", ""),
                    "Answer": conversation.get("Answer", ""),
                    "ListOfCode": conversation.get("ListOfCode", []),
                }


# Rebuilt from scratch on every run so re-executing this cell never
# appends duplicate records.
all_data = []

for file_path in file_paths:
    # JSON text is UTF-8; be explicit so the non-ASCII prompts/answers decode
    # identically regardless of the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The file is fully parsed at this point, so flatten after closing it.
    all_data.extend(_flatten_sharings(data))


In [9]:
# One row per flattened conversation record collected in `all_data`.
df = pd.DataFrame(data=all_data)

# Persist the table (without the positional index) so the JSON extraction
# does not have to be repeated.
df.to_csv(path_or_buf='extracted_data.csv', index=False)

In [10]:
# Preview the first five rows of the extracted table.
df.head(n=5)

Unnamed: 0,SourceType,SourceID,SourceURL,Author,Title,CreatedAt,ChatGPT_URL,Status,DateOfConversation,DateOfAccess,ModelUsed,NumPrompts,TokensOfPrompts,TokensOfAnswers,Prompt,Answer,ListOfCode
0,commit,,https://github.com/grnpin/textbox/commit/fa335...,grnpin,,,https://chat.openai.com/share/4bad57dd-9636-4b...,200,"September 20, 2023",2023-10-12 23:53:27.348018,Default (GPT-3.5),1,330,556,このコードだと、スマートフォンでキーボードを表示したときにbuttonがキーボードの下に隠れ...,ChatGPTキーボードが表示されたときにボタンをキーボードの上に移動させるには、CSSでキ...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
1,commit,,https://github.com/grnpin/textbox/commit/0dfc4...,grnpin,,,https://chat.openai.com/share/1fd7ffc2-9264-45...,200,"September 9, 2023",2023-10-12 23:53:27.833077,Default,1,51,160,このコードなんですが、Placeholder上でちゃんと改行するコードにできますか。\n\n...,ChatGPTはい、改行を含むPlaceholder文字列を設定するコードを作成することがで...,[]
2,commit,,https://github.com/grnpin/textbox/commit/811d0...,grnpin,,,https://chat.openai.com/share/99d2d02f-7e3b-43...,200,"September 7, 2023",2023-10-12 23:53:28.345040,Default,1,916,115,refactor this code.\n\n<!DOCTYPE html>\n<html>...,ChatGPTHere's a refactored version of your HTM...,[]
3,commit,,https://github.com/bh679/MindfulAI2.0/commit/e...,bh679,,,https://chat.openai.com/share/90ebe55e-bd60-47...,200,"August 8, 2023",2023-10-12 23:53:29.830292,Advanced Data Analysis,6,5774,1898,server.js\n// Required libraries\nimport cors ...,ChatGPTYou've shared multiple pieces of server...,[]
4,commit,,https://github.com/bh679/MindfulAI2.0/commit/e...,bh679,,,https://chat.openai.com/share/90ebe55e-bd60-47...,200,"August 8, 2023",2023-10-12 23:53:29.830292,Advanced Data Analysis,6,5774,1898,"package.json\n{\n ""name"": ""mindfulai"",\n ""ve...",ChatGPTIt looks like you're having issues with...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."


In [11]:
# Preview the last five rows of the extracted table.
df.tail(n=5)

Unnamed: 0,SourceType,SourceID,SourceURL,Author,Title,CreatedAt,ChatGPT_URL,Status,DateOfConversation,DateOfAccess,ModelUsed,NumPrompts,TokensOfPrompts,TokensOfAnswers,Prompt,Answer,ListOfCode
27088,discussion,,https://github.com/ZoeLeBlanc/is578-intro-dh/d...,ZoeLeBlanc,AI & CLI Assignment,2023-08-31T16:33:05Z,https://chat.openai.com/share/cfe2a21a-29e0-47...,200,"September 8, 2023",2023-10-12 23:53:17.501679,Default (GPT-3.5),6,156,1830,received this error code when verifying the st...,ChatGPTI apologize for the confusion. It seems...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
27089,discussion,,https://github.com/ZoeLeBlanc/is578-intro-dh/d...,ZoeLeBlanc,AI & CLI Assignment,2023-08-31T16:33:05Z,https://chat.openai.com/share/cfe2a21a-29e0-47...,200,"September 8, 2023",2023-10-12 23:53:17.501679,Default (GPT-3.5),6,156,1830,now create a how to use the command line to cr...,ChatGPTTo create a text file with the content ...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
27090,discussion,,https://github.com/ZoeLeBlanc/is578-intro-dh/d...,ZoeLeBlanc,AI & CLI Assignment,2023-08-31T16:33:05Z,https://chat.openai.com/share/cfe2a21a-29e0-47...,200,"September 8, 2023",2023-10-12 23:53:17.501679,Default (GPT-3.5),6,156,1830,the echo command is not working,ChatGPTI apologize for the confusion. If the e...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
27091,discussion,,https://github.com/ZoeLeBlanc/is578-intro-dh/d...,ZoeLeBlanc,AI & CLI Assignment,2023-08-31T16:33:05Z,https://chat.openai.com/share/cfe2a21a-29e0-47...,200,"September 8, 2023",2023-10-12 23:53:17.501679,Default (GPT-3.5),6,156,1830,the echo command is still not working on windo...,ChatGPTI apologize for the inconvenience. If t...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
27092,discussion,,https://github.com/orgs/netket/discussions/1482,MrAAafa99,Exploring the Plotting of Pair Correlation Fun...,2023-05-25T15:11:27Z,https://chat.openai.com/share/caaa9e43-860a-46...,200,"August 16, 2023",2023-10-12 23:53:19.322053,Default (GPT-3.5),2,36,616,how do I assign error bars to the entries of a...,ChatGPTAssigning error bars to histogram entri...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."


In [12]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27093 entries, 0 to 27092
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SourceType          27093 non-null  object
 1   SourceID            27093 non-null  object
 2   SourceURL           27093 non-null  object
 3   Author              27093 non-null  object
 4   Title               27093 non-null  object
 5   CreatedAt           27093 non-null  object
 6   ChatGPT_URL         27093 non-null  object
 7   Status              27093 non-null  int64 
 8   DateOfConversation  27093 non-null  object
 9   DateOfAccess        27093 non-null  object
 10  ModelUsed           27093 non-null  object
 11  NumPrompts          27093 non-null  int64 
 12  TokensOfPrompts     27093 non-null  int64 
 13  TokensOfAnswers     27093 non-null  int64 
 14  Prompt              27093 non-null  object
 15  Answer              27093 non-null  object
 16  ListOfCode          27