In [8]:
### Import packages for authentication

from google.cloud import bigquery
from google.oauth2 import service_account

### Import packages for converting query results into dataframe

import pandas as pd 

## Import packages to build absolute file paths and keep the code independent of the operating system

from pathlib import Path
import os.path


In [10]:
### Authentication

# Notebooks do not define __file__, so the key file is located relative to the
# kernel's working directory (assumed to be the folder containing this notebook).
# resolve() turns the relative "../data/raw" hop into an absolute, OS-independent path.
base_path = Path.cwd()
full_path = (base_path / "../data/raw/GoogleBigQuery_key.json").resolve()

# Load the service-account credentials from the JSON key file.
credentials = service_account.Credentials.from_service_account_file(str(full_path))

In [11]:
### Build the BigQuery client from the service-account credentials;
### the project is the one recorded in those same credentials.

client = bigquery.Client(
    project=credentials.project_id,
    credentials=credentials,
)

In [12]:
### Getting overview of Stack Overflow tables

# Client.dataset() is deprecated; construct the dataset reference directly.
# DatasetReference takes (project, dataset_id).
stackoverflow = bigquery.DatasetReference('bigquery-public-data', 'stackoverflow')
print([x.table_id for x in client.list_tables(stackoverflow)])

['badges', 'comments', 'post_history', 'post_links', 'posts_answers', 'posts_moderator_nomination', 'posts_orphaned_tag_wiki', 'posts_privilege_wiki', 'posts_questions', 'posts_tag_wiki', 'posts_tag_wiki_excerpt', 'posts_wiki_placeholder', 'stackoverflow_posts', 'tags', 'users', 'votes']


In [14]:
### Make an API request

# The SQL inner-joins questions (pq) to answers (pa) on pq.id = pa.parent_id and
# keeps only answers created after 2019-05-30 00:00 UTC.
query = """
SELECT
      pq.id as question_id, pa.parent_id as question_id_check, pq.title as question_title, pq.body as question_text,        pq.accepted_answer_id, pq.answer_count, pq.comment_count, pq.community_owned_date,
      pq.creation_date, pq.favorite_count, pq.last_activity_date, pq.last_edit_date, pq.last_editor_display_name, 
      pq.last_editor_user_id, pq.owner_display_name, pq.owner_user_id, pq.post_type_id, pq.score,
      pq.tags, pq.view_count,
      pa.id as answer_id, pa.body as answer_text
FROM `bigquery-public-data.stackoverflow.posts_questions` pq
INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` pa ON pq.id = pa.parent_id
WHERE pa.creation_date > "2019-05-30 00:00:00.000 UTC"
"""

# Submit the query, wait for it to finish, and load the rows into a pandas DataFrame.
dataframe = client.query(query).result().to_dataframe()

In [15]:
### Display query results
### The rendering is abbreviated: elided rows and columns show up as "..." in the output.

display(dataframe)

Unnamed: 0,question_id,question_id_check,question_title,question_text,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,...,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,post_type_id,score,tags,view_count,answer_id,answer_text
0,1859864,1859864,How to create an integer array in Python?,"<p>It should not be so hard. I mean in C, </p>...",1859889.0,8,3,NaT,2009-12-07 13:08:31.840000+00:00,11.0,...,,,,226353.0,1,33,python|arrays,244103,56431492,<pre><code>import numpy as np\n\nnew_array=np....
1,59920382,59920382,How can I add rows with same string value in s...,<p>i have tried to make this df to the on afte...,59920693.0,2,3,NaT,2020-01-26 16:32:52.357000+00:00,,...,,10035985.0,,12786329.0,1,0,python|pandas,32,59920693,<p>you need <code>reset_index()</code> along w...
2,60142014,60142014,Custom Sort Using a Comma Separated String Bas...,<p>This is an extension to my original questio...,60142496.0,1,3,NaT,2020-02-09 22:37:36.787000+00:00,,...,,10018602.0,,10018602.0,1,1,excel|vba,36,60142496,<p>I couldn't get the <code>Worksheet.Sort</co...
3,57150458,57150458,Can a Retry be configured to retry on null?,<p>I am just starting out with resilience4j an...,57151117.0,1,2,NaT,2019-07-22 16:48:52.310000+00:00,,...,,,,1997707.0,1,1,java|resilience4j,77,57151117,<p>Create a custom <code>RetryConfig</code> wi...
4,57066427,57066427,Sum multiple columns that have specific name i...,<p>I would like to sum the values of <code>Var...,57067383.0,4,2,NaT,2019-07-16 22:52:19.370000+00:00,,...,,89482.0,,11725370.0,1,2,r|sum,92,57067383,<p>Another <code>dplyr</code> way is to use he...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789025,57726645,57726645,convert text file to html ouput so that data c...,<p>I have a text file which has 3 values separ...,57727047.0,1,2,NaT,2019-08-30 12:20:10.240000+00:00,,...,,10248678.0,,5343483.0,1,0,html|linux|shell,28,57727047,"<p>If you can, please consider <code>awk</code..."
1789026,58020686,58020686,Unable to use date variable,"<p>I'm still quite new with PowerShell, and wh...",,1,2,NaT,2019-09-20 01:34:35.320000+00:00,,...,,,,12093048.0,1,0,powershell|date|get-winevent,38,58022281,<p>Still not sure why using the variable $Star...
1789027,58003718,58003718,How to declare a variable in postgres script?,<p>I am fairly new to Postgres and I cannot be...,58006375.0,2,2,NaT,2019-09-19 04:26:39.457000+00:00,,...,,,,2216975.0,1,0,postgresql,46,58006375,<p>You are confused on several levels.</p>\n\n...
1789028,58031827,58031827,how to show warning\error for int to long value,<p>Maybe somebody will help me or point to the...,,2,2,NaT,2019-09-20 16:04:31.753000+00:00,,...,,1102149.0,,1102149.0,1,0,c#|.net,68,58032175,<p>If you write an overload that takes in an <...


In [16]:
### Save dataframe to a csv file

# Notebooks do not define __file__, so the output location is resolved relative to
# the kernel's working directory (assumed to be the folder containing this notebook).
base_path = Path.cwd()
full_path = (base_path / "../data/raw/stackoverflow_raw.csv").resolve()

# The index is written as the first (unnamed) column, as before.
dataframe.to_csv(full_path)

In [None]:
### Experiment: custom retry deadline in case the rate limit is exceeded.
### Not used for the extraction above; kept for reference in case it is needed later.

# DEFAULT_RETRY is exported by google.cloud.bigquery.
from google.cloud.bigquery import DEFAULT_RETRY

query_deadline = 5 * 60  # seconds
# Copy of the default retry policy with the overall deadline replaced.
query_retry = DEFAULT_RETRY.with_deadline(query_deadline)
job = client.query(query, retry=query_retry)