In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utility Functions

In [2]:
import re

def clean_text(text):
    """
    Removes HTML tags and reduces multiple consecutive whitespaces to a single space from a given text string.

    A tag is anything from a '<' up to the next '>' (non-greedy). Tags may span
    several lines, e.g. an <a> tag whose attributes wrap onto a new line.

    Args:
    - text: String containing HTML tags and potentially extra whitespaces.

    Returns:
    A string with all HTML tags removed, multiple consecutive whitespaces reduced to a
    single space, and leading/trailing whitespace stripped.
    """
    # re.DOTALL lets '.' match newlines; without it a tag broken across lines
    # is not matched and leaks into the output.
    text_without_html = re.sub(r'<.*?>', '', text, flags=re.DOTALL)

    # Reduce multiple consecutive whitespaces (spaces, tabs, newlines) to a single space.
    # (Local name deliberately differs from the function name to avoid shadowing it.)
    collapsed = re.sub(r'\s+', ' ', text_without_html)

    return collapsed.strip()

# Stack Overflow Scraping

## Extracting Questions

In [3]:
import requests

def fetch_python_questions(tag='python', max_pages=1):
    """
    Fetches questions tagged with a specific tag from Stack Overflow.

    Pages are requested one at a time (100 questions per page, most recently
    active first) until the API reports no more pages, `max_pages` is reached,
    or a request fails. On failure a message is printed and the questions
    collected so far are returned.

    Args:
    - tag: The tag to filter questions by.
    - max_pages: Maximum number of pages to fetch.

    Returns:
    A list of tuples, each containing a question's title and ID.
    """
    import time  # local import: only needed to honour the API's back-off requests

    url = "https://api.stackexchange.com/2.2/questions"
    questions_list = []
    page = 1
    has_more = True

    while has_more and page <= max_pages:
        params = {
            'page': page,
            'pagesize': 100,
            'order': 'desc',
            'sort': 'activity',
            'tagged': tag,
            'site': 'stackoverflow',
            'filter': 'withbody',
        }

        # A timeout keeps a stalled connection from hanging the notebook forever.
        try:
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Keep whatever was already collected instead of losing it to a traceback.
            print(f"Request for page {page} raised an error: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch data on page {page}")
            break

        data = response.json()
        # 'items' is read defensively so an unexpected payload yields no rows, not a KeyError.
        for item in data.get('items', []):
            questions_list.append((item['title'], item['question_id']))

        has_more = data.get('has_more', False)
        page += 1

        # The API may ask clients to wait `backoff` seconds before calling the same
        # method again; ignoring it risks being throttled.
        backoff = data.get('backoff')
        if backoff and has_more and page <= max_pages:
            time.sleep(backoff)

    return questions_list



In [4]:
stack_q_ids = []        # question IDs, in the order fetch_python_questions returned them
stack_questions = {}    # question ID -> question title


def main():
    """Fill stack_q_ids and stack_questions with 'python'-tagged questions (up to 300 pages)."""
    fetched = fetch_python_questions(max_pages=300)  # Adjust `max_pages` as needed

    # Every fetched entry is a (title, question_id) tuple.
    stack_q_ids.extend(q_id for _, q_id in fetched)
    stack_questions.update((q_id, title) for title, q_id in fetched)


main()

Failed to fetch data on page 26


## Extracting Answers

In [5]:
import requests
answers = {}  # question ID -> cleaned, truncated body of one answer (filled as a side effect below)
def fetch_answers_for_questions(question_ids):
    """
    Fetches answers for given question IDs from Stack Overflow.

    One request is made per question. A failed request prints a message and the
    question is skipped; the remaining questions are still processed.

    Side effect: the module-level `answers` dict is updated so that each question ID
    maps to the body of the last answer listed for it in the response (earlier
    answers for the same question are overwritten).

    Args:
    - question_ids: A list of question IDs. Each ID must be a key of `stack_questions`.

    Returns:
    A list of dictionaries, each representing an answer with its answer ID, the
    associated question ID and question title, and its cleaned body (at most 500 characters).
    """
    import time  # local import: only needed to honour the API's back-off requests

    answers_list = []
    for question_id in question_ids:
        url = f"https://api.stackexchange.com/2.2/questions/{question_id}/answers"
        params = {
            'order': 'desc',
            'sort': 'activity',
            'site': 'stackoverflow',
            'filter': 'withbody',
        }

        # A timeout keeps a stalled connection from hanging the whole run.
        try:
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException:
            print(f"Failed to fetch answers for question ID {question_id}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch answers for question ID {question_id}")
            continue

        data = response.json()
        for item in data.get('items', []):
            # Strip the HTML first and truncate afterwards: cutting the raw HTML at 500
            # characters can split a tag in half, and the half tag would then survive
            # cleaning as residue. The body is also cleaned once and reused.
            body = clean_text(item['body'])[:500].rstrip()

            answers_list.append({
                'answer_id': item['answer_id'],
                'question_id': question_id,
                'question' : stack_questions[question_id],
                'body': body
            })

            answers[question_id] = body

        # The API may ask clients to wait `backoff` seconds before calling the same
        # method again; ignoring it risks being throttled.
        backoff = data.get('backoff')
        if backoff:
            time.sleep(backoff)

    return answers_list


In [6]:
answer_list = fetch_answers_for_questions(stack_q_ids)

Failed to fetch answers for question ID 44009452
Failed to fetch answers for question ID 10606133
Failed to fetch answers for question ID 78204656
Failed to fetch answers for question ID 78208271
Failed to fetch answers for question ID 78208171
Failed to fetch answers for question ID 78196689
Failed to fetch answers for question ID 78205863
Failed to fetch answers for question ID 78196349
Failed to fetch answers for question ID 78207704
Failed to fetch answers for question ID 78190020
Failed to fetch answers for question ID 78208262
Failed to fetch answers for question ID 40118869
Failed to fetch answers for question ID 78207796
Failed to fetch answers for question ID 65209035
Failed to fetch answers for question ID 78207971
Failed to fetch answers for question ID 78202446
Failed to fetch answers for question ID 78208153
Failed to fetch answers for question ID 78208230
Failed to fetch answers for question ID 37166320
Failed to fetch answers for question ID 518021
Failed to fetch answer

## Converting to a CSV Table

In [7]:
import csv

# Destination CSV for the Stack Overflow question/answer pairs
filename = "stack_questions_answers.csv"

# newline='' leaves line-ending handling to the csv module
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Header row first
    csvwriter.writerow(['Question ID', 'Question', 'Answer'])

    # One row per question that has a stored answer; questions without one are skipped
    for question_id, question in stack_questions.items():
        answer = answers.get(question_id, "No answer found")
        if answer == "No answer found":
            continue
        csvwriter.writerow([question_id, question, answer])

# Reddit Scraping

In [8]:
!pip install praw

Collecting praw
  Downloading praw-7.7.1-py3-none-any.whl.metadata (9.8 kB)
Collecting prawcore<3,>=2.1 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Downloading praw-7.7.1-py3-none-any.whl (191 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Installing collected packages: prawcore, praw
Successfully installed praw-7.7.1 prawcore-2.4.0


In [9]:
# import requests
# import csv
# import re

# # Function to clean text
# def remove_html_tags_and_extra_whitespaces(text):
#     text_without_html = re.sub(r'<.*?>', '', text)
#     clean_text = re.sub(r'\s+', ' ', text_without_html)
#     return clean_text.strip()

# def fetch_submissions(subreddit, keyword, limit=10):
#     url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&q={keyword}&limit={limit}"
#     response = requests.get(url)
#     data = response.json()['data']
#     return [(item['id'], remove_html_tags_and_extra_whitespaces(item['title'])) for item in data]

# def fetch_comments(submission_id):
#     url = f"https://api.pushshift.io/reddit/search/comment/?link_id={submission_id}&limit=1"
#     response = requests.get(url)
#     data = response.json()['data']
#     if data:
#         return remove_html_tags_and_extra_whitespaces(data[0]['body'])
#     return ""

# def save_to_csv(filename, rows):
#     with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
#         csvwriter = csv.writer(csvfile)
#         csvwriter.writerow(['Submission ID', 'Question', 'Answer'])
#         for row in rows:
#             csvwriter.writerow(row)

# # Main process
# subreddit = 'Python'
# keyword = 'python'  # Adjust your keyword for filtering submissions
# submissions = fetch_submissions(subreddit, keyword, limit=10)

# questions_answers = []
# for submission_id, question in submissions:
#     answer = fetch_comments(submission_id)
#     questions_answers.append((submission_id, question, answer))

# save_to_csv('reddit_python_qa.csv', questions_answers)

In [10]:
import praw
import csv
import re

# Function to clean text
def remove_html_tags_and_extra_whitespaces(text):
    """
    Strip HTML tags from `text` and collapse every run of whitespace to one space.

    A tag is anything from a '<' up to the next '>' (non-greedy) and may span
    several lines.

    Args:
    - text: String that may contain HTML tags and extra whitespace.

    Returns:
    The cleaned string with leading/trailing whitespace removed.
    """
    # re.DOTALL lets '.' match newlines, so a tag wrapped across lines is removed too.
    text_without_html = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    collapsed = re.sub(r'\s+', ' ', text_without_html)
    return collapsed.strip()

# Initialize PRAW with your Reddit app credentials.
# Secrets must never be stored in the notebook: they are read from the environment
# variables REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET, with an interactive prompt as a
# fallback. Any credential that has ever been committed should be revoked and
# regenerated in the Reddit app settings.
import os
from getpass import getpass

reddit = praw.Reddit(client_id=os.environ.get('REDDIT_CLIENT_ID') or getpass('Reddit client_id: '),
                     client_secret=os.environ.get('REDDIT_CLIENT_SECRET') or getpass('Reddit client_secret: '),
                     user_agent=os.environ.get('REDDIT_USER_AGENT', 'Key_Condition_7355'))

def fetch_questions_answers_from_subreddit(subreddit_name, max_posts=10):
    """
    Collect (question, answer) pairs from the "hot" listing of a subreddit.

    Only self-posts are considered. The question is the cleaned post title and the
    answer is the cleaned body of the first comment in the flattened comment list;
    posts without any comment contribute nothing.

    Args:
    - subreddit_name: Name of the subreddit to read.
    - max_posts: Passed as `limit` to the hot listing.

    Returns:
    A list of (question, answer) tuples.
    """
    pairs = []

    for post in reddit.subreddit(subreddit_name).hot(limit=max_posts):
        # Link posts (non self-posts) are ignored
        if not post.is_self:
            continue

        title = remove_html_tags_and_extra_whitespaces(post.title)

        # NOTE(review): per the PRAW docs, limit=0 drops the "load more comments"
        # placeholders rather than fetching them, so list() below yields only
        # real comments -- confirm against the installed PRAW version.
        post.comments.replace_more(limit=0)
        flattened = post.comments.list()

        # Only the first comment is kept, for simplicity
        if flattened:
            first_reply = remove_html_tags_and_extra_whitespaces(flattened[0].body)
            pairs.append((title, first_reply))

    return pairs

# Write question/answer pairs to a CSV file
def save_to_csv(filename, questions_answers):
    """
    Write `questions_answers` to `filename` as UTF-8 CSV, preceded by a
    "Question","Answer" header row. An existing file is overwritten.

    Args:
    - filename: Path of the CSV file to create.
    - questions_answers: Iterable of (question, answer) rows.
    """
    header = ["Question", "Answer"]
    # newline='' leaves line-ending handling to the csv module
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows([header, *questions_answers])

# Example usage: scrape the chosen subreddit and persist the pairs to CSV
subreddit_name = 'python'  # Specify the subreddit you're interested in
questions_answers = fetch_questions_answers_from_subreddit(
    subreddit_name,
    max_posts=10000,
)
save_to_csv(filename="reddit_questions_answers.csv", questions_answers=questions_answers)


In [11]:
# Load the saved Reddit question/answer CSV back in and display it
pd.read_csv(filepath_or_buffer='reddit_questions_answers.csv')

Unnamed: 0,Question,Answer
0,Sunday Daily Thread: What's everyone working o...,I'm writing a library for hand crafted SQL que...
1,Saturday Daily Thread: Resource Request and Sh...,This is a Fakespot Reviews Analysis bot. Fakes...
2,Designing a Pure Python Web Framework,I've been using NiceGUI for a year and it's be...
3,I made a free easy-to-use toast notification l...,"Cool, seems good"
4,Looking for a Object detection Library or Api ...,What about the Segment Anything model?
...,...,...
298,GPTAuthor: open-source CLI tool for writing lo...,Can you use this with a self-hosted LLM?
299,AI with Automation - create Python Microservic...,This is awesome. Thank you for creating this a...
300,"Ten Python datetime pitfalls, and what librari...",At some point we'll need to stop and think wha...
301,Apprise – A lightweight all-in-one notificatio...,Should've named it WUPHF.com


# Python Official Questions

Links for all question categories are as follows:
> ['https://docs.python.org/3/faq/general.html',
'https://docs.python.org/3/faq/programming.html',
'https://docs.python.org/3/faq/design.html',
'https://docs.python.org/3/faq/library.html',
'https://docs.python.org/3/faq/extending.html',
'https://docs.python.org/3/faq/windows.html',
'https://docs.python.org/3/faq/gui.html',
'https://docs.python.org/3/faq/installed.html']

In [12]:
# !pip install requests beautifulsoup4

In [13]:
# import requests
# from bs4 import BeautifulSoup
# import csv

# def fetch_qa_pairs(url):
#     """Fetches questions and answers from a given URL."""
#     response = requests.get(url)
#     soup = BeautifulSoup(response.content, 'html.parser')
#     qas = []
#     questions = soup.select('dl dt')
#     for question in questions:
#         answer = question.find_next_sibling('dd')
#         if answer:
#             # Cleaning and formatting the text
#             question_text = ' '.join(question.get_text(strip=True).split())
#             answer_text = ' '.join(answer.get_text(" ", strip=True).split())
#             qas.append((question_text, answer_text))
#     return qas

# def save_to_csv(filename, data):
#     """Saves questions and answers to a CSV file."""
#     with open(filename, 'w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(['Question', 'Answer'])
#         for row in data:
#             writer.writerow(row)

# urls = [
#     'https://docs.python.org/3/faq/general.html',
#     'https://docs.python.org/3/faq/programming.html',
#     'https://docs.python.org/3/faq/design.html',
#     'https://docs.python.org/3/faq/library.html',
#     'https://docs.python.org/3/faq/extending.html',
#     'https://docs.python.org/3/faq/windows.html',
#     'https://docs.python.org/3/faq/gui.html',
#     'https://docs.python.org/3/faq/installed.html'
# ]

# all_qas = []
# for url in urls:
#     qas = fetch_qa_pairs(url)
#     all_qas.extend(qas)

# save_to_csv('python_docs_faq.csv', all_qas)

# print("FAQ extraction completed. Data saved to python_docs_faq.csv.")


In [14]:
# pd.read_csv('python_faq.csv')

In [15]:
# response = requests.get('https://docs.python.org/3/faq/programming.html')
# soup = BeautifulSoup(response.content, 'html.parser')

In [16]:
# soup