In [29]:
import mlflow
import mlflow.sklearn
from time import time
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             hamming_loss, jaccard_score, confusion_matrix, roc_curve, auc)
import matplotlib.pyplot as plt
import numpy as np


import requests
import time
import csv
import pandas as pd
from datetime import datetime, timedelta

## Récupération des données via l'API

In [32]:

# Fonction pour récupérer les questions avec filtrage par dates (fromdate, todate)
def fetch_questions(page, pagesize, key, fromdate, todate, retries=3):
    url = "https://api.stackexchange.com/2.3/questions"
    params = {
        "page": page,
        "pagesize": pagesize,
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": "withbody",  # Ajouter le filtre pour récupérer le body
        "key": key,  # Inclure la clé API
        "fromdate": int(fromdate.timestamp()),  # Convertir en timestamp Unix
        "todate": int(todate.timestamp())  # Convertir en timestamp Unix
    }

    print(f"Sending request for page {page} with params: {params}")

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params)
            # Vérifier si le statut est 429 Too Many Requests
            if response.status_code == 429:
                retry_after = int(response.headers.get('Retry-After', 60))  # Par défaut attendre 60s si "Retry-After" absent
                print(f"Too many requests. Retrying after {retry_after} seconds.")
                time.sleep(retry_after)
                return fetch_questions(page, pagesize, key, fromdate, todate, retries)
            response.raise_for_status()  # Raises HTTPError for bad responses
            return response.json()

        except requests.exceptions.HTTPError as err:
            if response.status_code == 400:  # Gérer les erreurs 400
                print(f"HTTP error 400 occurred: {err}. Retrying in 300 seconds...")
                time.sleep(300)  # Pause de 300 secondes avant de réessayer
            else:
                print(f"HTTP error occurred: {err}")
                return None  # Autres erreurs, abandonner cette requête
    print(f"Failed after {retries} attempts for page {page}. Skipping to the next page.")
    return None  # Retourner None après avoir épuisé les tentatives

# Fonction pour filtrer les questions selon les critères spécifiques
def filter_questions(questions):
    filtered = []
    
    # Afficher la réponse brute pour vérifier combien de questions sont récupérées
    print(f"Total questions fetched (before filtering): {len(questions)}")
    
    for idx, question in enumerate(questions):
        # Appliquer les filtres donnés dans l'image:
        # Pour le diagnostic, je désactive temporairement certains filtres
        if (
            question['view_count'] > 0 and
            # 'favorite_count' in question and question['favorite_count'] is not None and  # Filtre désactivé pour test
            question['answer_count'] > 0 and
            len(question['tags']) >= 5 and  # Vérifie qu'il y a au moins 5 tags
            'title' in question and 'body' in question  # Assure qu'il y a un titre et un body
        ):

            # Convertir le timestamp creation_date en date lisible
            creation_date = datetime.utcfromtimestamp(question['creation_date']).strftime('%Y-%m-%d %H:%M:%S')
            
            filtered.append({
                "title": question['title'],
                "body": question['body'],
                "link": question['link'],
                "view_count": question['view_count'],
                "score": question['score'],
                # "favorite_count": question.get('favorite_count', 0),  # Filtre désactivé pour test
                "answer_count": question['answer_count'],  # Ajout du nombre de réponses
                "tags": ','.join(question['tags']),
                "creation_date": creation_date  # Ajouter la date de création
            })
    
    # Afficher combien de questions restent après filtrage
    print(f"Total questions after filtering: {len(filtered)}")
    
    return filtered

# Fonction principale pour récupérer les questions semaine par semaine
def main():
    all_questions = []
    total_questions = 0  # Compteur global pour le nombre total de questions récupérées
    pagesize = 100  # Taille des pages
    max_questions = 50000
    api_key = "rl_Vssw9iwuVvjuT8agzZnVhH9J3"  # Replace with your actual API key

    # Calculer la plage de temps pour l'année n-1, semaine par semaine
    today = datetime.today()
    year_n_1 = today.year - 1  # Calculer l'année n-1

    # Commencer au début de l'année n-1
    start_date = datetime(year_n_1, 1, 1)
    end_date = datetime(year_n_1, 12, 31)

    current_date = start_date

    while current_date <= end_date:
        # Calculer la fin de la semaine
        next_week = current_date + timedelta(days=7)
        last_day_of_week = min(next_week, end_date)
        
        page = 1
        while True:
            data = fetch_questions(page, pagesize, api_key, current_date, last_day_of_week)
            if data is None:
                # Si une erreur survient après plusieurs tentatives sur une page, passer à la page suivante
                print(f"Skipping page {page} for the week starting {current_date.strftime('%d %B %Y')} due to repeated errors.")
                page += 1
                continue  # Passer à la page suivante si une page échoue après plusieurs tentatives

            if 'items' not in data or len(data['items']) == 0:
                print("No more questions to fetch.")
                break

            # Appliquer le filtrage avec les critères
            filtered_questions = filter_questions(data['items'])
            all_questions.extend(filtered_questions)

            # Mettre à jour le compteur global
            total_questions += len(filtered_questions)
            print(f"Fetched {len(filtered_questions)} questions on page {page} for {current_date.strftime('%d %B %Y')}.")
            print(f"Total questions collected so far: \033[1m{total_questions}\033[0m")

            if len(all_questions) >= max_questions:
                print("Reached the maximum number of questions to fetch.")
                break

            # Vérifier si l'API a encore des pages à retourner
            if not data.get('has_more', False):
                print("No more pages to fetch.")
                break

            # Respect API rate limits with dynamic sleep
            retry_after = int(data.get('backoff', 1))
            if retry_after > 1:
                print(f"Pausing for {retry_after} seconds to respect rate limits.")
            time.sleep(retry_after)

            page += 1  # Passer à la page suivante

        if len(all_questions) >= max_questions:
            break

        # Passer à la semaine suivante
        current_date = next_week

    # Save the questions to a CSV file
    csv_file = 'filtered_questions_year_n_1.csv'
    csv_columns = ["title", "body", "link", "view_count", "score", "answer_count", "tags", "creation_date"]
    
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for question in all_questions:
            writer.writerow(question)

    print(f"Collected {total_questions} questions and saved to {csv_file}")

if __name__ == "__main__":
    main()



Sending request for page 1 with params: {'page': 1, 'pagesize': 100, 'order': 'desc', 'sort': 'votes', 'site': 'stackoverflow', 'filter': 'withbody', 'key': 'rl_Vssw9iwuVvjuT8agzZnVhH9J3', 'fromdate': 1672527600, 'todate': 1673132400}
Total questions fetched (before filtering): 100
Total questions after filtering: 17
Fetched 17 questions on page 1 for 01 January 2023.
Total questions collected so far: [1m17[0m
Sending request for page 2 with params: {'page': 2, 'pagesize': 100, 'order': 'desc', 'sort': 'votes', 'site': 'stackoverflow', 'filter': 'withbody', 'key': 'rl_Vssw9iwuVvjuT8agzZnVhH9J3', 'fromdate': 1672527600, 'todate': 1673132400}
Total questions fetched (before filtering): 100
Total questions after filtering: 16
Fetched 16 questions on page 2 for 01 January 2023.
Total questions collected so far: [1m33[0m
Pausing for 10 seconds to respect rate limits.
Sending request for page 3 with params: {'page': 3, 'pagesize': 100, 'order': 'desc', 'sort': 'votes', 'site': 'stackover

client id : 29476  
client secret : f*qvwypgwfHuuUzVJ4abow((  
key : rl_Vssw9iwuVvjuT8agzZnVhH9J3  
code : bOaC6F5f5Ct57Bcd7j0gPA))  