In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import pandas as pd
import numpy as np
import requests

## Retrieve Data from API

In [67]:
# Define the tags
tags = {
    'Software Development': ['html', 'css', 'javascript', 'react', 'typescript', 'php', 'react', 'laravel', 'node.js', 'kotlin;java', 'swift', 'sql', 'flask', 'django', 'flutter', 'react-native', 'bootstrap', 'tailwind', 'git'],
    'Data Science and Artificial Intelligence': ['python', 'r', 'sql', 'pandas', 'numpy', 'apache-spark', 'hadoop', 'machine-learning', 'deep-learning', 'tensorflow', 'keras', 'pytorch', 'scikit-learn', 'spacy', 'nltk', 'matplotlib', 'seaborn'],
    'Internet of Things': ["iot", "aws-iot", "azure-iot-hub", "google-cloud-iot", "ibm-watson-iot", "node-red", "kafka", "mqtt", "coap", "lora", "lorawan", "zigbee", "rfid", "raspberry-pi", "arduino", "esp8266", "esp32", "microcontroller", "raspbian", "grafana", 'embedded']
}

reversed_tag = {}
for k, v in tags.items():
    for item in v:
        reversed_tag[item] = k

In [None]:
# Create a function to fetch data from API
def fetch_data(params):
    url = 'https://api.stackexchange.com/2.3/questions'
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            data = response.json()
        else:
            data = {}
            response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}") 
    except requests.exceptions.RequestException as e:
        print(f"Other error occurred: {e}")  
        
    
    return data

In [None]:
# Fetching data from API
data = []
for i in tags.keys():
    for j in tags[i]:
        for k in [1,2]:
            data.append([fetch_data(
                params= {
                    'page': k,
                    'pagesize': 100 if int(3000/len(tags[i])) > 100 else int(3000/len(tags[i])),
                    'order': 'desc',
                    'sort': 'votes',
                    'tagged': j,
                    'site': 'stackoverflow'
                }), i]
            )

KeyboardInterrupt: 

In [None]:
# Convert retrieved data (JSON) to Python list
df = []
for i in data:
    for j in i[0]['items']:
        df.append([j['title'], j['tags'], i[1]])

In [None]:
# Convert Python list to dataframe
df = pd.DataFrame(df, columns=['question', 'tags', 'label'])

In [None]:
df

Unnamed: 0,question,tags,label
0,Why does HTML think “chucknorris” is a color?,"[html, browser, background-color]",Software Development
1,How can I validate an email address in JavaScr...,"[javascript, html, regex, email-validation]",Software Development
2,How do I check whether a checkbox is checked i...,"[javascript, jquery, html, checkbox]",Software Development
3,How can I horizontally center an element?,"[html, css, alignment, centering]",Software Development
4,Which &quot;href&quot; value should I use for ...,"[javascript, html, performance, optimization, ...",Software Development
...,...,...,...
10908,Are there any web frameworks for compiled lang...,"[c++, frameworks, embedded]",Internet of Things
10909,How to access new &#39;in-cell-image&#39; from...,"[image, google-apps-script, google-sheets, emb...",Internet of Things
10910,"PWM pin of microcontroller, what is it for?","[embedded, microcontroller]",Internet of Things
10911,What are some refactoring methods to reduce si...,"[c, optimization, memory, embedded, size]",Internet of Things


In [71]:
# Save the data (dataframe) to excel
df.to_excel('question.xlsx', index=False)