### Web Scraping dataset from Wikipedia

In [None]:
!pip install wikipedia-api

Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [1]:
import wikipediaapi
import time
import random

# Function to scrape Wikipedia data for a given topic
def scrape_wiki(topic, num_docs=5000):
    user_agent = "YourAppName/1.0 (your@email.com)"  # Replace with your application name and contact email
    wiki_wiki = wikipediaapi.Wikipedia(user_agent,'en')
    page = wiki_wiki.page(topic)

    if not page.exists():
        print(f"Error: Page for '{topic}' does not exist.")
        return {}

    subtopic_documents = {}

    # Scrape subtopics
    for i in range(min(num_docs, len(page.links))):
        sub_topic = list(page.links.keys())[i]
        sub_page = wiki_wiki.page(sub_topic)

        # Sleep for a short time to avoid hitting Wikipedia too quickly
        time.sleep(random.uniform(0.1, 0.5))

        if sub_page.exists():
            subtopic_documents[sub_topic] = sub_page.text

    return subtopic_documents

# Example usage
topics = ["Political_lists","List_of_political_ideologies","Politics","Politics_of_the_United_States","Politician","Index_of_politics_articles","Political_science","Political_system"]

all_documents = {}
for topic in topics:
    print(f"Scraping documents for {topic}...")
    topic_subtopic_documents = scrape_wiki(topic, num_docs=5000)
    all_documents[topic] = topic_subtopic_documents
    print(f"Scraped {len(topic_subtopic_documents)} subtopics for {topic}")

# Save the scraped documents to files or a database for later use
# Example: Save to text files
for topic, subtopics_and_documents in all_documents.items():
    with open(f"{topic}_documents.txt", 'w', encoding='utf-8') as file:
        for subtopic, document in subtopics_and_documents.items():
            file.write(f"Subtopic: {subtopic}\n\n{document}\n\n{'='*50}\n\n")

print("Scraping completed.")


Scraping documents for Political_lists...
Scraped 306 subtopics for Political_lists
Scraping documents for List_of_political_ideologies...
Scraped 4996 subtopics for List_of_political_ideologies
Scraping documents for Politics...
Scraped 963 subtopics for Politics
Scraping documents for Politics_of_the_United_States...
Scraped 1103 subtopics for Politics_of_the_United_States
Scraping documents for Politician...
Scraped 71 subtopics for Politician
Scraping documents for Index_of_politics_articles...
Scraped 1717 subtopics for Index_of_politics_articles
Scraping documents for Political_science...
Scraped 555 subtopics for Political_science
Scraping documents for Political_system...
Scraped 182 subtopics for Political_system
Scraping completed.


In [2]:
import csv
csv_filename = 'scraped_documents_politics.csv'

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Topic', 'Subtopic', 'Document']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for topic, subtopics_and_documents in all_documents.items():
        for subtopic, document in subtopics_and_documents.items():
            writer.writerow({'Topic': topic, 'Subtopic': subtopic, 'Document': document})

print(f"Scraping completed. Data saved to {csv_filename}.")

Scraping completed. Data saved to scraped_documents_politics.csv.


In [3]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m163.8/265.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [4]:
!pip install transformers[torch]



In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import pandas as pd
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load your CSV dataset
csv_filename = 'scraped_documents_politics.csv'
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,Topic,Subtopic,Document
0,Political_lists,Index of politics articles,Politics is the process by which groups of peo...
1,Political_lists,List of annulled elections,This is a list of political elections that had...
2,Political_lists,List of anti-nuclear protests in the United St...,Anti-nuclear protests in the United States hav...
3,Political_lists,List of basic political science topics,The following outline is provided as an overvi...
4,Political_lists,List of basic public affairs topics,The following outline is provided as an overvi...


# Data Preprocessing

In [5]:
df.shape

(9893, 3)

In [6]:
df['Preprocessed_Document'] = df['Document'].astype(str).fillna('')
dialogues = df.groupby(['Topic', 'Subtopic'])['Preprocessed_Document'].apply(lambda x: ' '.join(x)).reset_index()
dialogues1=  dialogues[:600]
# Save the combined dialogues to a text file
dialogues_filename = 'combined_dialogue_politics.txt'
dialogues1.to_csv(dialogues_filename, header=None, index=None, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')

In [7]:
dialogues_filename = 'combined_dialogue_politics.txt'

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


# Training the DialoGPT model for our dataset

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = GPT2LMHeadModel.from_pretrained("microsoft/DialoGPT-medium")
model.to(device)

# Load the dataset and create a data collator
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dialogues_filename,
    block_size=128,
)

# Use the default DataCollatorForLanguageModeling, which does not require additional parameters
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Fine-tune the model
training_args = TrainingArguments(
    output_dir="./dialoGPT_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_dialoGPT_politics")
tokenizer.save_pretrained("fine_tuned_dialoGPT_politics")


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,3.8862
1000,3.6233
1500,3.5676
2000,3.5182
2500,3.4996
3000,3.4524
3500,3.435
4000,3.199
4500,3.1692
5000,3.1634


('fine_tuned_dialoGPT_politics/tokenizer_config.json',
 'fine_tuned_dialoGPT_politics/special_tokens_map.json',
 'fine_tuned_dialoGPT_politics/vocab.json',
 'fine_tuned_dialoGPT_politics/merges.txt',
 'fine_tuned_dialoGPT_politics/added_tokens.json')

In [1]:
!zip -r './fine_tuned_dialoGPT_politics.zip' './fine_tuned_dialoGPT_politics'

  adding: fine_tuned_dialoGPT_politics/ (stored 0%)
  adding: fine_tuned_dialoGPT_politics/model.safetensors (deflated 7%)
  adding: fine_tuned_dialoGPT_politics/merges.txt (deflated 53%)
  adding: fine_tuned_dialoGPT_politics/tokenizer_config.json (deflated 54%)
  adding: fine_tuned_dialoGPT_politics/config.json (deflated 51%)
  adding: fine_tuned_dialoGPT_politics/generation_config.json (deflated 24%)
  adding: fine_tuned_dialoGPT_politics/vocab.json (deflated 68%)
  adding: fine_tuned_dialoGPT_politics/special_tokens_map.json (deflated 74%)


In [10]:
# from google.colab import files
# files.download("/content/fine_tuned_dialoGPT_politics.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("fine_tuned_dialoGPT_politics")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_dialoGPT_politics")

# Function for chit-chat using the fine-tuned model
def chit_chat(prompt, max_length=100):
    try:
      input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")
      output = fine_tuned_model.generate(input_ids, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)
      response = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
      return response
    except Exception as e:
      print(f"Error: {e}")
      return "I'm sorry, I couldn't understand that."

# Example usage
user_input = "Tell me about politics"
response = chit_chat(user_input)
print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tell me about politics, religion, and the environment.\
The term environmentalism has been used to describe a range of political and social issues. Environmentalism is often used as a synonym for anti-environmentalism, but it can also be used in a broader sense to refer to any political, social, or economic issue.Environmentalism can be applied broadly to a wide variety of issues, including environmental justice, environmental law, public health, international environmental policy, human rights, climate change


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Load your CSV dataset
csv_filename = 'scraped_documents_politics.csv'
df = pd.read_csv(csv_filename)

# Combine subtopic and document to create a text corpus for each topic
#topic_documents = df.groupby(['Topic', 'Subtopic'])['Document'].apply(lambda x: ' '.join(str(x))).reset_index()
#print(topic_documents)
df['Preprocessed_Document'] = df['Document'].astype(str).fillna('')
#topic_documents['Preprocessed_Document'] = topic_documents['Document']
x,y = df.shape
df['Topics'] = ['Politics']*x

# Train a simple classifier for topic analysis
model = make_pipeline(CountVectorizer(), MultinomialNB())
X_train = df['Preprocessed_Document']
y_train = df['Topics']
print(y_train)
model.fit(X_train, y_train)


# Function for topic classification based on the user query
def classify_topic(user_query):
    predicted_topic = model.predict([user_query])[0]
    return predicted_topic

def handle_topic_not_found(predicted_topic):
    if predicted_topic not in df['Topics'].unique():
        return "Please ask another question. I'm not knowledgeable in that area."
    return predicted_topic

# Example usage
user_query = "What are the latest trends in technology?"
predicted_topic = classify_topic(user_query)
result = handle_topic_not_found(predicted_topic)
print(f"The predicted topic for the query is: {predicted_topic}")


0       Politics
1       Politics
2       Politics
3       Politics
4       Politics
          ...   
9888    Politics
9889    Politics
9890    Politics
9891    Politics
9892    Politics
Name: Topics, Length: 9893, dtype: object
The predicted topic for the query is: Politics


In [27]:
# !pip install joblib



In [13]:
import joblib
model_filename = 'politics_model.joblib'
joblib.dump(model, model_filename)

['politics_model.joblib']