### Imports & Config

In [21]:
import wikipedia
import pysolr
import os
import numpy as np
import pandas as pd
import requests
import re

In [5]:
search_topics = ['Health', 'Environment', 'Technology', 'Economy', 'Entertainment',
                 'Sports', 'Politics', 'Education', 'Travel', 'Food']

core create cmd : 
core delete cmd : sudo -u solr ./solr-9.0.0/bin/solr create -c "wikipedia"

help(admin.create)

admin = pysolr.SolrCoreAdmin(url+'admin/cores')
admin.create_with_configset(
    core_name,
    config= f'{url}/{core_name}/conf',
    schema= f'{url}/{core_name}/schema',
    dataDir= f'{url}/{core_name}/data',
)

In [3]:
topic = 'Health'
min_documents = 510
max_short_summaries = min_documents*0.05
core_name = 'wikipedia'
url = 'http://35.188.149.137:8983/solr/'

### Scraping

In [4]:
def check_page_validity(page,csr):
    page_content = wikipedia.page(page, auto_suggest=False,redirect=True, preload=False)
    page_summary = page_content.summary

    if(len(page_summary) >= 200):
        return 2
    if(len(page_summary) < 200 and csr <=  max_short_summaries):
        return 1
    return 0

In [5]:
def scrape_helper(page_titles, topic):
    topic_docs_json = []
    
    for page_title in page_titles:
        page_content = wikipedia.page(page_title, auto_suggest=False,redirect=True, preload=False)    
        topic_docs_json.append(
            {
            'revision_id':page_content.revision_id,
            'title': page_content.title,
            'summary':page_content.summary,
            'url': page_content.url,
            'topic':topic
            }
        )
    return topic_docs_json

In [6]:
def scrape_topic_wikipedia(topic,min_documents):
    subtopic_page_titles = set()
    count_short_sumaries = 0
    
    
    subtopic_searches = wikipedia.search(topic,min_documents)  
    for subtopic in subtopic_searches:
        try:
            subtopic_content = wikipedia.page(subtopic, auto_suggest=False,redirect=True, preload=False)
            subtopic_page_links = subtopic_content.links
            if(len(subtopic_page_titles) >= min_documents):
                    break
            for page in subtopic_page_links:
                if(len(subtopic_page_titles) >= min_documents):
                    break
                val = check_page_validity(page,count_short_sumaries)
                if(val > 0):
                    subtopic_page_titles.add(page)
                    if(val == 1):
                        count_short_sumaries += 1
                l = len(subtopic_page_titles)
                if(l % 20 == 0):
                    print("subtopics finding completed : ",round(100*l/min_documents,2),"%")
        except Exception as e:
            #print(e)
            1+1
            
    print("scraping individual pages..")
    page_titles = list(subtopic_page_titles)[:min_documents]
    scraped_data = scrape_helper(page_titles,topic)
    
    return scraped_data

In [None]:
scrape_topic_wikipedia(topic,10)

In [None]:
for topic in search_topics:
    scrape_topic_wikipedia(topic,10)

### Preprocessing

In [7]:
def preprocess_summary(df,column):
    df[column] = df[column].replace(r"[^\w\s]", "", regex=True)
    return df

### Indexing

In [8]:
def get_schema():
    schema = {
            "add-field": [
                {
                    "name": "revision_id",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
                {
                    "name": "title",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
                {
                    "name": "summary",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
                {
                    "name": "url",
                    "type": "string",
                    "indexed": False,
                    "multiValued": False
                },
                {
                    "name": "topic",
                    "type": "string",
                    "indexed": True,
                    "multiValued": False
                },
            ]
        }
    
    return schema

In [9]:
def delete_core(core_name):
    print(os.system('sudo -u solr ./solr-9.0.0/bin/solr create -c {core}'.format(core=core_name)))


def create_core(core_name):
    print(os.system('sudo -u solr ./solr-9.0.0/bin/solr create -c {core} -n data_driven_schema_configs"'.format(
            core=core_name)))

In [24]:
def set_config():
    delete_core(core_name)
    create_core(core_name)

    solr = pysolr.Solr(url+core_name, always_commit=True, timeout=10)
    schema = get_schema()
    requests.post(url+core_name + "/schema", json=schema).json()
    return solr

### Main

In [17]:
def main():
    try:
        all_topics_list = []

        for topic in search_topics: 
            print("finding page titles for TOPIC:",topic)
            json_list = scrape_topic_wikipedia(topic,5)
            all_topics_list.append(json_list)


        json_object = np.array(all_topics_list) 
        json_object = json_object.flatten()
        df = pd.json_normalize(json_object)

        #preprocess
        df = preprocess_summary(df,'summary')

        #save to local
        df.to_csv('./indexed_data.csv',index=False)

        #index
        collection = df.to_dict('records')
        solr = set_config()
        solr.add(collection)
    except Exeption as e:
        print(e)

In [18]:
main()

finding page titles for TOPIC: Health


NameError: name 'Exeption' is not defined

### data check

In [46]:
df = pd.read_csv('./submission/indexed_data.csv')
df.shape

(5500, 5)

In [47]:
for topic in search_topics:
    print(df[df['topic'] == topic].shape)

(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)
(550, 5)


In [48]:
df[df['summary'].str.len() > 200].shape

(5285, 5)

In [57]:
df.groupby('topic')['s1'].apply(lambda x: x[x.str.len() > 200].shape[0])

topic
Economy          537
Education        519
Entertainment    524
Environment      538
Food             522
Health           528
Politics         525
Sports           532
Technology       538
Travel           520
Name: s1, dtype: int64

In [50]:
df.isna().sum()

revision_id    0
title          0
summary        2
url            0
topic          0
dtype: int64

In [58]:
df = df.dropna(subset=['summary'])
df[df['summary'].str.isalnum()].shape

(0, 6)

In [52]:
df[df['revision_id'] == 1177076134]['summary'].values

array(['De  Chinese 德 pinyin dé also written as Te is a key concept in Chinese philosophy usually translated inherent character inner power integrity in Taoism moral character virtue morality in Confucianism and other contexts and quality virtue guṇa or merit virtuous deeds puṇya in Chinese Buddhism'],
      dtype=object)

In [59]:
def remove_non_ascii_chars(text):
    return ''.join([c if ord(c) < 128 or c.isspace() or c.isdigit() else '' for c in text])




df['s1'] = df['summary'].apply(str).apply(remove_non_ascii_chars)
df

Unnamed: 0,revision_id,title,summary,url,topic,s1
0,1176891029,Cross-dressing,Crossdressing is the act of wearing clothes tr...,https://en.wikipedia.org/wiki/Cross-dressing,Health,Crossdressing is the act of wearing clothes tr...
1,1177048119,Grit (personality trait),In psychology grit is a positive noncognitive ...,https://en.wikipedia.org/wiki/Grit_(personalit...,Health,In psychology grit is a positive noncognitive ...
2,1144494427,Carnivalesque,Carnivalesque is a literary mode that subverts...,https://en.wikipedia.org/wiki/Carnivalesque,Health,Carnivalesque is a literary mode that subverts...
3,1174405995,Cardiac surgery,Cardiac surgery or cardiovascular surgery is s...,https://en.wikipedia.org/wiki/Cardiac_surgery,Health,Cardiac surgery or cardiovascular surgery is s...
4,1161352014,David Sterritt,David Sterritt born September 11 1944 is a fil...,https://en.wikipedia.org/wiki/David_Sterritt,Health,David Sterritt born September 11 1944 is a fil...
...,...,...,...,...,...,...
5495,1138664372,Discrimination testing,Discrimination testing is a technique employed...,https://en.wikipedia.org/wiki/Discrimination_t...,Food,Discrimination testing is a technique employed...
5496,1175506445,Café des Artistes,Café des Artistes was a fine restaurant at 1 W...,https://en.wikipedia.org/wiki/Caf%C3%A9_des_Ar...,Food,Caf des Artistes was a fine restaurant at 1 We...
5497,1170115037,Canal Street (Manhattan),Canal Street is a major eastwest street of ove...,https://en.wikipedia.org/wiki/Canal_Street_(Ma...,Food,Canal Street is a major eastwest street of ove...
5498,1174616120,Ethylenediaminetetraacetic acid,Ethylenediaminetetraacetic acid EDTA also call...,https://en.wikipedia.org/wiki/Ethylenediaminet...,Food,Ethylenediaminetetraacetic acid EDTA also call...


In [60]:
df[df['revision_id'] == 1177076134][['summary','s1']]

Unnamed: 0,summary,s1
5,Malta MOLtə MAWLtə Maltese ˈmɐːltɐ official...,Malta MOLt MAWLt Maltese mlt officially the...


In [36]:
import unicodedata


In [43]:
import pandas as pd

# Sample DataFrame
data = {'Text': ['Hello, 123 你好 こんにちは', 'How are you? Χαίρετε 你好 สวัสดี','De  Chinese 德 pinyin dé also ']}
df = pd.DataFrame(data)

# Function to remove non-ASCII characters
def remove_non_ascii_chars(text):
    return ''.join([c for c in text if ord(c) < 128])

# Apply the function to the 'Text' column
df['Text'] = df['Text'].apply(remove_non_ascii_chars)

# Display the cleaned DataFrame
print(df)


                          Text
0                 Hello, 123  
1              How are you?   
2  De  Chinese  pinyin d also 
