# Solr Data Preprocessing

## 1. Synonyms
In this section, we will get the synonyms of the terms in the comments. First we will need to import the necessary libraries and download the WordNet dataset.

We also need to tell Solr that we will use synonyms, by registering the `en` managed synonym resource on the `comments` and `submissions` cores.

In [2]:
import nltk
import pandas as pd
import requests
import json
from nltk.corpus import wordnet

In [None]:
# POST the SynonymManager class name to the "synonyms/en" managed-resource
# endpoint of the comments core, then show Solr's reply.
url = "http://localhost:8983/solr/comments/schema/analysis/synonyms/en"
headers = {'Content-Type': 'application/json'}
payload = json.dumps(
    {"class": "org.apache.solr.rest.schema.analysis.ManagedSynonymGraphFilterFactory$SynonymManager"}
)

response = requests.post(url, headers=headers, data=payload)
print(response.text)

In [None]:
# POST the SynonymManager class name to the "synonyms/en" managed-resource
# endpoint of the submissions core, then show Solr's reply.
url = "http://localhost:8983/solr/submissions/schema/analysis/synonyms/en"
headers = {'Content-Type': 'application/json'}
payload = json.dumps(
    {"class": "org.apache.solr.rest.schema.analysis.ManagedSynonymGraphFilterFactory$SynonymManager"}
)

response = requests.post(url, headers=headers, data=payload)
print(response.text)

In [None]:
# Fetch the WordNet corpus so that nltk.corpus.wordnet can be queried below.
nltk.download("wordnet")

After setting up the required libraries, we can continue by collecting the unique terms from the comments and submissions DataFrames.

In [None]:
def _lowercased_words(frame, columns):
    """Return the set of lower-cased words found in *columns* of *frame*.

    Each non-missing cell is split on whitespace. Missing values (NaN) are
    skipped, so a row with an empty text column no longer raises.

    Args:
        frame: DataFrame holding the text columns.
        columns: Names of the columns to scan.

    Returns:
        A set of lower-cased, whitespace-separated tokens.
    """
    terms = set()
    for column in columns:
        # dropna() gives 'body' the same NaN guard the submissions columns had
        for text in frame[column].dropna():
            terms.update(word.lower() for word in str(text).split())
    return terms


# Read the comments from the csv
comments = pd.read_csv('./subreddits/all/comments_cleaned_50.csv')

# Read the submissions from the csv
# NOTE(review): the upload step further down sends
# 'submissions_cleaned_50.csv' while this reads 'submissions.csv' -- confirm
# which file is intended.
submissions = pd.read_csv('./subreddits/all/submissions.csv')

# Unique terms of the comment bodies
comment_terms = _lowercased_words(comments, ['body'])

# Unique terms of the submission self-texts and titles
submission_terms = _lowercased_words(submissions, ['selftext', 'title'])

In [None]:
def _wordnet_synonyms(terms):
    """Map each term to the WordNet lemma names of all its synsets.

    Terms for which WordNet returns no lemma names are left out. A lemma name
    that occurs in several synsets is kept once, at its first position.

    Args:
        terms: Iterable of words to look up.

    Returns:
        dict mapping term -> list of distinct lemma names.
    """
    synonyms = {}
    for term in terms:
        # A dict is used as an insertion-ordered set of lemma names
        names = {}
        for synset in wordnet.synsets(term):
            for lemma in synset.lemmas():
                names[lemma.name()] = None
        if names:
            synonyms[term] = list(names)
    return synonyms


comments_synonyms = _wordnet_synonyms(comment_terms)
submissions_synonyms = _wordnet_synonyms(submission_terms)



In [None]:
# Write each synonym mapping to its own JSON file
for path, mapping in (
    ('comments_synonyms.json', comments_synonyms),
    ('submissions_synonyms.json', submissions_synonyms),
):
    with open(path, 'w') as f:
        json.dump(mapping, f)

In [None]:
# Reload both synonym mappings from their JSON files
with open('comments_synonyms.json') as comments_file:
    comments_synonyms = json.load(comments_file)
with open('submissions_synonyms.json') as submissions_file:
    submissions_synonyms = json.load(submissions_file)

# Both requests carry a JSON body
headers = {'Content-type': 'application/json'}

# PUT the comment synonyms to the comments core's "synonyms/en" endpoint
comments_url = "http://localhost:8983/solr/comments/schema/analysis/synonyms/en"
comments_payload = json.dumps(comments_synonyms)

# PUT the submission synonyms to the submissions core's "synonyms/en" endpoint
submissions_url = "http://localhost:8983/solr/submissions/schema/analysis/synonyms/en"
submissions_payload = json.dumps(submissions_synonyms)

c_response = requests.put(comments_url, headers=headers, data=comments_payload)
s_response = requests.put(submissions_url, headers=headers, data=submissions_payload)

# Show Solr's reply to each request
for reply in (c_response, s_response):
    print(reply.text)

Now, let's create the necessary fields: `title` in the `submissions` schema and `body` in the `comments` schema.

In [None]:
# Ask the submissions core's Schema API to add a stored, indexed,
# single-valued "title" field of type "sgm_text_en".
url = 'http://localhost:8983/solr/submissions/schema'
headers = {'Content-type': 'application/json'}
data = {
    "add-field": dict(
        name="title",
        type="sgm_text_en",
        stored=True,
        indexed=True,
        multiValued=False,
    )
}

response = requests.request("POST", url, headers=headers, data=json.dumps(data))
print(response.text)

In [None]:
# Ask the comments core's Schema API to add a stored, indexed,
# single-valued "body" field of type "sgm_text_en".
url = 'http://localhost:8983/solr/comments/schema'
headers = {'Content-type': 'application/json'}
data = {
    "add-field": dict(
        name="body",
        type="sgm_text_en",
        stored=True,
        indexed=True,
        multiValued=False,
    )
}

response = requests.request("POST", url, headers=headers, data=json.dumps(data))
print(response.text)

Now, we will upload the CSV files to Solr.

In [2]:
# The payload is CSV, so it is sent with a text/csv content type
headers = {'Content-Type': 'text/csv'}

# Update endpoint of the submissions core
url = 'http://localhost:8983/solr/submissions/update'

# Read the raw bytes of the cleaned submissions CSV
with open('subreddits/all/submissions_cleaned_50.csv', mode='rb') as file:
    data = file.read()

# POST the CSV bytes to Solr and show its reply
response = requests.request("POST", url, headers=headers, data=data)
print(response.text)

{
  "responseHeader":{
    "status":0,
    "QTime":1267
  }
}


In [12]:
# The payload is CSV, so it is sent with a text/csv content type
headers = {'Content-Type': 'text/csv'}

# Update endpoint of the comments core
url = 'http://localhost:8983/solr/comments/update'

# Read the raw bytes of the cleaned comments CSV
with open('./subreddits/all/comments_cleaned_50_class.csv', mode='rb') as file:
    data = file.read()

# POST the CSV bytes to Solr and show its reply
response = requests.request("POST", url, headers=headers, data=data)
print(response.text)

{
  "responseHeader":{
    "status":0,
    "QTime":55255
  }
}


## 2. Lemmatization
Now we will lemmatize the terms in the comments and add them to the synonyms.

In [9]:
def _with_lemmas(term, synonyms, lemmatizer):
    """Return the synonyms of *term* extended with lemmatized forms.

    The result holds the original synonyms, the lemma of every synonym, the
    term itself and the lemma of the term. Duplicates are removed and the
    list is sorted so that the saved JSON is the same on every run.

    Args:
        term: The word the synonyms belong to.
        synonyms: List of synonym strings for *term*.
        lemmatizer: Object exposing ``lemmatize(word)``.

    Returns:
        Sorted list of distinct strings.
    """
    expanded = set(synonyms)
    expanded.update(lemmatizer.lemmatize(synonym) for synonym in synonyms)
    # Include the term and its own lemma, as the section text describes
    expanded.update((term, lemmatizer.lemmatize(term)))
    return sorted(expanded)


# Read the synonyms JSON file
with open('comments_synonyms.json') as f:
    comments_synonyms = json.load(f)

# Initialize the lemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()

# Extend every synonym list with lemmatized forms and the term itself
comments_synonyms = {
    term: _with_lemmas(term, synonyms, lemmatizer)
    for term, synonyms in comments_synonyms.items()
}

# Save the synonyms as a JSON file
with open('comments_synonyms_lemmatized.json', 'w') as f:
    json.dump(comments_synonyms, f)

['data_processor', 'reckoner', 'electronic_computer', 'computer', 'computers', 'figurer', 'information_processing_system', 'computing_machine', 'calculator', 'computing_device', 'estimator']


In [10]:
# Send request to the Solr server to add the synonyms
comments_url = "http://localhost:8983/solr/comments/schema/analysis/synonyms/en"
comments_payload = json.dumps(comments_synonyms)
headers = {'Content-type': 'application/json'}
c_response = requests.request("PUT", comments_url, headers=headers, data=comments_payload)
print(c_response.text)

{
  "responseHeader":{
    "status":0,
    "QTime":195
  }
}
