In [2]:
import json
import pandas as pd
import numpy as np

# Load JSON file
with open("legal_advice_india_all.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert JSON to DataFrame
df = pd.DataFrame(data)

# Show the first few rows
print(df.head())


                                               title            author  \
0  How to legally keep my father away from my wor...            divc99   
1  If the person is about to get married and secu...           notms16   
2    FNF not paid by startup even after legal notice      shadowslay97   
3  Property Ownership Between Joint Owners: Right...  Glum_Success8717   
4                Account funds put on hold Need Help          scshiv29   

                                                 url  score   created_utc  \
0  https://www.reddit.com/r/LegalAdviceIndia/comm...      2  1.738060e+09   
1  https://www.reddit.com/r/LegalAdviceIndia/comm...      1  1.738059e+09   
2  https://www.reddit.com/r/LegalAdviceIndia/comm...      3  1.738059e+09   
3  https://www.reddit.com/r/LegalAdviceIndia/comm...      2  1.738059e+09   
4  https://www.reddit.com/r/LegalAdviceIndia/comm...      1  1.738058e+09   

          created_date  num_comments  \
0  2025-01-28 10:24:09             1   
1  2025-01-2

In [3]:
df["created_date"] = pd.to_datetime(df["created_utc"], unit="s")


In [4]:
df["location"] = df["location"].replace("Unknown", np.nan)


In [5]:
df.drop_duplicates(subset=["id"], keep="first", inplace=True)


In [6]:
labour_keywords = ["labour", "employee", "work", "salary", "job", "wages", "termination", "fired"]
df_labour = df[df["title"].str.contains("|".join(labour_keywords), case=False, na=False)]


In [7]:
df_cleaned = df[["title", "author", "created_date", "score", "num_comments", "selftext", "url", "location"]]


In [8]:
df_cleaned = df[["title", "author", "created_date", "score", "num_comments", "selftext", "url", "location"]]


In [9]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary data for NLTK
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('all')

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_dat

In [10]:
def preprocess_text(text):
    if pd.isna(text):  # Handle missing values
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters & punctuation
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization

    return " ".join(words)


In [11]:
df_cleaned["title"] = df_cleaned["title"].apply(preprocess_text)
df_cleaned["selftext"] = df_cleaned["selftext"].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["title"] = df_cleaned["title"].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["selftext"] = df_cleaned["selftext"].apply(preprocess_text)


In [15]:
df_cleaned.to_json("preprocessed_legal_advice.json", orient="records", indent=4)
print("Preprocessed data saved successfully!")


Preprocessed data saved successfully!


In [16]:
5

5

In [13]:
import mysql.connector

df_cleaned = df_cleaned.replace({np.nan: None})

# Connect to MySQL database
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="sma_reddit"
)

cursor = conn.cursor()

# Create table if it doesn't exist
cursor.execute('''
CREATE TABLE IF NOT EXISTS legal_data (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    author VARCHAR(255),
    url VARCHAR(255),
    score INT,
    created_date DATETIME,
    num_comments INT,
    selftext TEXT,
    location VARCHAR(255)
)
''')

for index, row in df_cleaned.iterrows():
    cursor.execute('''
        INSERT INTO legal_data (title, author, url, score, created_date, num_comments, selftext, location)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    ''', (
        row['title'],
        row['author'],
        row['url'],
        row['score'],
        row['created_date'],
        row['num_comments'],
        row['selftext'],
        row['location']
    ))

conn.commit()
conn.close()

