In [8]:
import pandas as pd
import json

# Load the scraped articles CSV; its columns are ['url', 'title', 'content'].
scraped_df = pd.read_csv("constitution_articles.csv")

# Parse the JSON data file, reading it explicitly as UTF-8.
with open("COI.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only the first element of the parsed JSON is used; it is handed to
# pd.DataFrame to build the article table.
# Record fields: [ArtNo, Name, SubHeading, Status, Explanations, ArtDesc, Clauses]
json_articles = data[0]
json_df = pd.DataFrame(json_articles)


In [9]:
# Show the column index of both frames, one per line.
print(scraped_df.columns, json_df.columns, sep="\n")

Index(['url', 'title', 'content'], dtype='object')
Index(['ArtNo', 'Name', 'ArtDesc', 'Clauses', 'Status', 'Explanations',
       'SubHeading'],
      dtype='object')


In [10]:
# Per-column count of missing values in the JSON frame.
# Clauses, Status, Explanations and SubHeading are empty for many rows.
# The row preview that used to precede this line was dropped: a cell renders
# only its last expression, so the preview's result was never displayed.
json_df.isnull().sum()

ArtNo            0
Name             0
ArtDesc         16
Clauses         27
Status          39
Explanations    39
SubHeading      14
dtype: int64

In [11]:
# The JSON file obtained from GitHub has many empty cells.
# The scraped CSV file also has some empty cells.

In [12]:
# Per-column count of missing values in the scraped frame.
# The row preview that used to precede this line was dropped: a cell renders
# only its last expression, so the preview's result was never displayed.
scraped_df.isnull().sum()

url         0
title       0
content    11
dtype: int64

In [19]:
# Cast the JSON article numbers to strings so they can be compared with the
# string keys extracted from the scraped titles.
json_df['ArtNo'] = json_df['ArtNo'].astype(str)

# Extract the article number from each title into a new `article_no` column,
# e.g. "Article 19 - Freedom of Speech" -> "19".
# The optional upper-case suffix keeps lettered article numbers such as "21A"
# intact; a digits-only pattern would truncate them to "21" and join them to
# the wrong JSON row.
# NOTE(review): assumes the JSON stores suffixed numbers in the same form
# (e.g. "21A") - confirm against COI.json.
# expand=False makes str.extract return a Series rather than a one-column frame.
scraped_df['article_no'] = scraped_df['title'].str.extract(r'(\d+[A-Z]*)', expand=False)

# Left-join the JSON fields onto the scraped rows by article number
# (`article_no` from scraped_df, `ArtNo` from the JSON frame).
# Every scraped row is kept; rows without a JSON match get NaN in
# ArtNo / Name / ArtDesc.
merged_df = scraped_df.merge(
    json_df[['ArtNo', 'Name', 'ArtDesc']],
    how='left',
    left_on='article_no',
    right_on='ArtNo'
)

In [20]:
# Save the final merged dataset (the export below is currently disabled)
# merged_df.to_csv("final_articles.csv", index=False, encoding="utf-8")

# print("✅ Final dataset created successfully as 'final_articles.csv'")
# Preview up to the first 35 rows of the merged frame.
merged_df.head(35)

Unnamed: 0,url,title,content,article_no,ArtNo,Name,ArtDesc
0,https://indiankanoon.org/doc/367586/,Article 14 in Constitution of India,,14,14.0,Equality before law.,The State shall not deny to any person equalit...
1,https://indiankanoon.org/doc/609295/,Article 15 in Constitution of India,(1) The State shall not discriminate against a...,15,15.0,Prohibition of discrimination on grounds of re...,
2,https://indiankanoon.org/doc/211089/,Article 16 in Constitution of India,(1) There shall be equality of opportunity for...,16,16.0,Equality of opportunity in matters of public e...,
3,https://indiankanoon.org/doc/1987997/,Article 17 in Constitution of India,,17,17.0,Abolition of Untouchability.,"""Untouchability"" is abolished and its practice..."
4,https://indiankanoon.org/doc/1163710/,Article 18 in Constitution of India,"(1) No title, not being a military or academic...",18,18.0,Abolition of titles.,
5,https://indiankanoon.org/doc/1218090/,Article 19 in Constitution of India,(1) All citizens shall have the right- (a) to ...,19,19.0,Protection of certain rights regarding freedom...,
6,https://indiankanoon.org/doc/655638/,Article 20 in Constitution of India,(1) No person shall be convicted of any offenc...,20,20.0,Protection in respect of conviction for offences.,
7,https://indiankanoon.org/doc/1199182/,Article 21 in Constitution of India,,21,21.0,Protection of life and personal liberty.,No person shall be deprived of his life or per...
8,https://indiankanoon.org/doc/581566/,Article 22 in Constitution of India,(1) No person who is arrested shall be detaine...,22,22.0,Protection against arrest and detention in cer...,
9,https://indiankanoon.org/doc/1071750/,Article 23 in Constitution of India,(1) Traffic in human beings and beggar and oth...,23,23.0,Prohibition of traffic in human beings and for...,


In [None]:
# Missing values per column right after the merge; `content` is empty in 11 rows.
merged_df.isna().sum()

url            0
title          0
content       11
article_no     0
ArtNo          3
Name           3
ArtDesc       15
dtype: int64

In [26]:
# Backfill gaps in the scraped columns from their JSON counterparts:
# a missing title falls back to "Name", missing content falls back to "ArtDesc".
for target, fallback in (('title', 'Name'), ('content', 'ArtDesc')):
    merged_df[target] = merged_df[target].fillna(merged_df[fallback])

In [27]:
# Re-count missing values per column now that the fallback fill has run.
merged_df.isna().sum()

url            0
title          0
content        2
article_no     0
ArtNo          3
Name           3
ArtDesc       15
dtype: int64

In [28]:
# Keep only the three scraped columns, in this order, and preview the result.
column_order = ['url', 'title', 'content']
new_merged_df = merged_df.loc[:, column_order]
new_merged_df.head(30)

Unnamed: 0,url,title,content
0,https://indiankanoon.org/doc/367586/,Article 14 in Constitution of India,The State shall not deny to any person equalit...
1,https://indiankanoon.org/doc/609295/,Article 15 in Constitution of India,(1) The State shall not discriminate against a...
2,https://indiankanoon.org/doc/211089/,Article 16 in Constitution of India,(1) There shall be equality of opportunity for...
3,https://indiankanoon.org/doc/1987997/,Article 17 in Constitution of India,"""Untouchability"" is abolished and its practice..."
4,https://indiankanoon.org/doc/1163710/,Article 18 in Constitution of India,"(1) No title, not being a military or academic..."
5,https://indiankanoon.org/doc/1218090/,Article 19 in Constitution of India,(1) All citizens shall have the right- (a) to ...
6,https://indiankanoon.org/doc/655638/,Article 20 in Constitution of India,(1) No person shall be convicted of any offenc...
7,https://indiankanoon.org/doc/1199182/,Article 21 in Constitution of India,No person shall be deprived of his life or per...
8,https://indiankanoon.org/doc/581566/,Article 22 in Constitution of India,(1) No person who is arrested shall be detaine...
9,https://indiankanoon.org/doc/1071750/,Article 23 in Constitution of India,(1) Traffic in human beings and beggar and oth...
