# Import Necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import re


## send HTTP request to site

In [2]:
url="https://www.rekhta.org/couplets"
rekhta_site=requests.get(url)
rekhta_site.status_code

200

In [3]:
soup = BeautifulSoup(rekhta_site.text, 'html.parser')
soup.prettify()

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta content="notranslate" name="google"/>\n  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>\n  <link href="https://rekhta.pc.cdn.bitgravity.com/content/images/favico.png" rel="icon" type="image/png"/>\n  <meta content="Rekhta" property="og:site_name"/>\n  <meta content="710470395652694" property="fb:app_id"/>\n  <meta content="@Rekhta" name="twitter:site"/>\n  <meta content="@Rekhta" name="twitter:creator"/>\n  <title>\n   Sher Shayari, Urdu Sher, Urdu SMS, Two Line Shayari, SMS shayari\n  </title>\n  <meta content="Couplet, Sher, Sher or Shayari, Best love Shayari, urdu poems, hindi best sher, urdu sher, couplet by poets, famous sher" name="keywords">\n   <meta content="Read Best Urdu Sher, HINDI SHAYARI, Listen Mp3 SHER O SHAYARI, Couplets and two line Shayari in Hindi, Urdu and Roman. Also Share this sher shayari to your friends." name="description">\n    <link href="https://www.rekhta.org/coupl

## functions to scrape all shers from the Site

In [4]:


# function to get all anchor tags to send request to each page
def extract_href(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    anchor_tags = soup.find_all('a', class_=['cpltImgCrd', 'shyriImgInner','cpltTag'])

    hrefs = [tag.get('href') for tag in anchor_tags if tag.get('href')]
    
    return hrefs


# function to extract and scrape all shers from the site
def extract_span_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    divs = soup.find_all('div', class_='c')
    
    all_data = []
    for div in divs:
        p_tags = div.find_all('p', attrs={'data-l': True})
        p_data = []
        for p in p_tags:
            spans = [span.get_text() for span in p.find_all('span')]
            p_data.extend(spans)
        all_data.append(p_data)
    
    return all_data


# function that takes list of links and scrape the data

def fetch_and_extract_data(links):
    nested_data = []
    for link in links:
        try:
            response = requests.get(link)
            response.raise_for_status()  # Ensure the request was successful
            html_content = response.text
            span_data = extract_span_data(html_content)
            nested_data.append(span_data)
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {link}: {e}")
    return nested_data



In [5]:
href_list = extract_href(rekhta_site.text)
shers=fetch_and_extract_data(href_list)
print(len(shers))

Failed to fetch /occasion: Invalid URL '/occasion': No scheme supplied. Perhaps you meant https:///occasion?
74


## save the scraped data in a file

In [6]:
def save_to_csv(nested_data, filename="output.csv"):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["SHERS"])
        for data in nested_data:
            for row in data:
                writer.writerow([row])  # Write each sublist as a new row

save_to_csv(shers)

## PreProcess and Clean the data

In [12]:
df=pd.read_csv("output.csv")
print(df.info())
df.drop_duplicates(inplace=True)
df.info()
df.tail()
df=df.iloc[::2]
df.tail()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2292 entries, 0 to 2291
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SHERS   2292 non-null   object
dtypes: object(1)
memory usage: 18.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 2046 entries, 0 to 2290
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SHERS   2046 non-null   object
dtypes: object(1)
memory usage: 32.0+ KB


Unnamed: 0,SHERS
2281,"['vo ', 'jo ', 'the ', 'shahr-e-tahayyur ', 't..."
2283,"['ai ', 'tarah-dār-e-ishva-tarāz-e-dayār-e-nāz..."
2285,"['māni-e-jāvedān-e-jāñ ', 'kuchh ', 'bhī ', 'n..."
2287,"['sad-yād-e-yād ', ""'jaun' "", 'vo ', 'hañgām-e..."
2289,"['vo ', 'hujūm-e-dil-zadgāñ ', 'ki ', 'thā ', ..."


## loaded the scrape data and clean it

In [13]:


# Load the CSV file
df = pd.read_csv("output.csv", header=None, names=["poetry"])

# Function to clean each row
def clean_text(text):
    text = re.sub(r"[\[\]',]", "", text)  # Remove brackets, commas, and apostrophes
    return text.strip()  # Ensure no extra spaces

# Apply cleaning function to each row
df["poetry"] = df["poetry"].astype(str).apply(clean_text)
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


# Save the cleaned data
df.to_csv("cleaned_filtered_poetry.csv", index=False, header=False, encoding="utf-8")

print("Poetry cleaned and saved to cleaned_poetry.csv")


Poetry cleaned and saved to cleaned_poetry.csv


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [16]:
df2=pd.read_csv("cleaned_filtered_poetry.csv")
df2.head()

Unnamed: 0,shers
0,aaj ik aur baras biit gayā us ke baġha...
1,mujhe duniyā ke ta.anoñ par kabhī ġhussa...
2,mujhe duniya ke tanon par kabhi ghussa ...
3,aaj ik aur baras biit gayā us ke baġha...
4,miir kyā saade haiñ bīmār hue jis ke s...
